2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
def suitable(cls, url):
    """Return True when *url* matches this extractor's _VALID_URL regexp."""
    match = re.match(cls._VALID_URL, url)
    return match is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Kept on the instance; to_screen() and the _download_* helpers use it.
    self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the webpage download has started."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce the attempt to confirm the user's age."""
    message = u'Confirming age'
    self.to_screen(message)
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s; '
220 u'please report this issue on GitHub.' % _name)
222 self._downloader.report_warning(u'unable to extract %s; '
223 u'please report this issue on GitHub.' % _name)
226 class SearchInfoExtractor(InfoExtractor):
228 Base class for paged search queries extractors.
229 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
230 Instances should define _SEARCH_KEY and _MAX_RESULTS.
234 def _make_valid_url(cls):
235 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Return True when *url* follows the search-query URL scheme."""
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
241 def _real_extract(self, query):
242 mobj = re.match(self._make_valid_url(), query)
244 raise ExtractorError(u'Invalid search query "%s"' % query)
246 prefix = mobj.group('prefix')
247 query = mobj.group('query')
249 return self._get_n_results(query, 1)
250 elif prefix == 'all':
251 return self._get_n_results(query, self._MAX_RESULTS)
255 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
256 elif n > self._MAX_RESULTS:
257 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
258 n = self._MAX_RESULTS
259 return self._get_n_results(query, n)
261 def _get_n_results(self, query, n):
262 """Get a specified number of results for a query"""
263 raise NotImplementedError("This method must be implemented by sublclasses")
266 class YoutubeIE(InfoExtractor):
267 """Information extractor for youtube.com."""
271 (?:https?://)? # http(s):// (optional)
272 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
273 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
274 (?:.*?\#/)? # handle anchor (#/) redirect urls
275 (?: # the various things that can precede the ID:
276 (?:(?:v|embed|e)/) # v/ or embed/ or e/
277 |(?: # or the v= param in all its forms
278 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
279 (?:\?|\#!?) # the params delimiter ? or # or #!
280 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
283 )? # optional -> youtube.com/xxxx is OK
284 )? # all until now is optional -> you can pass the naked ID
285 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
286 (?(1).+)? # if we found the ID, everything can follow
288 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
289 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
290 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
291 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
292 _NETRC_MACHINE = 'youtube'
293 # Listed in order of quality
294 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
295 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
296 _video_extensions = {
302 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
308 _video_dimensions = {
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs also match _VALID_URL; defer those to YoutubePlaylistIE.
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Announce the attempt to set the interface language."""
    message = u'Setting language'
    self.to_screen(message)
def report_login(self):
    """Announce the login attempt."""
    message = u'Logging in'
    self.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Announce the video webpage download."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce the video info webpage download."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Announce that the list of available subtitles is being checked."""
    # NOTE(review): the original docstring said "download video info
    # webpage" — a copy-paste slip; the message below is authoritative.
    message = u'%s: Checking available subtitles' % video_id
    self.to_screen(message)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Announce a subtitle download attempt for one language/format pair."""
    message = u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)
    self.to_screen(message)
356 def report_video_subtitles_available(self, video_id, sub_lang_list):
357 """Report available subtitles."""
358 sub_lang = ",".join(list(sub_lang_list.keys()))
359 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
def report_information_extraction(self, video_id):
    """Announce that video information extraction has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for the video."""
    # NOTE(review): the original docstring said "Report extracted video
    # URL" — a copy-paste slip; the message below is authoritative.
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
373 def _get_available_subtitles(self, video_id):
374 self.report_video_subtitles_download(video_id)
375 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
377 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
378 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
379 return (u'unable to download video subtitles: %s' % compat_str(err), None)
380 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
381 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
382 if not sub_lang_list:
383 return (u'video doesn\'t have subtitles', None)
386 def _list_available_subtitles(self, video_id):
387 sub_lang_list = self._get_available_subtitles(video_id)
388 self.report_video_subtitles_available(video_id, sub_lang_list)
390 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
393 (error_message, sub_lang, sub)
395 self.report_video_subtitles_request(video_id, sub_lang, format)
396 params = compat_urllib_parse.urlencode({
402 url = 'http://www.youtube.com/api/timedtext?' + params
404 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
405 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
406 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
408 return (u'Did not fetch video subtitles', None, None)
409 return (None, sub_lang, sub)
411 def _request_automatic_caption(self, video_id, webpage):
412 """We need the webpage for getting the captions url, pass it as an
413 argument to speed up the process."""
414 sub_lang = self._downloader.params.get('subtitleslang')
415 sub_format = self._downloader.params.get('subtitlesformat')
416 self.to_screen(u'%s: Looking for automatic captions' % video_id)
417 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
418 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
420 return [(err_msg, None, None)]
421 player_config = json.loads(mobj.group(1))
423 args = player_config[u'args']
424 caption_url = args[u'ttsurl']
425 timestamp = args[u'timestamp']
426 params = compat_urllib_parse.urlencode({
433 subtitles_url = caption_url + '&' + params
434 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
435 return [(None, sub_lang, sub)]
437 return [(err_msg, None, None)]
439 def _extract_subtitle(self, video_id):
441 Return a list with a tuple:
442 [(error_message, sub_lang, sub)]
444 sub_lang_list = self._get_available_subtitles(video_id)
445 sub_format = self._downloader.params.get('subtitlesformat')
446 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
447 return [(sub_lang_list[0], None, None)]
448 if self._downloader.params.get('subtitleslang', False):
449 sub_lang = self._downloader.params.get('subtitleslang')
450 elif 'en' in sub_lang_list:
453 sub_lang = list(sub_lang_list.keys())[0]
454 if not sub_lang in sub_lang_list:
455 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
457 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
460 def _extract_all_subtitles(self, video_id):
461 sub_lang_list = self._get_available_subtitles(video_id)
462 sub_format = self._downloader.params.get('subtitlesformat')
463 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
464 return [(sub_lang_list[0], None, None)]
466 for sub_lang in sub_lang_list:
467 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
468 subtitles.append(subtitle)
471 def _print_formats(self, formats):
472 print('Available formats:')
474 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
476 def _real_initialize(self):
477 if self._downloader is None:
482 downloader_params = self._downloader.params
484 # Attempt to use provided username and password or .netrc data
485 if downloader_params.get('username', None) is not None:
486 username = downloader_params['username']
487 password = downloader_params['password']
488 elif downloader_params.get('usenetrc', False):
490 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
495 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
496 except (IOError, netrc.NetrcParseError) as err:
497 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
501 request = compat_urllib_request.Request(self._LANG_URL)
504 compat_urllib_request.urlopen(request).read()
505 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
506 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
509 # No authentication to be performed
513 request = compat_urllib_request.Request(self._LOGIN_URL)
515 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
516 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
517 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
522 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
524 galx = match.group(1)
526 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
532 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
536 u'PersistentCookie': u'yes',
538 u'bgresponse': u'js_disabled',
539 u'checkConnection': u'',
540 u'checkedDomains': u'youtube',
546 u'signIn': u'Sign in',
548 u'service': u'youtube',
552 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
554 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
555 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
556 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
559 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
560 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
561 self._downloader.report_warning(u'unable to log in: bad username or password')
563 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
564 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
570 'action_confirm': 'Confirm',
572 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
574 self.report_age_confirmation()
575 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
576 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
577 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
579 def _extract_id(self, url):
580 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
582 raise ExtractorError(u'Invalid URL: %s' % url)
583 video_id = mobj.group(2)
586 def _real_extract(self, url):
587 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
588 mobj = re.search(self._NEXT_URL_RE, url)
590 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
591 video_id = self._extract_id(url)
594 self.report_video_webpage_download(video_id)
595 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
596 request = compat_urllib_request.Request(url)
598 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
599 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
600 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
602 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
604 # Attempt to extract SWF player URL
605 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
607 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
612 self.report_video_info_webpage_download(video_id)
613 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
614 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
615 % (video_id, el_type))
616 video_info_webpage = self._download_webpage(video_info_url, video_id,
618 errnote='unable to download video info webpage')
619 video_info = compat_parse_qs(video_info_webpage)
620 if 'token' in video_info:
622 if 'token' not in video_info:
623 if 'reason' in video_info:
624 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
626 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
628 # Check for "rental" videos
629 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
630 raise ExtractorError(u'"rental" videos not supported')
632 # Start extracting information
633 self.report_information_extraction(video_id)
636 if 'author' not in video_info:
637 raise ExtractorError(u'Unable to extract uploader name')
638 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
641 video_uploader_id = None
642 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
644 video_uploader_id = mobj.group(1)
646 self._downloader.report_warning(u'unable to extract uploader nickname')
649 if 'title' not in video_info:
650 raise ExtractorError(u'Unable to extract video title')
651 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
654 if 'thumbnail_url' not in video_info:
655 self._downloader.report_warning(u'unable to extract video thumbnail')
657 else: # don't panic if we can't find it
658 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
662 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
664 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
665 upload_date = unified_strdate(upload_date)
668 video_description = get_element_by_id("eow-description", video_webpage)
669 if video_description:
670 video_description = clean_html(video_description)
672 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
674 video_description = unescapeHTML(fd_mobj.group(1))
676 video_description = u''
679 video_subtitles = None
681 if self._downloader.params.get('writesubtitles', False):
682 video_subtitles = self._extract_subtitle(video_id)
684 (sub_error, sub_lang, sub) = video_subtitles[0]
686 # We try with the automatic captions
687 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
688 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
692 # We report the original error
693 self._downloader.report_error(sub_error)
695 if self._downloader.params.get('allsubtitles', False):
696 video_subtitles = self._extract_all_subtitles(video_id)
697 for video_subtitle in video_subtitles:
698 (sub_error, sub_lang, sub) = video_subtitle
700 self._downloader.report_error(sub_error)
702 if self._downloader.params.get('listsubtitles', False):
703 sub_lang_list = self._list_available_subtitles(video_id)
706 if 'length_seconds' not in video_info:
707 self._downloader.report_warning(u'unable to extract video duration')
710 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
713 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
715 # Decide which formats to download
716 req_format = self._downloader.params.get('format', None)
718 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
719 self.report_rtmp_download()
720 video_url_list = [(None, video_info['conn'][0])]
721 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
723 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
724 url_data = compat_parse_qs(url_data_str)
725 if 'itag' in url_data and 'url' in url_data:
726 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
727 if not 'ratebypass' in url: url += '&ratebypass=yes'
728 url_map[url_data['itag'][0]] = url
730 format_limit = self._downloader.params.get('format_limit', None)
731 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
732 if format_limit is not None and format_limit in available_formats:
733 format_list = available_formats[available_formats.index(format_limit):]
735 format_list = available_formats
736 existing_formats = [x for x in format_list if x in url_map]
737 if len(existing_formats) == 0:
738 raise ExtractorError(u'no known formats available for video')
739 if self._downloader.params.get('listformats', None):
740 self._print_formats(existing_formats)
742 if req_format is None or req_format == 'best':
743 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
744 elif req_format == 'worst':
745 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
746 elif req_format in ('-1', 'all'):
747 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
749 # Specific formats. We pick the first in a slash-delimeted sequence.
750 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
751 req_formats = req_format.split('/')
752 video_url_list = None
753 for rf in req_formats:
755 video_url_list = [(rf, url_map[rf])]
757 if video_url_list is None:
758 raise ExtractorError(u'requested format not available')
760 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
763 for format_param, video_real_url in video_url_list:
765 video_extension = self._video_extensions.get(format_param, 'flv')
767 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
768 self._video_dimensions.get(format_param, '???'))
772 'url': video_real_url,
773 'uploader': video_uploader,
774 'uploader_id': video_uploader_id,
775 'upload_date': upload_date,
776 'title': video_title,
777 'ext': video_extension,
778 'format': video_format,
779 'thumbnail': video_thumbnail,
780 'description': video_description,
781 'player_url': player_url,
782 'subtitles': video_subtitles,
783 'duration': video_duration
788 class MetacafeIE(InfoExtractor):
789 """Information Extractor for metacafe.com."""
791 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
792 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
793 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
794 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Report disclaimer retrieval."""
    message = u'Retrieving disclaimer'
    self.to_screen(message)
800 def _real_initialize(self):
801 # Retrieve disclaimer
802 request = compat_urllib_request.Request(self._DISCLAIMER)
804 self.report_disclaimer()
805 disclaimer = compat_urllib_request.urlopen(request).read()
806 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
807 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
812 'submit': "Continue - I'm over 18",
814 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
816 self.report_age_confirmation()
817 disclaimer = compat_urllib_request.urlopen(request).read()
818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
819 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
821 def _real_extract(self, url):
822 # Extract id and simplified title from URL
823 mobj = re.match(self._VALID_URL, url)
825 raise ExtractorError(u'Invalid URL: %s' % url)
827 video_id = mobj.group(1)
829 # Check if video comes from YouTube
830 mobj2 = re.match(r'^yt-(.*)$', video_id)
831 if mobj2 is not None:
832 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
834 # Retrieve video webpage to extract further information
835 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
837 # Extract URL, uploader and title from webpage
838 self.report_extraction(video_id)
839 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
841 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
842 video_extension = mediaURL[-3:]
844 # Extract gdaKey if available
845 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
849 gdaKey = mobj.group(1)
850 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
852 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
854 raise ExtractorError(u'Unable to extract media URL')
855 vardict = compat_parse_qs(mobj.group(1))
856 if 'mediaData' not in vardict:
857 raise ExtractorError(u'Unable to extract media URL')
858 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
860 raise ExtractorError(u'Unable to extract media URL')
861 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
862 video_extension = mediaURL[-3:]
863 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
865 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
867 raise ExtractorError(u'Unable to extract title')
868 video_title = mobj.group(1).decode('utf-8')
870 mobj = re.search(r'submitter=(.*?);', webpage)
872 raise ExtractorError(u'Unable to extract uploader nickname')
873 video_uploader = mobj.group(1)
876 'id': video_id.decode('utf-8'),
877 'url': video_url.decode('utf-8'),
878 'uploader': video_uploader.decode('utf-8'),
880 'title': video_title,
881 'ext': video_extension.decode('utf-8'),
884 class DailymotionIE(InfoExtractor):
885 """Information Extractor for Dailymotion"""
887 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
888 IE_NAME = u'dailymotion'
890 def _real_extract(self, url):
891 # Extract id and simplified title from URL
892 mobj = re.match(self._VALID_URL, url)
894 raise ExtractorError(u'Invalid URL: %s' % url)
896 video_id = mobj.group(1).split('_')[0].split('?')[0]
898 video_extension = 'mp4'
900 # Retrieve video webpage to extract further information
901 request = compat_urllib_request.Request(url)
902 request.add_header('Cookie', 'family_filter=off')
903 webpage = self._download_webpage(request, video_id)
905 # Extract URL, uploader and title from webpage
906 self.report_extraction(video_id)
907 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
909 raise ExtractorError(u'Unable to extract media URL')
910 flashvars = compat_urllib_parse.unquote(mobj.group(1))
912 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
915 self.to_screen(u'Using %s' % key)
918 raise ExtractorError(u'Unable to extract video URL')
920 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
922 raise ExtractorError(u'Unable to extract video URL')
924 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
926 # TODO: support choosing qualities
928 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
930 raise ExtractorError(u'Unable to extract title')
931 video_title = unescapeHTML(mobj.group('title'))
933 video_uploader = None
934 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
936 # lookin for official user
937 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
938 if mobj_official is None:
939 self._downloader.report_warning(u'unable to extract uploader nickname')
941 video_uploader = mobj_official.group(1)
943 video_uploader = mobj.group(1)
945 video_upload_date = None
946 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
948 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
953 'uploader': video_uploader,
954 'upload_date': video_upload_date,
955 'title': video_title,
956 'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            # The page embeds all metadata as a JSON blob; prefer it.
            info = json.loads(mobj.group('json'))
            return [{
                'id':          video_id,
                'url':         info[u'downloadUrl'],
                'uploader':    info[u'username'],
                # creationDate is a Unix timestamp; normalize to YYYYMMDD.
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':       info[u'title'],
                'ext':         video_extension,
                'thumbnail':   info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':          video_id.decode('utf-8'),
            'url':         video_url.decode('utf-8'),
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # Some pages expose a secondary content id via a YUI assignment;
        # the extraction path differs depending on whether it is present.
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Normalize MM/DD/YYYY to the YYYYMMDD convention.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Check for a failed match BEFORE touching the groups, otherwise
            # a miss raises AttributeError instead of the intended error.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is JSONP; strip the callback wrapper first.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title':video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # Player/pro links must be normalized to the canonical watch page.
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # Distinguish an embed restriction from a plain parse failure.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            # for/else: no quality bucket produced a candidate.
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title':       video_title,
            'ext':         video_extension,
            'thumbnail':   video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download a page and return its raw body, wrapping network errors."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and map the listed groups into a dict.

        matchTuples is a list of (group_index, key, error_message); a missing
        group raises ExtractorError with the given message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # NOTE(review): this method computes video_url but never returns it;
        # preserved as-is because _real_extract historically ignored it.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of redirect pages down to the final stream info."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':          info.get('id'),
            'url':         compat_urllib_parse.unquote(info.get('url')),
            'uploader':    u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title':       info.get('title').decode('utf-8'),
            'ext':         u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we never download the body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # No redirect happened: signal the caller to proceed normally.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have n
        # ids or the reported total is exhausted.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Shrink the limit once the API tells us the real total.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # Google paginates in steps of 10; stop when we have enough results
        # or the "next page" marker disappears.
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # 'm' carries the pagination metadata ('last'/'total') used below;
            # it was referenced but never bound in the broken version.
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                break

        return res
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class check
        # (which compiles without flags) cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so the playlist order survives paging.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in a channel page, in order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                # The ajax endpoint wraps the HTML fragments in JSON.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric users_id needed by the ajax endpoint is embedded in
        # the HTML page.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction notice to the user.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':          file_id.decode('utf-8'),
            'url':         file_url.decode('utf-8'),
            'uploader':    None,
            'upload_date': None,
            'title':       file_title,
            'ext':         file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in before extraction; failures only warn."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # netrc authenticators() returns (login, account, password)
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available: proceed anonymously.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between two fixed javascript snippets.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles three URL shapes: regular episode pages (queried with the
    JSON skin), /play/ embed pages (followed through their redirect to a
    canonical URL, then re-extracted), and api.swf# references (rewritten
    to /play/). The listing had lost the URL-guard, the query-separator
    computation, the try/else structure and the final return; restored here.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Embed URL: follow the redirect and re-run extraction on the
            # canonical http://blip.tv/a/a-<file_id> URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Ask for the JSON skin of the page.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' key.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    First tries the plain <source src=...> path; otherwise decrypts the
    RC4-encrypted player XML (key derived from the double-base64 GK
    constant and the video id) to obtain RTMP or HTTP sources. The
    listing had lost the guards, the RC4 loop headers and both returns;
    restored here.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """RC4-decrypt *data* (bytes) with *key* (bytes); returns text."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm (KSA).
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation algorithm (PRGA) + XOR.
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Hex MD5 of *s* (bytes), returned as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Double-base64-encoded static key material for the RC4 key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page exposes a direct <source> element.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': video_ext,
            }]

        # Hard case: pull flashvars and decrypt the player XML.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # The response is '<var>=<hex>'; keep the hex payload.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(base64^-2(GK) + md5(video_id)).
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'path')
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class
        # suitable() (plain re.match) would not work here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand :tds / :colbert style shortcuts to full-episode URLs.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The full-episodes index redirects to the newest episode;
            # re-match against the redirected URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL to the equivalent progressive-download one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract via the og:video player URL -> config JSON -> playlist.

        The listing had lost the URL guard and the final return; restored.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webpage = self._download_webpage(url, showName)

        videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)
        if videoDesc: videoDesc = unescapeHTML(videoDesc)

        imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)
        if imgUrl: imgUrl = unescapeHTML(imgUrl)

        playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')
        playerUrl = unescapeHTML(playerUrl)

        # The player URL carries the config URL in its query string.
        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }
        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Resolve the moogaloop metadata XML, then the f4m manifest, and
        build the Seg1-Frag1 media URL. Lost guards/try headers restored.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract flv URL, title and thumbnail from the video page.

        The listing had lost the URL guard and the final return; restored.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve the track via the API, then pick the mp3 stream URL.

        The listing had lost the URL guard and the final return; restored.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve the set via the API and return one entry per track.

        The listing had lost the URL guard, the error-path return and the
        per-track append/return; restored.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64 jsclassref into an rtmpe path and extract
        title/description from the page. Lost guards/return restored.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    NOTE(review): this extractor is py2-only as written — the various
    `.decode('utf-8')` calls operate on str objects and would raise
    AttributeError on Python 3. It is already disabled via _WORKING=False,
    so the py2-isms are preserved rather than rewritten.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe formats until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Three cases, by URL shape: a specific video (course+video groups), a
    course page (course group only, expanded into per-video references),
    or the root page (expanded into per-course references). The listing
    had lost the branch headers, try bodies and returns; restored here.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                           note='Downloading course info page',
                                           errnote='Unable to download course info page')

            info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
            info['title'] = unescapeHTML(info['title'])

            info['description'] = self._search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)
            if info['description']: info['description'] = unescapeHTML(info['description'])

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract via the mediaGen XML feed.

        BUG FIX: the result dict referenced `performer`, which was never
        assigned (NameError at runtime). The performer is now extracted
        from the mtv_an meta tag (the same tag the title came from) and
        the title is built as 'performer - song' when both are present.
        Lost guards/try headers also restored.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)
        if song_name: song_name = unescapeHTML(song_name)

        performer = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer')
        performer = unescapeHTML(performer)
        if song_name:
            video_title = performer + ' - ' + song_name
        else:
            video_title = performer

        mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos in numbered segments; the real file id is
    recovered by permuting an alphabet seeded from the playlist 'seed'.
    The listing had lost the guards, format-selection branches and the
    final return; restored here.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: millisecond timestamp + two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-permuted alphabet used to decode file ids."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear congruential step drives the permutation.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Map the '*'-separated index list onto the mixed alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku's stream names.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
# XNXXIE: scrapes the flv URL, title and thumbnail out of a video.xnxx.com
# page with three class-level regexes.
# NOTE(review): listing is elided (embedded line numbers jump), so some
# guard lines and the final info-dict assembly are not fully visible.
3049 class XNXXIE(InfoExtractor):
3050 """Information extractor for xnxx.com"""
3052 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3054 VIDEO_URL_RE = r'flv_url=(.*?)&'
3055 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3056 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3058 def _real_extract(self, url):
3059 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3061 raise ExtractorError(u'Invalid URL: %s' % url)
3062 video_id = mobj.group(1)
3064 # Get webpage content
3065 webpage = self._download_webpage(url, video_id)
# flv_url is percent-encoded in the page; unquote it before use.
3067 video_url = self._search_regex(self.VIDEO_URL_RE,
3068 webpage, u'video URL')
3069 video_url = compat_urllib_parse.unquote(video_url)
3071 video_title = self._search_regex(self.VIDEO_TITLE_RE,
3074 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3075 webpage, u'thumbnail', fatal=False)
3081 'upload_date': None,
3082 'title': video_title,
3084 'thumbnail': video_thumbnail,
3085 'description': None,
# GooglePlusIE: extracts a video embedded in a plus.google.com post.
# Two-step scrape: the post page yields metadata and the photo/video page
# URL; the video page yields redirector.googlevideo.com links per size.
# NOTE(review): listing is elided (embedded line numbers jump), so some
# guard lines and the final return are not fully visible.
3089 class GooglePlusIE(InfoExtractor):
3090 """Information extractor for plus.google.com."""
3092 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3093 IE_NAME = u'plus.google'
3095 def _real_extract(self, url):
3096 # Extract id from URL
3097 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3099 raise ExtractorError(u'Invalid URL: %s' % url)
3101 post_url = mobj.group(0)
3102 video_id = mobj.group(1)
3104 video_extension = 'flv'
3106 # Step 1, Retrieve post webpage to extract further information
3107 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3109 self.report_extraction(video_id)
3111 # Extract update date
3112 upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
3113 webpage, u'upload date', fatal=False)
3115 # Convert timestring to a format suitable for filename
# Normalizes the scraped "YYYY-MM-DD" timestamp to YYYYMMDD.
3116 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3117 upload_date = upload_date.strftime('%Y%m%d')
3120 uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
3121 webpage, u'uploader', fatal=False)
3124 # Get the first line for title
3125 video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3126 webpage, 'title', default=u'NA')
3128 # Step 2, Stimulate clicking the image box to launch video
3129 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3130 webpage, u'video page URL')
3131 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3133 # Extract video links on video page
3134 """Extract video links of all sizes"""
3135 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3136 mobj = re.findall(pattern, webpage)
# NOTE(review): the emptiness check guarding this raise is elided.
3138 raise ExtractorError(u'Unable to extract video links')
3140 # Sort in resolution
3141 links = sorted(mobj)
3143 # Choose the lowest of the sort, i.e. highest resolution
3144 video_url = links[-1]
3145 # Only get the url. The resolution part in the tuple has no use anymore
3146 video_url = video_url[-1]
3147 # Treat escaped \u0026 style hex
# Python 2 path decodes the escapes directly; Python 3 str has no .decode,
# so the AttributeError branch round-trips through bytes.
3149 video_url = video_url.decode("unicode_escape")
3150 except AttributeError: # Python 3
3151 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3157 'uploader': uploader,
3158 'upload_date': upload_date,
3159 'title': video_title,
3160 'ext': video_extension,
# NBAIE: builds the CDN mp4 URL for nba.com videos directly from the URL
# path and scrapes title/date/description from the page.
# NOTE(review): listing is elided (embedded line numbers jump); guard lines
# and parts of the final info dict are not visible.
3163 class NBAIE(InfoExtractor):
3164 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3167 def _real_extract(self, url):
3168 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3170 raise ExtractorError(u'Invalid URL: %s' % url)
3172 video_id = mobj.group(1)
3173 if video_id.endswith('/index.html'):
3174 video_id = video_id[:-len('/index.html')]
3176 webpage = self._download_webpage(url, video_id)
# The video URL is deterministic: CDN prefix + path id + fixed 720p suffix.
3178 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3180 shortened_video_id = video_id.rpartition('/')[2]
3181 title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
3182 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3184 uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3186 description = self._search_regex(r'<div class="description">(.*?)</h1>', webpage, 'description', fatal=False)
3189 'id': shortened_video_id,
3193 'uploader_date': uploader_date,
3194 'description': description,
# JustinTVIE: extractor for justin.tv / twitch.tv. Handles three URL shapes:
# a channel (paged archive listing via the JSON API), a chapter (XML broadcast
# lookup + kraken metadata), and a single archived broadcast.
# NOTE(review): listing is elided (embedded line numbers jump), so several
# guard lines, 'else:' branches, loop headers and returns are not visible.
3198 class JustinTVIE(InfoExtractor):
3199 """Information extractor for justin.tv and twitch.tv"""
3200 # TODO: One broadcast may be split into multiple videos. The key
3201 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3202 # starts at 1 and increases. Can we treat all parts as one video?
3204 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3206 (?P<channelid>[^/]+)|
3207 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3208 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3212 _JUSTIN_PAGE_LIMIT = 100
3213 IE_NAME = u'justin.tv'
3215 def report_download_page(self, channel, offset):
3216 """Report attempt to download a single page of videos."""
3217 self.to_screen(u'%s: Downloading video information from %d to %d' %
3218 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3220 # Return count of items, list of *valid* items
3221 def _parse_page(self, url, video_id):
3222 webpage = self._download_webpage(url, video_id,
3223 u'Downloading video info JSON',
3224 u'unable to download video info JSON')
# A non-list response is the API's error envelope (a dict with 'error').
3226 response = json.loads(webpage)
3227 if type(response) != list:
3228 error_text = response.get('error', 'unknown error')
3229 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3231 for clip in response:
3232 video_url = clip['video_file_url']
3234 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip dashes from the date part -> YYYYMMDD.
3235 video_date = re.sub('-', '', clip['start_time'][:10])
3236 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3237 video_id = clip['id']
3238 video_title = clip.get('title', video_id)
3242 'title': video_title,
3243 'uploader': clip.get('channel_name', video_uploader_id),
3244 'uploader_id': video_uploader_id,
3245 'upload_date': video_date,
3246 'ext': video_extension,
3248 return (len(response), info)
3250 def _real_extract(self, url):
3251 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3253 raise ExtractorError(u'invalid URL: %s' % url)
3255 api_base = 'http://api.justin.tv'
3257 if mobj.group('channelid'):
3259 video_id = mobj.group('channelid')
3260 api = api_base + '/channel/archives/%s.json' % video_id
3261 elif mobj.group('chapterid'):
3262 chapter_id = mobj.group('chapterid')
3264 webpage = self._download_webpage(url, chapter_id)
3265 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
# NOTE(review): the 'if m is None:' guard for this raise is elided.
3267 raise ExtractorError(u'Cannot find archive of a chapter')
3268 archive_id = m.group(1)
3270 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3271 chapter_info_xml = self._download_webpage(api, chapter_id,
3272 note=u'Downloading chapter information',
3273 errnote=u'Chapter information download failed')
# Locate the <archive> element whose <id> matches the page's archive_id.
3274 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3275 for a in doc.findall('.//archive'):
3276 if archive_id == a.find('./id').text:
3279 raise ExtractorError(u'Could not find chapter in chapter information')
3281 video_url = a.find('./video_file_url').text
3282 video_ext = video_url.rpartition('.')[2] or u'flv'
# Chapter title/thumbnail/etc. come from the newer kraken API.
3284 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3285 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3286 note='Downloading chapter metadata',
3287 errnote='Download of chapter metadata failed')
3288 chapter_info = json.loads(chapter_info_json)
3290 bracket_start = int(doc.find('.//bracket_start').text)
3291 bracket_end = int(doc.find('.//bracket_end').text)
3293 # TODO determine start (and probably fix up file)
3294 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3295 #video_url += u'?start=' + TODO:start_timestamp
3296 # bracket_start is 13290, but we want 51670615
3297 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3298 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3301 'id': u'c' + chapter_id,
3304 'title': chapter_info['title'],
3305 'thumbnail': chapter_info['preview'],
3306 'description': chapter_info['description'],
3307 'uploader': chapter_info['channel']['display_name'],
3308 'uploader_id': chapter_info['channel']['name'],
# Fallback branch: a single archived broadcast ('/b/' URLs).
3312 video_id = mobj.group('videoid')
3313 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3315 self.report_extraction(video_id)
# Page through the archive API until a short page signals the end.
3319 limit = self._JUSTIN_PAGE_LIMIT
3322 self.report_download_page(video_id, offset)
3323 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3324 page_count, page_info = self._parse_page(page_url, video_id)
3325 info.extend(page_info)
3326 if not paged or page_count != limit:
# FunnyOrDieIE: scrapes the <video>/<source> URL, title and og:description
# from a funnyordie.com video page.
# NOTE(review): listing is elided; the guard for the URL raise and the final
# return/info dict are not visible.
3331 class FunnyOrDieIE(InfoExtractor):
3332 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3334 def _real_extract(self, url):
3335 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3337 raise ExtractorError(u'invalid URL: %s' % url)
3339 video_id = mobj.group('id')
3340 webpage = self._download_webpage(url, video_id)
3342 video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3343 webpage, u'video URL', flags=re.DOTALL)
3344 video_url = unescapeHTML(video_url)
# Title: try the player h1 first, then fall back to the <title> tag.
3346 title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3347 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3348 title = clean_html(title)
3350 video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3351 webpage, u'description', flags=re.DOTALL)
3352 if video_description: video_description = unescapeHTML(video_description)
3359 'description': video_description,
# SteamIE: extracts all trailers from a store.steampowered.com game page,
# going through the agecheck URL to bypass the age gate, and returns them as
# a playlist. suitable() is overridden because _VALID_URL needs re.VERBOSE.
# NOTE(review): listing is elided; parts of the _VALID_URL pattern and of the
# per-video info dict are not visible.
3363 class SteamIE(InfoExtractor):
3364 _VALID_URL = r"""http://store\.steampowered\.com/
3366 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3368 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3372 def suitable(cls, url):
3373 """Receives a URL and returns True if suitable for this IE."""
3374 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3376 def _real_extract(self, url):
3377 m = re.match(self._VALID_URL, url, re.VERBOSE)
3378 gameID = m.group('gameID')
# Fixed birthday parameters satisfy the store's age check.
3379 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3380 self.report_age_confirmation()
3381 webpage = self._download_webpage(videourl, gameID)
3382 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
# Three parallel scans: movie URLs, titles, thumbnails, zipped positionally.
3384 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3385 mweb = re.finditer(urlRE, webpage)
3386 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3387 titles = re.finditer(namesRE, webpage)
3388 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3389 thumbs = re.finditer(thumbsRE, webpage)
3391 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3392 video_id = vid.group('videoID')
3393 title = vtitle.group('videoName')
3394 video_url = vid.group('videoURL')
3395 video_thumb = thumb.group('thumbnail')
# NOTE(review): the 'if not video_url:' style guard for this raise is elided.
3397 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3402 'title': unescapeHTML(title),
3403 'thumbnail': video_thumb
3406 return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: builds the CDN flv URL for a recorded ustream.tv video from its
# id and scrapes title/uploader/thumbnail from the page.
# NOTE(review): listing is elided; the final return/info dict is not fully
# visible.
3408 class UstreamIE(InfoExtractor):
3409 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3410 IE_NAME = u'ustream'
3412 def _real_extract(self, url):
3413 m = re.match(self._VALID_URL, url)
3414 video_id = m.group('videoID')
# Direct CDN URL derived from the video id; no API call needed.
3416 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3417 webpage = self._download_webpage(url, video_id)
3419 self.report_extraction(video_id)
3421 video_title = self._search_regex(r'data-title="(?P<title>.+)"',
3424 uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3425 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3426 if uploader: uploader = unescapeHTML(uploader.strip())
3428 thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3429 webpage, u'thumbnail', fatal=False)
3435 'title': video_title,
3436 'uploader': uploader,
3437 'thumbnail': thumbnail,
# WorldStarHipHopIE: scrapes the player's addVariable("file", ...) URL plus
# title/thumbnail; has a special-case title lookup for "WSHH candy" pages.
# NOTE(review): listing is elided; the mp4/flv ext branch and the final info
# dict are not fully visible.
3441 class WorldStarHipHopIE(InfoExtractor):
3442 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3443 IE_NAME = u'WorldStarHipHop'
3445 def _real_extract(self, url):
3446 m = re.match(self._VALID_URL, url)
3447 video_id = m.group('id')
3449 webpage_src = self._download_webpage(url, video_id)
3451 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3452 webpage_src, u'video URL')
# File extension is inferred from the URL (mp4 vs. the elided alternative).
3454 if 'mp4' in video_url:
3459 video_title = self._search_regex(r"<title>(.*)</title>",
3460 webpage_src, u'title')
3462 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3463 thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
3464 webpage_src, u'thumbnail', fatal=False)
# Candy pages carry the real title in a candytitles span instead.
3467 _title = r"""candytitles.*>(.*)</span>"""
3468 mobj = re.search(_title, webpage_src)
3469 if mobj is not None:
3470 video_title = mobj.group(1)
3475 'title' : video_title,
3476 'thumbnail' : thumbnail,
# RBMARadioIE: pulls the show's JSON blob out of an inline <script>, then
# builds the akamai stream URL (fixed 256kbps cbr) and rich metadata from it.
# NOTE(review): listing is elided; the final return/info dict opening is not
# visible.
3481 class RBMARadioIE(InfoExtractor):
3482 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3484 def _real_extract(self, url):
3485 m = re.match(self._VALID_URL, url)
3486 video_id = m.group('videoID')
3488 webpage = self._download_webpage(url, video_id)
3490 json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
3491 webpage, u'json data')
3494 data = json.loads(json_data)
3495 except ValueError as e:
3496 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Append the bitrate selector; extension comes from the URL path.
3498 video_url = data['akamai_url'] + '&cbr=256'
3499 url_parts = compat_urllib_parse_urlparse(video_url)
3500 video_ext = url_parts.path.rpartition('.')[2]
3505 'title': data['title'],
3506 'description': data.get('teaser_text'),
3507 'location': data.get('country_of_origin'),
3508 'uploader': data.get('host', {}).get('name'),
3509 'uploader_id': data.get('host', {}).get('slug'),
3510 'thumbnail': data.get('image', {}).get('large_url_2x'),
3511 'duration': data.get('duration'),
# YouPornIE: scrapes the download list of a youporn.com video, builds one
# format entry per link (format id derived from the URL path), and supports
# --list-formats plus best/worst/all/specific format selection.
# NOTE(review): listing is elided (embedded line numbers jump), so several
# loop headers, returns and parts of the per-format dict are not visible.
3516 class YouPornIE(InfoExtractor):
3517 """Information extractor for youporn.com."""
3518 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3520 def _print_formats(self, formats):
3521 """Print all available formats"""
3522 print(u'Available formats:')
3523 print(u'ext\t\tformat')
3524 print(u'---------------------------------')
3525 for format in formats:
3526 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the entry matching a requested format id (loop header elided).
3528 def _specific(self, req_format, formats):
3530 if(x["format"]==req_format):
3534 def _real_extract(self, url):
3535 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3537 raise ExtractorError(u'Invalid URL: %s' % url)
3538 video_id = mobj.group('videoid')
# The age_verified cookie bypasses the age gate.
3540 req = compat_urllib_request.Request(url)
3541 req.add_header('Cookie', 'age_verified=1')
3542 webpage = self._download_webpage(req, video_id)
3544 # Get the video title
3545 video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
3546 webpage, u'title').strip()
3548 # Get the video date
3549 upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
3550 webpage, u'upload date', fatal=False)
3551 if upload_date: upload_date = unified_strdate(upload_date.strip())
3553 # Get the video uploader
3554 video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
3555 webpage, u'uploader', fatal=False)
3556 if video_uploader: video_uploader = clean_html(video_uploader.strip())
3558 # Get all of the formats available
3559 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3560 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3561 webpage, u'download list').strip()
3563 # Get all of the links from the page
3564 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3565 links = re.findall(LINK_RE, download_list_html)
3566 if(len(links) == 0):
3567 raise ExtractorError(u'ERROR: no known formats available for video')
3569 self.to_screen(u'Links found: %d' % len(links))
3574 # A link looks like this:
3575 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3576 # A path looks like this:
3577 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format id (e.g. "480p-370k") is parsed out of path segment 4.
3578 video_url = unescapeHTML( link )
3579 path = compat_urllib_parse_urlparse( video_url ).path
3580 extension = os.path.splitext( path )[1][1:]
3581 format = path.split('/')[4].split('_')[:2]
3584 format = "-".join( format )
3585 title = u'%s-%s-%s' % (video_title, size, bitrate)
3590 'uploader': video_uploader,
3591 'upload_date': upload_date,
3596 'description': None,
# Format selection: list, best (default), worst, all, or an exact id.
3600 if self._downloader.params.get('listformats', None):
3601 self._print_formats(formats)
3604 req_format = self._downloader.params.get('format', None)
3605 self.to_screen(u'Format: %s' % req_format)
3607 if req_format is None or req_format == 'best':
3609 elif req_format == 'worst':
3610 return [formats[-1]]
3611 elif req_format in ('-1', 'all'):
3614 format = self._specific( req_format, formats )
# NOTE(review): the 'if format is None:' style guard for this raise is elided.
3616 raise ExtractorError(u'Requested format not available')
# PornotubeIE: takes the video id and title from the URL itself, then scrapes
# the flv URL and upload date from the page.
# NOTE(review): listing is elided; the guard for the URL raise and parts of
# the info dict are not visible.
3621 class PornotubeIE(InfoExtractor):
3622 """Information extractor for pornotube.com."""
3623 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3625 def _real_extract(self, url):
3626 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3628 raise ExtractorError(u'Invalid URL: %s' % url)
3630 video_id = mobj.group('videoid')
3631 video_title = mobj.group('title')
3633 # Get webpage content
3634 webpage = self._download_webpage(url, video_id)
3637 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3638 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3639 video_url = compat_urllib_parse.unquote(video_url)
3641 #Get the uploaded date
3642 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3643 upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3644 if upload_date: upload_date = unified_strdate(upload_date)
3646 info = {'id': video_id,
3649 'upload_date': upload_date,
3650 'title': video_title,
# YouJizzIE: resolves the embed page from the watch page, then scrapes the
# encodeURIComponent'd file URL out of the embed player setup.
# NOTE(review): listing is elided; guard lines and the tail of the info dict
# are not visible.
3656 class YouJizzIE(InfoExtractor):
3657 """Information extractor for youjizz.com."""
3658 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3660 def _real_extract(self, url):
3661 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3663 raise ExtractorError(u'Invalid URL: %s' % url)
3665 video_id = mobj.group('videoid')
3667 # Get webpage content
3668 webpage = self._download_webpage(url, video_id)
3670 # Get the video title
3671 video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
3672 webpage, u'title').strip()
3674 # Get the embed page
3675 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
# NOTE(review): the 'if result is None:' guard for this raise is elided.
3677 raise ExtractorError(u'ERROR: unable to extract embed page')
# The embed page's numeric id replaces the slug-based id from the watch URL.
3679 embed_page_url = result.group(0).strip()
3680 video_id = result.group('videoid')
3682 webpage = self._download_webpage(embed_page_url, video_id)
3685 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3686 webpage, u'video URL')
3688 info = {'id': video_id,
3690 'title': video_title,
3693 'player_url': embed_page_url}
# EightTracksIE: extracts all songs of an 8tracks.com mix by replaying the
# player's session API, fetching one track per request until at_last_track.
# NOTE(review): listing is elided; mix_id assignment, the per-track dict
# opening and the final return are not visible.
3697 class EightTracksIE(InfoExtractor):
3699 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3701 def _real_extract(self, url):
3702 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3704 raise ExtractorError(u'Invalid URL: %s' % url)
3705 playlist_id = mobj.group('id')
3707 webpage = self._download_webpage(url, playlist_id)
3709 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3710 data = json.loads(json_like)
# A random session id mimics the web player's play session.
3712 session = str(random.randint(0, 1000000000))
3714 track_count = data['tracks_count']
3715 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3716 next_url = first_url
3718 for i in itertools.count():
3719 api_json = self._download_webpage(next_url, playlist_id,
3720 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3721 errnote=u'Failed to download song information')
3722 api_data = json.loads(api_json)
3723 track_data = api_data[u'set']['track']
3725 'id': track_data['id'],
3726 'url': track_data['track_file_stream_url'],
3727 'title': track_data['performer'] + u' - ' + track_data['name'],
3728 'raw_title': track_data['name'],
3729 'uploader_id': data['user']['login'],
# Stop once the API flags the last track; otherwise chain to /next.
3733 if api_data['set']['at_last_track']:
3735 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: builds CDN video/thumbnail URLs directly from the keek id and
# scrapes title/uploader from the page.
# NOTE(review): listing is elided; the info dict opening and 'ext' entry are
# not visible.
3738 class KeekIE(InfoExtractor):
3739 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3742 def _real_extract(self, url):
3743 m = re.match(self._VALID_URL, url)
3744 video_id = m.group('videoID')
# Both URLs are deterministic functions of the video id.
3746 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3747 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3748 webpage = self._download_webpage(url, video_id)
3750 video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3752 video_title = unescapeHTML(video_title)
3754 uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3755 webpage, u'uploader', fatal=False)
3756 if uploader: uploader = clean_html(uploader)
3762 'title': video_title,
3763 'thumbnail': thumbnail,
3764 'uploader': uploader
# TEDIE: handles both single TED talks and TED playlists. Playlists are
# expanded into url_result entries pointing back at this extractor; a talk's
# download URL is derived from its mediaSlug. suitable() is overridden
# because _VALID_URL requires re.VERBOSE.
# NOTE(review): listing is elided (embedded line numbers jump), so parts of
# _VALID_URL, the 'else' branch of _real_extract and the talk info dict are
# not visible.
3768 class TEDIE(InfoExtractor):
3769 _VALID_URL=r'''http://www\.ted\.com/
3771 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3773 ((?P<type_talk>talks)) # We have a simple talk
3775 (/lang/(.*?))? # The url may contain the language
3776 /(?P<name>\w+) # Here goes the name and then ".html"
3780 def suitable(cls, url):
3781 """Receives a URL and returns True if suitable for this IE."""
3782 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3784 def _real_extract(self, url):
3785 m=re.match(self._VALID_URL, url, re.VERBOSE)
3786 if m.group('type_talk'):
3787 return [self._talk_info(url)]
# Playlist branch (its 'else:' line is elided from this view).
3789 playlist_id=m.group('playlist_id')
3790 name=m.group('name')
3791 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3792 return [self._playlist_videos_info(url,name,playlist_id)]
3794 def _talk_video_link(self,mediaSlug):
3795 '''Returns the video link for that mediaSlug'''
3796 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3798 def _playlist_videos_info(self,url,name,playlist_id=0):
3799 '''Returns the videos of the playlist'''
3801 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3802 ([.\s]*?)data-playlist_item_id="(\d+)"
3803 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3805 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3806 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3807 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3808 m_names=re.finditer(video_name_RE,webpage)
3810 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3811 m_playlist = re.search(playlist_RE, webpage)
3812 playlist_title = m_playlist.group('playlist_title')
# Each talk becomes a url_result delegated back to TEDIE.
3814 playlist_entries = []
3815 for m_video, m_name in zip(m_videos,m_names):
3816 video_id=m_video.group('video_id')
3817 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3818 playlist_entries.append(self.url_result(talk_url, 'TED'))
3819 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3821 def _talk_info(self, url, video_id=0):
3822 """Return the video for the talk in the url"""
3823 m=re.match(self._VALID_URL, url,re.VERBOSE)
3824 videoName=m.group('name')
3825 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3826 # If the url includes the language we get the title translated
3827 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3828 title=re.search(title_RE, webpage).group('title')
3829 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3830 "id":(?P<videoID>[\d]+).*?
3831 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3832 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3833 thumb_match=re.search(thumb_RE,webpage)
3834 info_match=re.search(info_RE,webpage,re.VERBOSE)
3835 video_id=info_match.group('videoID')
3836 mediaSlug=info_match.group('mediaSlug')
3837 video_url=self._talk_video_link(mediaSlug)
3843 'thumbnail': thumb_match.group('thumbnail')
# MySpassIE: takes the video id from the last (or second-to-last, when the
# URL has a trailing slash) path element and reads all metadata from the
# site's XML metadata endpoint.
# NOTE(review): listing is elided; the trailing-slash condition, the
# 'description'/'thumbnail' else-branches and the final info dict opening are
# not visible.
3847 class MySpassIE(InfoExtractor):
3848 _VALID_URL = r'http://www.myspass.de/.*'
3850 def _real_extract(self, url):
3851 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3853 # video id is the last path element of the URL
3854 # usually there is a trailing slash, so also try the second but last
3855 url_path = compat_urllib_parse_urlparse(url).path
3856 url_parent_path, video_id = os.path.split(url_path)
3858 _, video_id = os.path.split(url_parent_path)
3861 metadata_url = META_DATA_URL_TEMPLATE % video_id
3862 metadata_text = self._download_webpage(metadata_url, video_id)
3863 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3865 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail optional.
3866 url_flv_el = metadata.find('url_flv')
3867 if url_flv_el is None:
3868 raise ExtractorError(u'Unable to extract download url')
3869 video_url = url_flv_el.text
3870 extension = os.path.splitext(video_url)[1][1:]
3871 title_el = metadata.find('title')
3872 if title_el is None:
3873 raise ExtractorError(u'Unable to extract title')
3874 title = title_el.text
3875 format_id_el = metadata.find('format_id')
3876 if format_id_el is None:
3879 format = format_id_el.text
3880 description_el = metadata.find('description')
3881 if description_el is not None:
3882 description = description_el.text
3885 imagePreview_el = metadata.find('imagePreview')
3886 if imagePreview_el is not None:
3887 thumbnail = imagePreview_el.text
3896 'thumbnail': thumbnail,
3897 'description': description
# SpiegelIE: scrapes the title from the article page, then reads the stream
# list from the per-video XML manifest and picks the last <type> entry.
# NOTE(review): listing is elided; the final info dict opening is not
# visible.
3901 class SpiegelIE(InfoExtractor):
3902 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3904 def _real_extract(self, url):
3905 m = re.match(self._VALID_URL, url)
3906 video_id = m.group('videoID')
3908 webpage = self._download_webpage(url, video_id)
3910 video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
3912 video_title = unescapeHTML(video_title)
3914 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3915 xml_code = self._download_webpage(xml_url, video_id,
3916 note=u'Downloading XML', errnote=u'Failed to download XML')
# The last child element of the manifest is taken as the chosen variant.
3918 idoc = xml.etree.ElementTree.fromstring(xml_code)
3919 last_type = idoc[-1]
3920 filename = last_type.findall('./filename')[0].text
3921 duration = float(last_type.findall('./duration')[0].text)
3923 video_url = 'http://video2.spiegel.de/flash/' + filename
3924 video_ext = filename.rpartition('.')[2]
3929 'title': video_title,
3930 'duration': duration,
# LiveLeakIE: scrapes the player's file URL plus og:title/og:description and
# a best-effort uploader name from a liveleak.com view page.
# NOTE(review): listing is elided; the guard for the URL raise and the final
# info dict opening are not visible.
3934 class LiveLeakIE(InfoExtractor):
3936 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3937 IE_NAME = u'liveleak'
3939 def _real_extract(self, url):
3940 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the 'if mobj is None:' guard for this raise is elided.
3942 raise ExtractorError(u'Invalid URL: %s' % url)
3944 video_id = mobj.group('video_id')
3946 webpage = self._download_webpage(url, video_id)
3948 video_url = self._search_regex(r'file: "(.*?)",',
3949 webpage, u'video URL')
3951 video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
# Site prefixes titles with "LiveLeak.com -"; strip it for a clean title.
3953 video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
3955 video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3956 webpage, u'description', fatal=False)
3957 if video_description: video_description = unescapeHTML(video_description)
3959 video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
3960 webpage, u'uploader', fatal=False)
3966 'title': video_title,
3967 'description': video_description,
3968 'uploader': video_uploader
# ARDIE: extractor for ARD Mediathek / daserste.de. Prefers the numeric
# documentId query parameter as video id, collects all addMediaStream()
# calls from the page, and picks media_type 0 at the highest quality.
# Handles both RTMP streams and direct HTTP mp4 downloads.
# NOTE(review): listing is elided; some if/else lines around the id choice,
# the fsk (age-restriction) check and the final return are not visible.
3973 class ARDIE(InfoExtractor):
3974 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3975 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3976 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3978 def _real_extract(self, url):
3979 # determine video id from url
3980 m = re.match(self._VALID_URL, url)
3982 numid = re.search(r'documentId=([0-9]+)', url)
3984 video_id = numid.group(1)
3986 video_id = m.group('video_id')
3988 # determine title and media streams from webpage
3989 html = self._download_webpage(url, video_id)
3990 title = re.search(self._TITLE, html).group('title')
3991 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker in the page => age-restricted before 8 pm.
3993 assert '"fsk"' in html
3994 raise ExtractorError(u'This video is only available after 8:00 pm')
3996 # choose default media type and highest quality for now
3997 stream = max([s for s in streams if int(s["media_type"]) == 0],
3998 key=lambda s: int(s["quality"]))
4000 # there's two possibilities: RTMP stream or HTTP download
4001 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4002 if stream['rtmp_url']:
4003 self.to_screen(u'RTMP download detected')
4004 assert stream['video_url'].startswith('mp4:')
4005 info["url"] = stream["rtmp_url"]
4006 info["play_path"] = stream['video_url']
4008 assert stream["video_url"].endswith('.mp4')
4009 info["url"] = stream["video_url"]
# TumblrIE: normalizes the post URL, then scrapes the escaped (\xNN-encoded)
# video_file URL, extension, thumbnail and page title from the post HTML.
# NOTE(review): listing is elided; the 'if video is None:' guard and the tail
# of the returned dict are not visible.
4012 class TumblrIE(InfoExtractor):
4013 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4015 def _real_extract(self, url):
4016 m_url = re.match(self._VALID_URL, url)
4017 video_id = m_url.group('id')
4018 blog = m_url.group('blog_name')
# Rebuild a canonical /post/<id>/ URL regardless of the original form.
4020 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4021 webpage = self._download_webpage(url, video_id)
# The embed markup is JS-escaped, hence the \\x22 (= '"') delimiters.
4023 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4024 video = re.search(re_video, webpage)
4026 raise ExtractorError(u'Unable to extract video')
4027 video_url = video.group('video_url')
4028 ext = video.group('ext')
4030 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4031 webpage, u'thumbnail', fatal=False) # We pick the first poster
4032 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4034 # The only place where you can get a title, it's not complete,
4035 # but searching in other places doesn't work for all videos
4036 video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
4037 webpage, u'title', flags=re.DOTALL)
4038 video_title = unescapeHTML(video_title)
4040 return [{'id': video_id,
4042 'title': video_title,
4043 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # `video_id` instead of `id`: do not shadow the builtin.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist'],
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata (including the direct MP4 URL) lives in a separate MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Always fetch the canonical page for the id; the mobile MP4 URL is
        # embedded in the desktop page's player config.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # Direct stream URL is exposed via the Twitter player card meta tag.
        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both playlist requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # Final URL is APP + FULLPATH; FULLPATH may contain HTML entities.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id (needed for the data XML) is only in the page markup.
        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No streaming server given: 'file' is already a full, url-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Description is optional; fall back to an empty string.
        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
        if mobj is None:
            video_description = u''
        else:
            video_description = unescapeHTML(mobj.group('description'))

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract upload date')
        # Normalize to the YYYYMMDD convention used by upload_date.
        video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')

        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
        if mobj is None:
            video_uploader_id = u'anonymous'
        else:
            video_uploader_id = mobj.group('uploader_id')

        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail URL')
        video_thumbnail = mobj.group('thumbnail')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The 'ax'/'ts' query parameters are required for the page to embed
        # the track data; the response cookie is needed for the serve request.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)
        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE | re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Unable to extract tracks')
        html_tracks = mobj.group(1).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the extractor list appears truncated in this view -- only a
    # few of the entries are visible below and the list opener is missing.
    # Confirm against the complete file before editing; ordering here is
    # significant (more specific extractors must precede generic ones).
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # The convention is that every extractor class is named <Name>IE and
    # lives at module level, so a plain globals() lookup resolves it.
    class_name = ie_name + 'IE'
    return globals()[class_name]