2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Register the downloader so helpers (to_screen() etc.) can use it.
        # NOTE(review): one original line between the docstring and this call
        # is not visible in this extract.
        self.set_downloader(downloader)
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
    # NOTE(review): lines are missing from this extract here — the `def`
    # headers for the working() and initialize() methods are not visible,
    # only their docstrings/bodies.
        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): at least one line between the docstring and this
        # return is not visible in this extract.
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # May be None (see the __init__ default); methods that print through
        # self._downloader assume a real downloader has been set first.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): fragment — the property/method header this return belongs
    # to (IE_NAME, judging by the two-character suffix strip) is not visible.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): the `if note is None:` / `try:` / `if errnote is None:`
        # header lines appear to be missing from this extract.
            self.report_download_webpage(video_id)
        elif note is not False:
            # A custom progress note was supplied by the caller.
            self.to_screen(u'%s: %s' % (video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = u'Unable to download webpage'
            # Re-raise as the project's ExtractorError, keeping the traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header, if any.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
                # Request objects expose the URL; plain strings raise here.
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps binary page contents printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): the trailing `return video_info` is presumably on a
        # line not visible in this extract.
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      # NOTE(review): the remaining dict entries and the
                      # return are not visible in this extract.
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): the dict tail and the guarding `if` headers for the
        # two optional assignments below are not visible in this extract.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
    def _search_regex(self, pattern, text, name, fatal=True, flags=0):
        """Extract a field from some text based on regex"""
        mobj = re.search(pattern, text, flags)
        if mobj is None and fatal:
            raise ExtractorError(u'Unable to extract %s; '
                u'please report this issue on GitHub.' % name)
            # Non-fatal miss: warn only (the elif header and its return are
            # not visible in this extract).
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % name)

        # return the first matched group
        return next(g for g in mobj.groups() if g is not None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
216 def _make_valid_url(cls):
217 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
220 def suitable(cls, url):
221 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Split "<key>N:terms" into a result-count prefix and search terms.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix: default to a single result (the `if` header for
            # this branch is not visible in this extract).
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
243 def _get_n_results(self, query, n):
244 """Get a specified number of results for a query"""
245 raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the lines below are the interior of the _VALID_URL
    # verbose-regex literal; the `_VALID_URL = r"""...` opener and the
    # closing delimiter are not visible in this extract.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    # Hard-coded helper URLs / patterns used for language, login and
    # age-gate handling.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; most entries are not visible in this extract.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> dimension label (presumably "WxH"); entries not visible here.
    _video_dimensions = {
309 def suitable(cls, url):
310 """Receives a URL and returns True if suitable for this IE."""
311 if YoutubePlaylistIE.suitable(url): return False
312 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
314 def report_lang(self):
315 """Report attempt to set language."""
316 self.to_screen(u'Setting language')
318 def report_login(self):
319 """Report attempt to log in."""
320 self.to_screen(u'Logging in')
322 def report_video_webpage_download(self, video_id):
323 """Report attempt to download video webpage."""
324 self.to_screen(u'%s: Downloading video webpage' % video_id)
326 def report_video_info_webpage_download(self, video_id):
327 """Report attempt to download video info webpage."""
328 self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report that available subtitles are being checked."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (language + format)."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
338 def report_video_subtitles_available(self, video_id, sub_lang_list):
339 """Report available subtitles."""
340 sub_lang = ",".join(list(sub_lang_list.keys()))
341 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
343 def report_information_extraction(self, video_id):
344 """Report attempt to extract video information."""
345 self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
351 def report_rtmp_download(self):
352 """Indicate the download will use the RTMP protocol."""
353 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle language list; on error return (message, None)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name scraped from the XML listing.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): the success-path return is not visible in this extract.
368 def _list_available_subtitles(self, video_id):
369 sub_lang_list = self._get_available_subtitles(video_id)
370 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        # NOTE(review): the docstring's opening line is not visible in this
        # extract; the return contract appears to be the tuple:
        (error_message, sub_lang, sub)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            # (query parameters not visible in this extract)
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            # Empty body: the server had no subtitles for this track.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives inside the inline ytplayer.config JSON blob.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
                # (query parameters not visible in this extract)
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
            # Fallback (presumably a KeyError handler; header not visible).
            return [(err_msg, None, None)]
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # Fallback: first available language. NOTE(review): the 'en'
            # branch body and the else: header are not visible in this extract.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download one subtitle track per available language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # NOTE(review): the `subtitles = []` initializer and the final return
        # are not visible in this extract.
    def _print_formats(self, formats):
        """Print each itag with its extension and dimension label."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` header is not visible here.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set language, log in (options or .netrc) and confirm age.

        NOTE(review): many header lines (try:/if:/else:) are missing from
        this extract; the indentation below mirrors the original.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX/dsh form tokens from the login page.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

                # Entries of the login form dict; its opener is not visible.
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age (entry of the age_form dict; opener not visible).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Return the video id (match group 2 of _VALID_URL) for *url*."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        # NOTE(review): the `return video_id` line is not visible here.
    def _real_extract(self, url):
        """Extract info for a single YouTube watch URL.

        NOTE(review): this extract is missing many interleaved header lines
        (try:/else:/if:); the indentation below mirrors the original.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the \/ sequences in the embedded player URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then parse.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # Fallback: the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    # We try with the automatic captions
                    video_subtitles = self._request_automatic_caption(video_id, video_webpage)
                    (sub_error_auto, sub_lang, sub) = video_subtitles[0]
                        # We report the original error
                        self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                # Result-dict entries; the surrounding append({...}) lines
                # are not visible in this extract.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages fetched during initialization (see _real_initialize).
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
778 def report_disclaimer(self):
779 """Report disclaimer retrieval."""
780 self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        """Fetch the disclaimer page and disable the family filter.

        NOTE(review): the try:/dict-opener lines are missing from this
        extract; indentation mirrors the original.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (entry of the disclaimer_form dict; opener not visible).
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract a Metacafe video (delegating yt- ids to YouTube).

        NOTE(review): several if/else/try header lines are missing from this
        extract; indentation mirrors the original.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback path: parse the flashvars query string instead.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            # Result-dict entries; the surrounding return [{...}] lines are
            # not visible in this extract.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def _real_extract(self, url):
        """Extract a Dailymotion video from its flashvars blob.

        NOTE(review): several if/else header lines are missing from this
        extract; indentation mirrors the original.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # The family filter cookie would hide age-restricted content.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality present in flashvars (keys ordered best-first).
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # DD-MM-YYYY on the page -> YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            # Result-dict entries; the surrounding return [{...}] lines are
            # not visible in this extract.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'
    def _real_extract(self, url):
        """Extract a Photobucket video, preferring the inline JSON metadata.

        NOTE(review): several if/try header lines are missing from this
        extract; indentation mirrors the original.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
            info = json.loads(mobj.group('json'))
                # Result-dict entries; the surrounding return [{...}] lines
                # are not visible in this extract.
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

            # Result-dict entries; the surrounding return [{...}] lines are
            # not visible in this extract.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    Two extraction paths: when the page exposes a YUI CONTENT_ID a YQL
    JSON query is used; otherwise the legacy cosmos.bcst.yahoo.com MRSS
    REST endpoints are scraped.
    """
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # VERBOSE regex: literal spaces are escaped, layout whitespace ignored.
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Guard before dereferencing the match (a None m_rest would
            # otherwise raise AttributeError instead of a clear error).
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is JSONP; strip the callback wrapper before parsing.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title':video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # Player links and vimeopro links must be rewritten to the
        # canonical video page to reach the config JSON.
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Distinguish embed-restricted videos from plain parse failures.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket; for/else raises when
        # none of the known codecs is present at all.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title':       video_title,
            'ext':         video_extension,
            'thumbnail':   video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both the live-stream pages (index-NN.html) and the regular
    "+7" catch-up pages, walking a chain of intermediate XML/JS pages
    via grep_webpage().
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from
        *matchTuples*: a list of (group_index, key, error_message)."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # NOTE(review): builds video_url but never returns it — the live
        # path appears unfinished; confirm against the dispatcher below.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the +7 page chain down to the final <video> XML and
        return an info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':          info.get('id'),
            'url':         compat_urllib_parse.unquote(info.get('url')),
            'uploader':    u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title':       info.get('title').decode('utf-8'),
            'ext':         u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API 50 results at a time; 'limit' is
        # tightened once the API reports the true total.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # Scrape result pages until we have n entries or the "next
        # page" marker disappears.
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        # The endpoint returns JSON with 30 HTML result snippets per page.
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                break

        return res
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # VERBOSE regex: matches playlist/course/artist/watch URLs carrying a
    # p=/a=/list= parameter, or a bare PL/EC/UU playlist id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Override needed because _VALID_URL requires re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so results can be re-sorted below.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the list of unique video ids linked from *page*."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the "load more" widget no longer offers a next page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':          file_id.decode('utf-8'),
            'url':         file_url.decode('utf-8'),
            'uploader':    None,
            'upload_date': None,
            'title':       file_title,
            'ext':         file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in, using --username/--password or .netrc.

        Login failures only emit warnings; extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are JSON wedged between these two JS snippets.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
# Extractor for blip.tv URLs (direct pages, /play/ embeds and api.swf# refs).
# NOTE(review): this chunk appears to have lines elided (several `if ... is
# None:` guards, `try:` headers and `return` statements are missing) -- confirm
# against the full file before modifying any logic here.
1924 class BlipTVIE(InfoExtractor):
1925 """Information extractor for blip.tv"""
# Matches ordinary page URLs as well as /play/ and api.swf# embed forms.
1927 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Captures the filename extension from a media URL.
1928 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1929 IE_NAME = u'blip.tv'
1931 def report_direct_download(self, title):
1932 """Report information extraction."""
1933 self.to_screen(u'%s: Direct download detected' % title)
1935 def _real_extract(self, url):
1936 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match _VALID_URL (guard line elided above).
1938 raise ExtractorError(u'Invalid URL: %s' % url)
1940 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf# references are rewritten to an equivalent /play/ URL first.
1941 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1942 if api_mobj is not None:
1943 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1944 urlp = compat_urllib_parse_urlparse(url)
1945 if urlp.path.startswith('/play/'):
# /play/ URLs redirect; the file id is recovered from the redirect target's
# fragment and the extractor re-enters itself with the canonical URL.
1946 request = compat_urllib_request.Request(url)
1947 response = compat_urllib_request.urlopen(request)
1948 redirecturl = response.geturl()
1949 rurlp = compat_urllib_parse_urlparse(redirecturl)
1950 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1951 url = 'http://blip.tv/a/a-' + file_id
1952 return self._real_extract(url)
# Query the blip.tv JSON API for metadata (cchar computed in elided lines).
1959 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1960 request = compat_urllib_request.Request(json_url)
# The API serves different data depending on User-Agent; iTunes is required.
1961 request.add_header('User-Agent', 'iTunes/10.6.1')
1962 self.report_extraction(mobj.group(1))
1965 urlh = compat_urllib_request.urlopen(request)
1966 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The URL itself is the media file: derive title/ext from the basename.
1967 basename = url.split('/')[-1]
1968 title,ext = os.path.splitext(basename)
1969 title = title.decode('UTF-8')
1970 ext = ext.replace('.', '')
1971 self.report_direct_download(title)
1976 'upload_date': None,
1981 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1982 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
1983 if info is None: # Regular URL
# Parse the JSON body fetched above into an info dict.
1985 json_code_bytes = urlh.read()
1986 json_code = json_code_bytes.decode('utf-8')
1987 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1988 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
1991 json_data = json.loads(json_code)
# Some API responses wrap the payload in a 'Post' key.
1992 if 'Post' in json_data:
1993 data = json_data['Post']
# datestamp is e.g. '05-31-13 10:22AM'; normalized to YYYYMMDD.
1997 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1998 video_url = data['media']['url']
1999 umobj = re.match(self._URL_EXT, video_url)
2001 raise ValueError('Can not determine filename extension')
2002 ext = umobj.group(1)
2005 'id': data['item_id'],
2007 'uploader': data['display_name'],
2008 'upload_date': upload_date,
2009 'title': data['title'],
2011 'format': data['media']['mimeType'],
2012 'thumbnail': data['thumbnailUrl'],
2013 'description': data['description'],
2014 'player_url': data['embedUrl'],
# Recorded so the downloader uses the same UA the API required.
2015 'user_agent': 'iTunes/10.6.1',
2017 except (ValueError,KeyError) as err:
2018 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# Extractor for myvideo.de.  The site serves an RC4-encrypted XML blob whose
# key is derived from a base64-wrapped constant (GK) plus the video id.
# NOTE(review): lines appear elided in this chunk (loop headers, `try:` lines,
# some assignments such as `x = 0` / `y = 0` / `out = ''`) -- verify against
# the full file before changing the crypto or parsing logic.
2023 class MyVideoIE(InfoExtractor):
2024 """Information Extractor for myvideo.de."""
2026 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2027 IE_NAME = u'myvideo'
2029 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2030 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2031 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher: key-scheduling below, then the PRGA XOR loop.
2032 def __rc4crypt(self,data, key):
2034 box = list(range(256))
2035 for i in list(range(256)):
2036 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2037 box[i], box[x] = box[x], box[i]
# PRGA: generate keystream bytes and XOR them against the ciphertext.
2043 y = (y + box[x]) % 256
2044 box[x], box[y] = box[y], box[x]
2045 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# md5 helper used while deriving the decryption key (def line elided).
2049 return hashlib.md5(s).hexdigest().encode()
2051 def _real_extract(self,url):
2052 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2054 raise ExtractorError(u'invalid URL: %s' % url)
2056 video_id = mobj.group(1)
# GK: base64-wrapped constant used to derive the RC4 key (assignment elided).
2059 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2060 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2061 b'TnpsbA0KTVRkbU1tSTRNdz09'
2065 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2066 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: some pages expose a plain <source> element -> flv URL directly.
2068 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2069 if mobj is not None:
2070 self.report_extraction(video_id)
2071 video_url = mobj.group(1) + '.flv'
2073 video_title = self._search_regex('<title>([^<]+)</title>',
2076 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2082 'upload_date': None,
2083 'title': video_title,
# Slow path: parse flashvars and fetch the encrypted player XML.
2088 mobj = re.search('var flashvars={(.+?)}', webpage)
2090 raise ExtractorError(u'Unable to extract video')
# Collect key/value flashvars; '_encxml' holds the encrypted-XML endpoint.
2095 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2096 if not a == '_encxml':
2099 encxml = compat_urllib_parse.unquote(b)
2100 if not params.get('domain'):
2101 params['domain'] = 'www.myvideo.de'
2102 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV-branded player endpoint is avoided in favor of the generic one.
2103 if 'flash_playertype=MTV' in xmldata_url:
2104 self._downloader.report_warning(u'avoiding MTV player')
2106 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2107 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response is 'something=<hex ciphertext>'; take the hex part and decrypt.
2111 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2112 enc_data_b = binascii.unhexlify(enc_data)
# RC4 key material: double-base64-decoded GK concatenated with the video id
# (the md5/key assignment lines around here appear elided).
2114 base64.b64decode(base64.b64decode(GK)) +
2116 str(video_id).encode('utf-8')
2119 dec_data = self.__rc4crypt(enc_data_b, sk)
2122 self.report_extraction(video_id)
# rtmp case: connectionurl present in the decrypted data.
2125 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2127 video_url = compat_urllib_parse.unquote(mobj.group(1))
2128 if 'myvideo2flash' in video_url:
2129 self._downloader.report_warning(u'forcing RTMPT ...')
2130 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2133 # extract non rtmp videos
2134 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2136 raise ExtractorError(u'unable to extract url')
2137 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2139 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2140 video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m files use rtmp play paths 'prefix:path'; f4m gets an HLS playlist.
2142 if not video_file.endswith('f4m'):
2143 ppath, prefix = video_file.split('.')
2144 video_playpath = '%s:%s' % (prefix, ppath)
2145 video_hls_playlist = ''
2148 video_hls_playlist = (
2149 video_filepath + video_file
2150 ).replace('.f4m', '.m3u8')
2152 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2153 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2155 video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2161 'tc_url': video_url,
2163 'upload_date': None,
2164 'title': video_title,
2166 'play_path': video_playpath,
2167 'video_file': video_file,
2168 'video_hls_playlist': video_hls_playlist,
2169 'player_url': video_swfobj,
# Extractor for The Daily Show / The Colbert Report.  Resolves shortnames and
# show pages to an MTVN media URI, downloads an MRSS index, then one config
# XML per part, and picks an RTMP rendition which is rewritten to an HTTP URL.
# NOTE(review): several lines (guards, try/except headers, dict openers,
# loop bodies) appear elided in this chunk -- confirm against the full file.
2173 class ComedyCentralIE(InfoExtractor):
2174 """Information extractor for The Daily Show and Colbert Report """
2176 # urls can be abbreviations like :thedailyshow or :colbert
2177 # urls for episodes like:
2178 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2179 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2180 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: note suitable()/_real_extract must pass re.VERBOSE.
2181 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2182 |(https?://)?(www\.)?
2183 (?P<showname>thedailyshow|colbertnation)\.com/
2184 (full-episodes/(?P<episode>.*)|
2186 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2187 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates offered by the site, lowest-quality last-resort first omitted here.
2190 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2192 _video_extensions = {
2200 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the re.VERBOSE flag.
2210 def suitable(cls, url):
2211 """Receives a URL and returns True if suitable for this IE."""
2212 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2214 def _print_formats(self, formats):
2215 print('Available formats:')
2217 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2220 def _real_extract(self, url):
2221 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2223 raise ExtractorError(u'Invalid URL: %s' % url)
# ':tds' / ':colbert' style shortnames map to the newest full episode page.
2225 if mobj.group('shortname'):
2226 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2227 url = u'http://www.thedailyshow.com/full-episodes/'
2229 url = u'http://www.colbertnation.com/full-episodes/'
2230 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2231 assert mobj is not None
2233 if mobj.group('clip'):
2234 if mobj.group('showname') == 'thedailyshow':
2235 epTitle = mobj.group('tdstitle')
2237 epTitle = mobj.group('cntitle')
2240 dlNewest = not mobj.group('episode')
2242 epTitle = mobj.group('showname')
2244 epTitle = mobj.group('episode')
2246 self.report_extraction(epTitle)
2247 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Follow redirects: the episode list page redirects to a concrete episode.
2249 url = htmlHandle.geturl()
2250 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2252 raise ExtractorError(u'Invalid redirected URL: ' + url)
2253 if mobj.group('episode') == '':
2254 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2255 epTitle = mobj.group('episode')
2257 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2259 if len(mMovieParams) == 0:
2260 # The Colbert Report embeds the information in a without
2261 # a URL prefix; so extract the alternate reference
2262 # and then add the URL prefix manually.
2264 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2265 if len(altMovieParams) == 0:
2266 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2268 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Download the MRSS feed listing the episode's parts (one <item> per part).
2270 uri = mMovieParams[0][1]
2271 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2272 indexXml = self._download_webpage(indexUrl, epTitle,
2273 u'Downloading show index',
2274 u'unable to download episode index')
2278 idoc = xml.etree.ElementTree.fromstring(indexXml)
2279 itemEls = idoc.findall('.//item')
2280 for partNum,itemEl in enumerate(itemEls):
# guid looks like '...:<showId>.com:...:<shortMediaId>'.
2281 mediaId = itemEl.findall('./guid')[0].text
2282 shortMediaId = mediaId.split(':')[-1]
2283 showId = mediaId.split(':')[-2].replace('.com', '')
2284 officialTitle = itemEl.findall('./title')[0].text
2285 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2287 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2288 compat_urllib_parse.urlencode({'uri': mediaId}))
2289 configXml = self._download_webpage(configUrl, epTitle,
2290 u'Downloading configuration for %s' % shortMediaId)
2292 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) tuples into turls (list init elided).
2294 for rendition in cdoc.findall('.//rendition'):
2295 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2299 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2302 if self._downloader.params.get('listformats', None):
2303 self._print_formats([i[0] for i in turls])
2306 # For now, just pick the highest bitrate
2307 format,rtmp_video_url = turls[-1]
2309 # Get the format arg from the arg stream
2310 req_format = self._downloader.params.get('format', None)
2312 # Select format if we can find one
2315 format, rtmp_video_url = f, v
# Rewrite the rtmp URL to a direct HTTP URL on the llnwd.net CDN.
2318 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2320 raise ExtractorError(u'Cannot transform RTMP url')
2321 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2322 video_url = base + m.group('finalid')
2324 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2329 'upload_date': officialDate,
2334 'description': officialTitle,
2336 results.append(info)
# Extractor for escapistmagazine.com videos: scrapes meta tags from the page,
# follows the og:video player URL to a JSON-ish config, and reads the media
# URL from the config's playlist.
2341 class EscapistIE(InfoExtractor):
2342 """Information extractor for The Escapist """
2344 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2345 IE_NAME = u'escapist'
2347 def _real_extract(self, url):
2348 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2350 raise ExtractorError(u'Invalid URL: %s' % url)
2351 showName = mobj.group('showname')
2352 videoId = mobj.group('episode')
2354 self.report_extraction(showName)
2355 webpage = self._download_webpage(url, showName)
# Description and thumbnail are optional (fatal=False).
2357 videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
2358 webpage, u'description', fatal=False)
2359 if videoDesc: videoDesc = unescapeHTML(videoDesc)
2361 imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
2362 webpage, u'thumbnail', fatal=False)
2363 if imgUrl: imgUrl = unescapeHTML(imgUrl)
# The og:video URL carries a 'config=' query param pointing at the config.
2365 playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
2366 webpage, u'player url')
2367 playerUrl = unescapeHTML(playerUrl)
2369 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2370 configUrl = compat_urllib_parse.unquote(configUrl)
2372 configJSON = self._download_webpage(configUrl, showName,
2373 u'Downloading configuration',
2374 u'unable to download configuration')
2376 # Technically, it's JavaScript, not JSON
# Crude JS->JSON fixup: swap single quotes for double quotes before parsing.
2377 configJSON = configJSON.replace("'", '"')
2380 config = json.loads(configJSON)
2381 except (ValueError,) as err:
2382 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2384 playlist = config['playlist']
# Playlist entry [1] holds the actual video; [0] is presumably a preroll --
# TODO(review): confirm against the full file.
2385 videoUrl = playlist[1]['url']
2390 'uploader': showName,
2391 'upload_date': None,
2394 'thumbnail': imgUrl,
2395 'description': videoDesc,
2396 'player_url': playerUrl,
# Extractor for collegehumor.com: fetches a metadata XML (title, thumbnail,
# manifest URL), then the Adobe f4m manifest, and builds the final segment URL.
2401 class CollegeHumorIE(InfoExtractor):
2402 """Information extractor for collegehumor.com"""
2405 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2406 IE_NAME = u'collegehumor'
2408 def report_manifest(self, video_id):
2409 """Report information extraction."""
2410 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2412 def _real_extract(self, url):
2413 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2415 raise ExtractorError(u'Invalid URL: %s' % url)
2416 video_id = mobj.group('videoid')
# info dict is initialized around here (opener lines elided).
2421 'upload_date': None,
2424 self.report_extraction(video_id)
2425 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2427 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2428 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2429 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2431 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on missing nodes; caught as invalid XML.
2433 videoNode = mdoc.findall('./video')[0]
2434 info['description'] = videoNode.findall('./description')[0].text
2435 info['title'] = videoNode.findall('./caption')[0].text
2436 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2437 manifest_url = videoNode.findall('./file')[0].text
2439 raise ExtractorError(u'Invalid metadata XML file')
# hdcore param required for the Adobe HTTP Dynamic Streaming manifest.
2441 manifest_url += '?hdcore=2.10.3'
2442 self.report_manifest(video_id)
2444 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2445 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2446 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2448 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m namespace-qualified lookups; media/@url and id feed the segment URL.
2450 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2451 node_id = media_node.attrib['url']
2452 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2453 except IndexError as err:
2454 raise ExtractorError(u'Invalid manifest file')
2456 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Synthesize the first-fragment URL ('/z.../Seg1-Frag1') from manifest parts.
2457 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: simple regex scrape of flv_url, title and
# thumbnail from the watch page.
2464 class XVideosIE(InfoExtractor):
2465 """Information extractor for xvideos.com"""
2467 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2468 IE_NAME = u'xvideos'
2470 def _real_extract(self, url):
2471 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2473 raise ExtractorError(u'Invalid URL: %s' % url)
2474 video_id = mobj.group(1)
2476 webpage = self._download_webpage(url, video_id)
2478 self.report_extraction(video_id)
# The media URL is URL-encoded inside a 'flv_url=' flashvar.
2481 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2482 webpage, u'video URL'))
2485 video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
2488 # Extract video thumbnail
# Captures only the thumbnail basename; fatal=False makes it optional.
2489 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2490 webpage, u'thumbnail', fatal=False)
2496 'upload_date': None,
2497 'title': video_title,
2499 'thumbnail': video_thumbnail,
2500 'description': None,
# Extractor for a single soundcloud.com track: resolves the page URL to a
# track id via the public API, then fetches stream URLs for that id.
2506 class SoundcloudIE(InfoExtractor):
2507 """Information extractor for soundcloud.com
2508 To access the media, the uid of the song and a stream token
2509 must be extracted from the page source and the script must make
2510 a request to media.soundcloud.com/crossdomain.xml. Then
2511 the media can be grabbed by requesting from an url composed
2512 of the stream token and uid
# group(1) = uploader slug, group(2) = track slug.
2515 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2516 IE_NAME = u'soundcloud'
2518 def report_resolve(self, video_id):
2519 """Report information extraction."""
2520 self.to_screen(u'%s: Resolving id' % video_id)
2522 def _real_extract(self, url):
2523 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2525 raise ExtractorError(u'Invalid URL: %s' % url)
2527 # extract uploader (which is in the url)
2528 uploader = mobj.group(1)
2529 # extract simple title (uploader + slug of song title)
2530 slug_title = mobj.group(2)
2531 simple_title = uploader + u'-' + slug_title
2532 full_title = '%s/%s' % (uploader, slug_title)
2534 self.report_resolve(full_title)
# resolve.json maps a canonical page URL to the track's API metadata.
2536 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2537 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2538 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2540 info = json.loads(info_json)
2541 video_id = info['id']
2542 self.report_extraction(full_title)
# The streams endpoint returns concrete media URLs keyed by format.
2544 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2545 stream_json = self._download_webpage(streams_url, full_title,
2546 u'Downloading stream definitions',
2547 u'unable to download stream definitions')
2549 streams = json.loads(stream_json)
# Always picks the 128kbps mp3 HTTP stream.
2550 mediaURL = streams['http_mp3_128_url']
2551 upload_date = unified_strdate(info['created_at'])
2556 'uploader': info['user']['username'],
2557 'upload_date': upload_date,
2558 'title': info['title'],
2560 'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set URL via the
# API, then fetches a stream URL for each track in the set.
2563 class SoundcloudSetIE(InfoExtractor):
2564 """Information extractor for soundcloud.com sets
2565 To access the media, the uid of the song and a stream token
2566 must be extracted from the page source and the script must make
2567 a request to media.soundcloud.com/crossdomain.xml. Then
2568 the media can be grabbed by requesting from an url composed
2569 of the stream token and uid
# group(1) = uploader slug, group(2) = set slug.
2572 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2573 IE_NAME = u'soundcloud:set'
2575 def report_resolve(self, video_id):
2576 """Report information extraction."""
2577 self.to_screen(u'%s: Resolving id' % video_id)
2579 def _real_extract(self, url):
2580 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2582 raise ExtractorError(u'Invalid URL: %s' % url)
2584 # extract uploader (which is in the url)
2585 uploader = mobj.group(1)
2586 # extract simple title (uploader + slug of song title)
2587 slug_title = mobj.group(2)
2588 simple_title = uploader + u'-' + slug_title
2589 full_title = '%s/sets/%s' % (uploader, slug_title)
2591 self.report_resolve(full_title)
2593 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2594 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2595 info_json = self._download_webpage(resolv_url, full_title)
2598 info = json.loads(info_json)
# API errors are reported per entry; the early return is elided here.
2599 if 'errors' in info:
2600 for err in info['errors']:
2601 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2604 self.report_extraction(full_title)
# One info dict per track; results accumulation lines appear elided.
2605 for track in info['tracks']:
2606 video_id = track['id']
2608 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2609 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2611 self.report_extraction(video_id)
2612 streams = json.loads(stream_json)
2613 mediaURL = streams['http_mp3_128_url']
2618 'uploader': track['user']['username'],
2619 'upload_date': unified_strdate(track['created_at']),
2620 'title': track['title'],
2622 'description': track['description'],
# Extractor for infoq.com: the real media id is base64-encoded in a
# 'jsclassref' JS variable; decoding it yields the rtmpe path.
2627 class InfoQIE(InfoExtractor):
2628 """Information extractor for infoq.com"""
2629 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2631 def _real_extract(self, url):
2632 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2634 raise ExtractorError(u'Invalid URL: %s' % url)
# No numeric id in the URL, so the URL itself serves as the video id.
2636 webpage = self._download_webpage(url, video_id=url)
2637 self.report_extraction(url)
2640 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2642 raise ExtractorError(u'Unable to extract video url')
# base64 -> percent-decoded path appended to the rtmpe base.
2643 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2644 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2647 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2650 # Extract description
2651 video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2652 webpage, u'description', fatal=False)
# Derive id and extension from the media filename.
2654 video_filename = video_url.split('/')[-1]
2655 video_id, extension = video_filename.split('.')
2661 'upload_date': None,
2662 'title': video_title,
2663 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2665 'description': video_description,
# Extractor for mixcloud.com (marked broken via _WORKING = False): queries the
# legacy /api/1/cloudcast JSON, then probes candidate media URLs until one
# responds.
# NOTE(review): `try:` headers and some returns are elided in this chunk;
# also the .decode('utf-8') calls on str values would fail on Python 3 --
# consistent with the extractor being disabled.  Verify before re-enabling.
2670 class MixcloudIE(InfoExtractor):
2671 """Information extractor for www.mixcloud.com"""
2673 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2674 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2675 IE_NAME = u'mixcloud'
2677 def report_download_json(self, file_id):
2678 """Report JSON download."""
2679 self.to_screen(u'Downloading json')
2681 def get_urls(self, jsonData, fmt, bitrate='best'):
2682 """Get urls from 'audio_formats' section in json"""
# 'best' (or an unknown bitrate) falls back to the highest available one.
2685 bitrate_list = jsonData[fmt]
2686 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2687 bitrate = max(bitrate_list) # select highest
2689 url_list = jsonData[fmt][bitrate]
# Some formats carry a flat URL list with no per-bitrate nesting.
2690 except TypeError: # we have no bitrate info.
2691 url_list = jsonData[fmt]
2694 def check_urls(self, url_list):
2695 """Returns 1st active url from list"""
# Probe each URL; network errors skip to the next candidate.
2696 for url in url_list:
2698 compat_urllib_request.urlopen(url)
2700 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2705 def _print_formats(self, formats):
2706 print('Available formats:')
2707 for fmt in formats.keys():
2708 for b in formats[fmt]:
2710 ext = formats[fmt][b][0]
2711 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2712 except TypeError: # we have no bitrate info
2713 ext = formats[fmt][0]
2714 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2717 def _real_extract(self, url):
2718 mobj = re.match(self._VALID_URL, url)
2720 raise ExtractorError(u'Invalid URL: %s' % url)
2721 # extract uploader & filename from url
# NOTE(review): .decode on a str breaks on Python 3 -- extractor is disabled.
2722 uploader = mobj.group(1).decode('utf-8')
2723 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2725 # construct API request
2726 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2727 # retrieve .json file with links to files
2728 request = compat_urllib_request.Request(file_url)
2730 self.report_download_json(file_url)
2731 jsonData = compat_urllib_request.urlopen(request).read()
2732 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2733 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2736 json_data = json.loads(jsonData)
2737 player_url = json_data['player_swf_url']
2738 formats = dict(json_data['audio_formats'])
2740 req_format = self._downloader.params.get('format', None)
2743 if self._downloader.params.get('listformats', None):
2744 self._print_formats(formats)
# With no requested format, scan all formats for the first live URL.
2747 if req_format is None or req_format == 'best':
2748 for format_param in formats.keys():
2749 url_list = self.get_urls(formats, format_param)
2751 file_url = self.check_urls(url_list)
2752 if file_url is not None:
2755 if req_format not in formats:
2756 raise ExtractorError(u'Format is not available')
2758 url_list = self.get_urls(formats, req_format)
2759 file_url = self.check_urls(url_list)
2760 format_param = req_format
2763 'id': file_id.decode('utf-8'),
2764 'url': file_url.decode('utf-8'),
2765 'uploader': uploader.decode('utf-8'),
2766 'upload_date': None,
2767 'title': json_data['name'],
2768 'ext': file_url.split('.')[-1].decode('utf-8'),
2769 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2770 'thumbnail': json_data['thumbnail_url'],
2771 'description': json_data['description'],
2772 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open Classroom.  Three cases depending on the URL:
# a specific video (course+video), a course page (list of video references),
# or the site root (list of course references).  List cases return playlist
# entries that are re-extracted via self.extract().
2775 class StanfordOpenClassroomIE(InfoExtractor):
2776 """Information extractor for Stanford's Open ClassRoom"""
2778 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2779 IE_NAME = u'stanfordoc'
2781 def _real_extract(self, url):
2782 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2784 raise ExtractorError(u'Invalid URL: %s' % url)
2786 if mobj.group('course') and mobj.group('video'): # A specific video
2787 course = mobj.group('course')
2788 video = mobj.group('video')
2790 'id': course + '_' + video,
2792 'upload_date': None,
2795 self.report_extraction(info['id'])
# Per-video metadata XML lives next to the course's videos directory.
2796 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2797 xmlUrl = baseUrl + video + '.xml'
2799 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2800 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2801 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2802 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2804 info['title'] = mdoc.findall('./title')[0].text
2805 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2807 raise ExtractorError(u'Invalid metadata XML file')
2808 info['ext'] = info['url'].rpartition('.')[2]
2810 elif mobj.group('course'): # A course page
2811 course = mobj.group('course')
2816 'upload_date': None,
2819 coursepage = self._download_webpage(url, info['id'],
2820 note='Downloading course info page',
2821 errnote='Unable to download course info page')
2823 # TODO: implement default_value in search_regex
2824 m = re.search('<h1>([^<]+)</h1>', coursepage)
2826 info['title'] = unescapeHTML(m.group(1))
2828 info['title'] = info['id']
2830 info['description'] = self._search_regex('<description>([^<]+)</description>',
2831 coursepage, u'description', fatal=False)
2832 if info['description']: info['description'] = unescapeHTML(info['description'])
# Build playlist references to each VideoPage link found on the course page.
2834 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2837 'type': 'reference',
2838 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract each referenced video into the results list.
2842 for entry in info['list']:
2843 assert entry['type'] == 'reference'
2844 results += self.extract(entry['url'])
2848 'id': 'Stanford OpenClassroom',
2851 'upload_date': None,
2854 self.report_download_webpage(info['id'])
# Root page case: enumerate all CoursePage links and recurse into each.
2855 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2857 rootpage = compat_urllib_request.urlopen(rootURL).read()
2858 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2859 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2861 info['title'] = info['id']
2863 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2866 'type': 'reference',
2867 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2872 for entry in info['list']:
2873 assert entry['type'] == 'reference'
2874 results += self.extract(entry['url'])
# Extractor for MTV.com video pages: scrapes meta tags for song/artist and
# the mtvn_uri/content id, then downloads a mediaGen XML listing renditions.
2877 class MTVIE(InfoExtractor):
2878 """Information extractor for MTV.com"""
2880 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2883 def _real_extract(self, url):
2884 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2886 raise ExtractorError(u'Invalid URL: %s' % url)
# Normalize scheme-less URLs before downloading.
2887 if not mobj.group('proto'):
2888 url = 'http://' + url
2889 video_id = mobj.group('videoid')
2891 webpage = self._download_webpage(url, video_id)
2893 song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2894 webpage, u'song name', fatal=False)
2895 if song_name: song_name = unescapeHTML(song_name)
2897 video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2899 video_title = unescapeHTML(video_title)
2901 mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2902 webpage, u'mtvn_uri', fatal=False)
2904 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2905 webpage, u'content id', fatal=False)
# mediaGen XML enumerates available renditions for this uri/content id.
2907 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2908 self.report_extraction(video_id)
2909 request = compat_urllib_request.Request(videogen_url)
2911 metadataXml = compat_urllib_request.urlopen(request).read()
2912 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2913 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2915 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2916 renditions = mdoc.findall('.//rendition')
2918 # For now, always pick the highest quality.
# Renditions are presumed ordered worst-to-best -- TODO(review): confirm.
2919 rendition = renditions[-1]
# type looks like 'video/mp4'; format string encodes ext, size and bitrate.
2922 _,_,ext = rendition.attrib['type'].partition('/')
2923 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2924 video_url = rendition.find('./src').text
2926 raise ExtractorError('Invalid rendition field.')
2931 'uploader': performer,
2932 'upload_date': None,
2933 'title': video_title,
# Extractor for v.youku.com.  Fetches a JSON playlist, de-obfuscates the
# file id with a seeded character-mix table, and emits one info dict per
# video segment.
# NOTE(review): method headers (_gen_sid, parts of loops) are elided in this
# chunk -- confirm structure against the full file.
2941 class YoukuIE(InfoExtractor):
2942 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two random numbers (def line elided).
2945 nowTime = int(time.time() * 1000)
2946 random1 = random.randint(1000,1998)
2947 random2 = random.randint(1000,9999)
2949 return "%d%d%d" %(nowTime,random1,random2)
2951 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the source alphabet driven by the given seed;
# each step draws one character and removes it from the pool.
2953 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2955 for i in range(len(source)):
2956 seed = (seed * 211 + 30031 ) % 65536
2957 index = math.floor(seed / 65536 * len(source) )
2958 mixed.append(source[int(index)])
2959 source.remove(source[int(index)])
2960 #return ''.join(mixed)
2963 def _get_file_id(self, fileId, seed):
# The obfuscated fileId is '*'-separated indexes into the mixed alphabet.
2964 mixed = self._get_file_ID_mix_string(seed)
2965 ids = fileId.split('*')
2969 realId.append(mixed[int(ch)])
2970 return ''.join(realId)
2972 def _real_extract(self, url):
2973 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided above).
2975 raise ExtractorError(u'Invalid URL: %s' % url)
2976 video_id = mobj.group('ID')
2978 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2980 jsondata = self._download_webpage(info_url, video_id)
2982 self.report_extraction(video_id)
2984 config = json.loads(jsondata)
2986 video_title = config['data'][0]['title']
2987 seed = config['data'][0]['seed']
# Choose a stream format: 'best' prefers hd2 when available.
2989 format = self._downloader.params.get('format', None)
2990 supported_format = list(config['data'][0]['streamfileids'].keys())
2992 if format is None or format == 'best':
2993 if 'hd2' in supported_format:
2998 elif format == 'worst':
3006 fileid = config['data'][0]['streamfileids'][format]
3007 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3008 except (UnicodeDecodeError, ValueError, KeyError):
3009 raise ExtractorError(u'Unable to extract info section')
3012 sid = self._gen_sid()
3013 fileid = self._get_file_id(fileid, seed)
3015 #column 8,9 of fileid represent the segment number
3016 #fileid[7:9] should be changed
# One download URL per segment; the segment index is spliced into the
# fileid (hex, positions 8-9) and paired with that segment's key.
3017 for index, key in enumerate(keys):
3019 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3020 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3023 'id': '%s_part%02d' % (video_id, index),
3024 'url': download_url,
3026 'upload_date': None,
3027 'title': video_title,
3030 files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com."""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded percent-encoded in the player parameters.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        # TODO: implement default_value in search_regex
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The downloadable mp4 lives on a CDN at a path derived from the
    page URL, so only metadata is scraped from the page itself.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # TODO: implement default_value in search_regex
        def _findProp(rexp, default=None):
            # Scrape a single property from the page; fall back to `default`.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Field renamed from 'uploader_date': the documented optional
            # field for extractors is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one JSON page of archives and convert it to info dicts."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Entries without a file URL are unplayable; skip them.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole-channel archives are paginated.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                       note='Downloading chapter metadata',
                                       errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)
        video_url = unescapeHTML(video_url)

        # TODO: implement fallbacks in regex_search
        # Prefer the player headline; fall back to the page <title>.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', flags=re.DOTALL)
        if video_description: video_description = unescapeHTML(video_description)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Go through the age gate with a fixed birthdate to reach the videos.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The flv lives on the CDN at a path derived from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        if uploader: uploader = unescapeHTML(uploader.strip())

        thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and the candy subsite)."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Pick the extension from the file URL itself.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON assignment in a script tag.
        json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
            webpage, u'json data')

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the info dict matching req_format, or None if absent."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
            webpage, u'title').strip()

        # Get the video date
        upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
            webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date.strip())

        # Get the video uploader
        video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
            webpage, u'uploader', fatal=False)
        if video_uploader: video_uploader = clean_html(video_uploader.strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes resolution and bitrate, e.g. 480p_370k.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None,
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are listed best-first on the page.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page; the actual file URL only appears there.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The API only hands out one track at a time; iterate until it
        # reports the last track of the set.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media files live on the CDN at paths derived from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)
        if uploader: uploader = clean_html(uploader)

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for www.ted.com talks and playlists."""

    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for www.myspass.de, driven by its XML metadata API."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = 'mp4'
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos (flash XML playlist)."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> node carries the highest-quality variant.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        # Strip the site-name prefix LiveLeak puts into og:title.
        video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()

        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        if video_description: video_description = unescapeHTML(video_description)

        video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD/DasErste Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer an explicit documentId query parameter over the path slug.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # Pages without streams carry an "fsk" (age-restriction) marker.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on *.tumblr.com blogs."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)
        video_title = unescapeHTML(video_title)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for freely downloadable Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Follow the free-download page and resolve the final mp3-320 URL."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id` so as not to shadow the builtin id().
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Pull the direct mp4 URL and the title out of the video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The player embeds a single <source> tag with the mp4 location.
        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Fetch the player's MRSS notice and read the direct mp4 URL from it."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # The player's MRSS feed carries the direct media URL.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots in the host name are escaped so the pattern only matches the
        # literal "mp4.ina.fr" host, not arbitrary characters.
        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract the mobile mp4 stream plus title/description/thumbnail."""
        m = re.match(self._VALID_URL, url)

        video_id = m.group('id')
        # Always fetch the canonical page for the id we matched.
        page = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Read the stream URL and metadata from the page's meta tags."""
        m = re.match(self._VALID_URL, url)

        video_id = m.group('id')
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve Flickr's two-step playlist handshake to a stream URL."""
        m = re.match(self._VALID_URL, url)

        video_id = m.group('id')
        video_uploader_id = m.group('uploader_id')
        page_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        page = self._download_webpage(page_url, video_id)

        # The per-video secret is required by both playlist endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The playable URL is the app prefix joined with the unescaped path.
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Find the numeric id on the page, then read the high-quality file
        URL from the site's CVP XML feed."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = m.group('url_title')
        page = self._download_webpage(url, url_title)

        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')

        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The "www." dot is escaped so it matches only a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL, title, description, upload date, uploader
        and thumbnail from the xHamster page."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server given: the 'file' field is a percent-encoded full URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Description is optional: fall back to an empty string.
        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
        if mobj is None:
            video_description = u''
        else:
            video_description = unescapeHTML(mobj.group('description'))

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract upload date')
        # Reassemble into the YYYYMMDD form expected downstream.
        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')

        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
        if mobj is None:
            video_uploader_id = u'anonymous'
        else:
            video_uploader_id = mobj.group('uploader_id')

        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail URL')
        video_thumbnail = mobj.group('thumbnail')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Fetch the track page, parse the embedded JSON track list, then ask
        the serve endpoint (with the session cookie) for the stream URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Cache-busting query parameters expected by the site.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie must be replayed on the serve request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)
        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
        if mobj is None:
            # Fixed typo: "extrack" -> "extract"
            raise ExtractorError(u'Unable to extract tracks')
        html_tracks = mobj.group(1).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the extractor registry appears truncated in this view;
    # only a few of the instantiated IE classes are visible below. Ordering
    # is significant — more specific extractors must come before generic ones.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up the info-extractor class whose name is ``ie_name`` + 'IE'.

    E.g. ``get_info_extractor('Vine')`` returns the ``VineIE`` class.
    Raises KeyError if no such class is defined at module level.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]