2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False        # becomes True once _real_initialize() has run
    _downloader = None    # FileDownloader instance, set via set_downloader()
    _WORKING = True       # subclasses set this to False when known broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Strip the trailing "IE" from the class name, e.g. YoutubeIE -> Youtube
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses all screen output for this request
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header, if any
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string, not a Request object
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on mis-declared encodings
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # pattern is a list: try each in turn, keep the first match
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name on capable terminals (not Windows)
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (default: 1 result), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the prefix off the query and delegate to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No prefix: download only the first result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the backend's maximum rather than failing
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # fixed typo: "sublclasses" -> "subclasses"
        raise NotImplementedError("This method must be implemented by subclasses")
# NOTE(review): this block is a whitespace-mangled numbered paste — the leading
# integer on each line is the original file's line number, indentation is lost,
# and interior lines are missing (the numbers jump). Code kept byte-identical;
# only review comments added. Restore from upstream youtube-dl history.
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
# _VALID_URL fragment (verbose regex): matches watch/embed/short URLs and
# captures the video id in group 2; group 1 (not visible here) guards the tail.
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension map (entries elided in this paste)
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display-dimension map (entries elided in this paste)
317 _video_dimensions = {
# NOTE(review): presumably decorated @classmethod upstream — confirm.
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
# Playlist URLs also match _VALID_URL; defer those to YoutubePlaylistIE.
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
357 def report_video_subtitles_download(self, video_id):
358 """Report the check for available subtitles."""
359 self.to_screen(u'%s: Checking available subtitles' % video_id)
361 def report_video_subtitles_request(self, video_id, sub_lang, format):
362 """Report attempt to download video subtitles."""
363 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
374 def report_unavailable_format(self, video_id, format):
375 """Report that a requested format is not available."""
376 self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
# NOTE(review): presumably decorated @staticmethod upstream (no self) — confirm.
383 def _decrypt_signature(s):
384 """Decrypt the key the two subkeys must have a length of 43"""
# a and b (split of s, line elided) are shuffled then joined reversed.
386 if len(a) != 43 or len(b) != 43:
387 raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid')
388 b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
390 s_dec = '.'.join((a,b))[::-1]
# Returns (lang_code -> name) dict on success, or a (message, None) error tuple.
393 def _get_available_subtitles(self, video_id):
394 self.report_video_subtitles_download(video_id)
395 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
397 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
398 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
399 return (u'unable to download video subtitles: %s' % compat_str(err), None)
400 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
401 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
402 if not sub_lang_list:
403 return (u'video doesn\'t have subtitles', None)
406 def _list_available_subtitles(self, video_id):
407 sub_lang_list = self._get_available_subtitles(video_id)
408 self.report_video_subtitles_available(video_id, sub_lang_list)
# Returns a (error_message, sub_lang, sub) tuple; error_message is None on success.
410 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
413 (error_message, sub_lang, sub)
415 self.report_video_subtitles_request(video_id, sub_lang, format)
416 params = compat_urllib_parse.urlencode({
422 url = 'http://www.youtube.com/api/timedtext?' + params
424 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
425 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
426 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
428 return (u'Did not fetch video subtitles', None, None)
429 return (None, sub_lang, sub)
431 def _request_automatic_caption(self, video_id, webpage):
432 """We need the webpage for getting the captions url, pass it as an
433 argument to speed up the process."""
434 sub_lang = self._downloader.params.get('subtitleslang') or 'en'
435 sub_format = self._downloader.params.get('subtitlesformat')
436 self.to_screen(u'%s: Looking for automatic captions' % video_id)
# The caption url only lives in the embedded ytplayer.config JSON blob.
437 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
438 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
440 return [(err_msg, None, None)]
441 player_config = json.loads(mobj.group(1))
443 args = player_config[u'args']
444 caption_url = args[u'ttsurl']
445 timestamp = args[u'timestamp']
446 params = compat_urllib_parse.urlencode({
453 subtitles_url = caption_url + '&' + params
454 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
455 return [(None, sub_lang, sub)]
457 return [(err_msg, None, None)]
461 Return a list with a tuple:
462 [(error_message, sub_lang, sub)]
# Picks one language: user's choice, else 'en', else first available.
464 sub_lang_list = self._get_available_subtitles(video_id)
465 sub_format = self._downloader.params.get('subtitlesformat')
466 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
467 return [(sub_lang_list[0], None, None)]
468 if self._downloader.params.get('subtitleslang', False):
469 sub_lang = self._downloader.params.get('subtitleslang')
470 elif 'en' in sub_lang_list:
473 sub_lang = list(sub_lang_list.keys())[0]
474 if not sub_lang in sub_lang_list:
475 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
477 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Downloads every available subtitle track, accumulating tuples in `subtitles`.
480 def _extract_all_subtitles(self, video_id):
481 sub_lang_list = self._get_available_subtitles(video_id)
482 sub_format = self._downloader.params.get('subtitlesformat')
483 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
484 return [(sub_lang_list[0], None, None)]
486 for sub_lang in sub_lang_list:
487 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
488 subtitles.append(subtitle)
491 def _print_formats(self, formats):
492 print('Available formats:')
494 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Sets language, then logs in with --username/--password or .netrc credentials,
# then confirms age. Any step may be skipped or degrade to a warning.
496 def _real_initialize(self):
497 if self._downloader is None:
502 downloader_params = self._downloader.params
504 # Attempt to use provided username and password or .netrc data
505 if downloader_params.get('username', None) is not None:
506 username = downloader_params['username']
507 password = downloader_params['password']
508 elif downloader_params.get('usenetrc', False):
510 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
515 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
516 except (IOError, netrc.NetrcParseError) as err:
517 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set language to English so date/metadata scraping regexes match.
521 request = compat_urllib_request.Request(self._LANG_URL)
524 compat_urllib_request.urlopen(request).read()
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
529 # No authentication to be performed
# Google login: scrape GALX/dsh hidden fields, then POST the full form.
533 request = compat_urllib_request.Request(self._LOGIN_URL)
535 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
536 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
537 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
542 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
544 galx = match.group(1)
546 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
552 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
556 u'PersistentCookie': u'yes',
558 u'bgresponse': u'js_disabled',
559 u'checkConnection': u'',
560 u'checkedDomains': u'youtube',
566 u'signIn': u'Sign in',
568 u'service': u'youtube',
572 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
574 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
575 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
576 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
579 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, the login failed.
580 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
581 self._downloader.report_warning(u'unable to log in: bad username or password')
583 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
584 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Age confirmation POST (form fields partially elided in this paste).
590 'action_confirm': 'Confirm',
592 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
594 self.report_age_confirmation()
595 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
596 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
597 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
599 def _extract_id(self, url):
600 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
602 raise ExtractorError(u'Invalid URL: %s' % url)
# Group 2 of _VALID_URL is the 11-character video id.
603 video_id = mobj.group(2)
606 def _real_extract(self, url):
607 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
608 mobj = re.search(self._NEXT_URL_RE, url)
610 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
611 video_id = self._extract_id(url)
# Get video webpage
614 self.report_video_webpage_download(video_id)
615 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
616 request = compat_urllib_request.Request(url)
618 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
619 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
620 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
622 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
624 # Attempt to extract SWF player URL
625 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
627 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try each el= variant of get_video_info until one yields a token.
632 self.report_video_info_webpage_download(video_id)
633 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
634 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
635 % (video_id, el_type))
636 video_info_webpage = self._download_webpage(video_info_url, video_id,
638 errnote='unable to download video info webpage')
639 video_info = compat_parse_qs(video_info_webpage)
640 if 'token' in video_info:
642 if 'token' not in video_info:
643 if 'reason' in video_info:
644 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
646 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
648 # Check for "rental" videos
649 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
650 raise ExtractorError(u'"rental" videos not supported')
652 # Start extracting information
653 self.report_information_extraction(video_id)
# uploader
656 if 'author' not in video_info:
657 raise ExtractorError(u'Unable to extract uploader name')
658 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id (best effort: scraped from the webpage, may stay None)
661 video_uploader_id = None
662 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
664 video_uploader_id = mobj.group(1)
666 self._downloader.report_warning(u'unable to extract uploader nickname')
# title
669 if 'title' not in video_info:
670 raise ExtractorError(u'Unable to extract video title')
671 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail (optional)
674 if 'thumbnail_url' not in video_info:
675 self._downloader.report_warning(u'unable to extract video thumbnail')
677 else: # don't panic if we can't find it
678 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scrape the visible date string, normalize to YYYYMMDD
682 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
684 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
685 upload_date = unified_strdate(upload_date)
# description: page element first, <meta> tag as fallback
688 video_description = get_element_by_id("eow-description", video_webpage)
689 if video_description:
690 video_description = clean_html(video_description)
692 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
694 video_description = unescapeHTML(fd_mobj.group(1))
696 video_description = u''
# subtitles
699 video_subtitles = None
701 if self._downloader.params.get('writesubtitles', False):
702 video_subtitles = self._extract_subtitle(video_id)
704 (sub_error, sub_lang, sub) = video_subtitles[0]
706 # We try with the automatic captions
707 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
708 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
712 # We report the original error
713 self._downloader.report_warning(sub_error)
715 if self._downloader.params.get('allsubtitles', False):
716 video_subtitles = self._extract_all_subtitles(video_id)
717 for video_subtitle in video_subtitles:
718 (sub_error, sub_lang, sub) = video_subtitle
720 self._downloader.report_warning(sub_error)
722 if self._downloader.params.get('listsubtitles', False):
723 sub_lang_list = self._list_available_subtitles(video_id)
726 if 'length_seconds' not in video_info:
727 self._downloader.report_warning(u'unable to extract video duration')
730 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token
733 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
735 # Decide which formats to download
736 req_format = self._downloader.params.get('format', None)
739 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
740 info = json.loads(mobj.group(1))
# BUG(review): "or 'dashmpd'" is a non-empty string, i.e. always truthy, so
# this condition is always True. Almost certainly intended as
# "or 'dashmpd' in args" — confirm against upstream before changing.
742 if args.get('ptk','') == 'vevo' or 'dashmpd':
743 # Vevo videos with encrypted signatures
744 self.to_screen(u'Vevo video detected.')
745 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
749 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
750 self.report_rtmp_download()
751 video_url_list = [(None, video_info['conn'][0])]
752 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Build itag -> direct URL map, decrypting the signature when needed.
754 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
755 url_data = compat_parse_qs(url_data_str)
756 if 'itag' in url_data and 'url' in url_data:
757 url = url_data['url'][0]
758 if 'sig' in url_data:
759 url += '&signature=' + url_data['sig'][0]
761 signature = self._decrypt_signature(url_data['s'][0])
762 url += '&signature=' + signature
763 if 'ratebypass' not in url:
764 url += '&ratebypass=yes'
765 url_map[url_data['itag'][0]] = url
767 format_limit = self._downloader.params.get('format_limit', None)
768 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
769 if format_limit is not None and format_limit in available_formats:
770 format_list = available_formats[available_formats.index(format_limit):]
772 format_list = available_formats
773 existing_formats = [x for x in format_list if x in url_map]
774 if len(existing_formats) == 0:
775 raise ExtractorError(u'no known formats available for video')
776 if self._downloader.params.get('listformats', None):
777 self._print_formats(existing_formats)
# Format selection: best (default), worst, all, or a slash-separated preference list.
779 if req_format is None or req_format == 'best':
780 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
781 elif req_format == 'worst':
782 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
783 elif req_format in ('-1', 'all'):
784 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
786 # Specific formats. We pick the first in a slash-delimeted sequence.
787 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
788 req_formats = req_format.split('/')
789 video_url_list = None
790 for rf in req_formats:
792 video_url_list = [(rf, url_map[rf])]
794 if video_url_list is None:
795 raise ExtractorError(u'requested format not available')
797 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format.
800 for format_param, video_real_url in video_url_list:
# Extension
802 video_extension = self._video_extensions.get(format_param, 'flv')
804 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
805 self._video_dimensions.get(format_param, '???'))
809 'url': video_real_url,
810 'uploader': video_uploader,
811 'uploader_id': video_uploader_id,
812 'upload_date': upload_date,
813 'title': video_title,
814 'ext': video_extension,
815 'format': video_format,
816 'thumbnail': video_thumbnail,
817 'description': video_description,
818 'player_url': player_url,
819 'subtitles': video_subtitles,
820 'duration': video_duration
# NOTE(review): whitespace-mangled numbered paste with missing interior lines
# (e.g. the try: statements and the disclaimer form body). Code kept
# byte-identical; only review comments added.
825 class MetacafeIE(InfoExtractor):
826 """Information Extractor for metacafe.com."""
827 
# Group 1 of _VALID_URL is the video id, group 2 the simplified title.
828 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
829 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
830 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
831 IE_NAME = u'metacafe'
833 def report_disclaimer(self):
834 """Report disclaimer retrieval."""
835 self.to_screen(u'Retrieving disclaimer')
# Fetches the family-filter disclaimer page, then POSTs the age confirmation
# so later extraction sees unfiltered content.
837 def _real_initialize(self):
838 # Retrieve disclaimer
839 request = compat_urllib_request.Request(self._DISCLAIMER)
841 self.report_disclaimer()
842 disclaimer = compat_urllib_request.urlopen(request).read()
843 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
844 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
# Confirm age (form fields partially elided in this paste)
849 'submit': "Continue - I'm over 18",
851 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
853 self.report_age_confirmation()
854 disclaimer = compat_urllib_request.urlopen(request).read()
855 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
856 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
858 def _real_extract(self, url):
859 # Extract id and simplified title from URL
860 mobj = re.match(self._VALID_URL, url)
862 raise ExtractorError(u'Invalid URL: %s' % url)
864 video_id = mobj.group(1)
866 # Check if video comes from YouTube
867 mobj2 = re.match(r'^yt-(.*)$', video_id)
868 if mobj2 is not None:
# "yt-<id>" videos are hosted on YouTube; delegate via url_result.
869 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
871 # Retrieve video webpage to extract further information
872 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
874 # Extract URL, uploader and title from webpage
875 self.report_extraction(video_id)
876 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
878 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
879 video_extension = mediaURL[-3:]
881 # Extract gdaKey if available
882 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
886 gdaKey = mobj.group(1)
887 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob when &mediaURL= is absent.
889 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
891 raise ExtractorError(u'Unable to extract media URL')
892 vardict = compat_parse_qs(mobj.group(1))
893 if 'mediaData' not in vardict:
894 raise ExtractorError(u'Unable to extract media URL')
895 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
897 raise ExtractorError(u'Unable to extract media URL')
898 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
899 video_extension = mediaURL[-3:]
900 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
902 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
904 raise ExtractorError(u'Unable to extract title')
# BUG(review): .decode('utf-8') on a str raises AttributeError under Python 3
# (webpage is already unicode). Same applies to the .decode calls in the
# return dict below. Confirm against upstream before changing.
905 video_title = mobj.group(1).decode('utf-8')
907 mobj = re.search(r'submitter=(.*?);', webpage)
909 raise ExtractorError(u'Unable to extract uploader nickname')
910 video_uploader = mobj.group(1)
# Result dict (list wrapper elided in this paste)
913 'id': video_id.decode('utf-8'),
914 'url': video_url.decode('utf-8'),
915 'uploader': video_uploader.decode('utf-8'),
917 'title': video_title,
918 'ext': video_extension.decode('utf-8'),
# NOTE(review): whitespace-mangled numbered paste with missing interior lines
# (e.g. the quality-selection loop body and the return-dict wrapper). Code kept
# byte-identical; only review comments added.
921 class DailymotionIE(InfoExtractor):
922 """Information Extractor for Dailymotion"""
924 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
925 IE_NAME = u'dailymotion'
927 def _real_extract(self, url):
928 # Extract id and simplified title from URL
929 mobj = re.match(self._VALID_URL, url)
931 raise ExtractorError(u'Invalid URL: %s' % url)
# URL slug is "<id>_<title>"; keep only the id, drop any query string.
933 video_id = mobj.group(1).split('_')[0].split('?')[0]
935 video_extension = 'mp4'
937 # Retrieve video webpage to extract further information
938 request = compat_urllib_request.Request(url)
# Disable the family filter so age-restricted pages are served.
939 request.add_header('Cookie', 'family_filter=off')
940 webpage = self._download_webpage(request, video_id)
942 # Extract URL, uploader and title from webpage
943 self.report_extraction(video_id)
944 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
946 raise ExtractorError(u'Unable to extract media URL')
947 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Probe qualities best-first; the first key present in flashvars wins
# (loop body selecting max_quality is elided in this paste).
949 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
952 self.to_screen(u'Using %s' % key)
955 raise ExtractorError(u'Unable to extract video URL')
957 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
959 raise ExtractorError(u'Unable to extract video URL')
# JSON-escaped URL: unquote and fix escaped slashes.
961 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
963 # TODO: support choosing qualities
965 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
967 raise ExtractorError(u'Unable to extract title')
968 video_title = unescapeHTML(mobj.group('title'))
970 video_uploader = None
971 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
972 # Looking for official user
973 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
974 webpage, 'video uploader')
976 video_upload_date = None
977 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# Page shows DD-MM-YYYY; reorder to the canonical YYYYMMDD.
979 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result dict (list wrapper and id/url keys elided in this paste)
984 'uploader': video_uploader,
985 'upload_date': video_upload_date,
986 'title': video_title,
987 'ext': video_extension,
# NOTE(review): truncated chunk -- `if mobj is None:` guards, the `return`/
# dict openers of both info dicts, and the fall-through after the JSON branch
# are missing.  `.decode('utf-8')` on str results is Python-2-era code.
991 class PhotobucketIE(InfoExtractor):
992 """Information extractor for photobucket.com."""
994 # TODO: the original _VALID_URL was:
995 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
996 # Check if it's necessary to keep the old extracion process
997 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
998 IE_NAME = u'photobucket'
1000 def _real_extract(self, url):
1001 # Extract id from URL
1002 mobj = re.match(self._VALID_URL, url)
1004 raise ExtractorError(u'Invalid URL: %s' % url)
1006 video_id = mobj.group('id')
1008 video_extension = mobj.group('ext')
1010 # Retrieve video webpage to extract further information
1011 webpage = self._download_webpage(url, video_id)
1013 # Extract URL, uploader, and title from webpage
1014 self.report_extraction(video_id)
1015 # We try first by looking the javascript code:
1016 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
1017 if mobj is not None:
# Preferred path: structured metadata embedded as JSON in the page JS.
1018 info = json.loads(mobj.group('json'))
1021 'url': info[u'downloadUrl'],
1022 'uploader': info[u'username'],
1023 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1024 'title': info[u'title'],
1025 'ext': video_extension,
1026 'thumbnail': info[u'thumbUrl'],
# Fallback path: scrape <link rel="video_src"> and the <title> tag.
1029 # We try looking in other parts of the webpage
1030 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1031 webpage, u'video URL')
1033 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1035 raise ExtractorError(u'Unable to extract title')
1036 video_title = mobj.group(1).decode('utf-8')
1037 video_uploader = mobj.group(2).decode('utf-8')
1040 'id': video_id.decode('utf-8'),
1041 'url': video_url.decode('utf-8'),
1042 'uploader': video_uploader,
1043 'upload_date': None,
1044 'title': video_title,
1045 'ext': video_extension.decode('utf-8'),
# NOTE(review): truncated chunk -- `if ... is None:` guards, the `if m_id is
# None:` branch header, the `meta = ...` assignment feeding lines 1100-1102,
# and the returned info dict opener are missing.
1049 class YahooIE(InfoExtractor):
1050 """Information extractor for screen.yahoo.com."""
1051 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1053 def _real_extract(self, url):
1054 mobj = re.match(self._VALID_URL, url)
1056 raise ExtractorError(u'Invalid URL: %s' % url)
1057 video_id = mobj.group('id')
1058 webpage = self._download_webpage(url, video_id)
# The page may define a different CONTENT_ID; which branch runs depends on it.
1059 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1062 # TODO: Check which url parameters are required
1063 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1064 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Multi-line VERBOSE regex over the MRSS response: title, description,
# publication date and large thumbnail.
1065 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1066 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1067 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1068 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1070 self.report_extraction(video_id)
1071 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1073 raise ExtractorError(u'Unable to extract video info')
1074 video_title = m_info.group('title')
1075 video_description = m_info.group('description')
1076 video_thumb = m_info.group('thumb')
1077 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to YYYYMMDD.
1078 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1080 # TODO: Find a way to get mp4 videos
1081 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1082 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1083 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1084 video_url = m_rest.group('url')
1085 video_path = m_rest.group('path')
1087 raise ExtractorError(u'Unable to extract video url')
1089 else: # We have to use a different method if another id is defined
1090 long_id = m_id.group('new_id')
# YQL query against yahoo.media.video.streams, response arrives as JSONP.
1091 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1092 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1093 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1094 info = json.loads(json_str)
1095 res = info[u'query'][u'results'][u'mediaObj'][0]
1096 stream = res[u'streams'][0]
1097 video_path = stream[u'path']
1098 video_url = stream[u'host']
1100 video_title = meta[u'title']
1101 video_description = meta[u'description']
1102 video_thumb = meta[u'thumbnail']
1103 video_date = None # I can't find it
# RTMP-style result: `url` is the host, `play_path` the stream path.
1108 'play_path': video_path,
1109 'title':video_title,
1110 'description': video_description,
1111 'thumbnail': video_thumb,
1112 'upload_date': video_date,
# NOTE(review): truncated chunk -- the urlencode dict continuation (the xsrft
# token field), the `else:` of the https workaround, the try/except around the
# config JSON parse, loop `break`s, and the returned dict opener are missing.
1117 class VimeoIE(InfoExtractor):
1118 """Information extractor for vimeo.com."""
1120 # _VALID_URL matches Vimeo URLs
1121 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
# POSTs the user-supplied password together with the page's xsrft token to
# <url>/password.  Raises if no --password was given.
1124 def _verify_video_password(self, url, video_id, webpage):
1125 password = self._downloader.params.get('password', None)
1126 if password is None:
1127 raise ExtractorError(u'This video is protected by a password, use the --password option')
1128 token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
1129 data = compat_urllib_parse.urlencode({'password': password,
1131 # I didn't manage to use the password with https
1132 if url.startswith('https'):
1133 pass_url = url.replace('https','http')
1136 password_request = compat_urllib_request.Request(pass_url+'/password', data)
1137 password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1138 password_request.add_header('Cookie', 'xsrft=%s' % token)
1139 pass_web = self._download_webpage(password_request, video_id,
1140 u'Verifying the password',
1143 def _real_extract(self, url, new_video=True):
1144 # Extract ID from URL
1145 mobj = re.match(self._VALID_URL, url)
1147 raise ExtractorError(u'Invalid URL: %s' % url)
1149 video_id = mobj.group('id')
1150 if not mobj.group('proto'):
1151 url = 'https://' + url
# Canonicalize player/pro URLs to the plain vimeo.com watch URL.
1152 if mobj.group('direct_link') or mobj.group('pro'):
1153 url = 'https://vimeo.com/' + video_id
1155 # Retrieve video webpage to extract further information
1156 request = compat_urllib_request.Request(url, None, std_headers)
1157 webpage = self._download_webpage(request, video_id)
1159 # Now we begin extracting as much information as we can from what we
1160 # retrieved. First we extract the information common to all extractors,
1161 # and latter we extract those that are Vimeo specific.
1162 self.report_extraction(video_id)
1164 # Extract the config JSON
1166 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1167 config = json.loads(config)
1169 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1170 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
# Password-protected video: verify, then retry the whole extraction once.
1172 if re.search('If so please provide the correct password.', webpage):
1173 self._verify_video_password(url, video_id, webpage)
1174 return self._real_extract(url)
1176 raise ExtractorError(u'Unable to extract info section')
1179 video_title = config["video"]["title"]
1181 # Extract uploader and uploader_id
1182 video_uploader = config["video"]["owner"]["name"]
1183 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1185 # Extract video thumbnail
1186 video_thumbnail = config["video"]["thumbnail"]
1188 # Extract video description
1189 video_description = get_element_by_attribute("itemprop", "description", webpage)
1190 if video_description: video_description = clean_html(video_description)
1191 else: video_description = u''
1193 # Extract upload date
1194 video_upload_date = None
1195 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1196 if mobj is not None:
1197 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1199 # Vimeo specific: extract request signature and timestamp
1200 sig = config['request']['signature']
1201 timestamp = config['request']['timestamp']
1203 # Vimeo specific: extract video codec and quality information
1204 # First consider quality, then codecs, then take everything
1205 # TODO bind to format param
1206 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1207 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by quality tier...
1208 for codec_name, codec_extension in codecs:
1209 if codec_name in config["video"]["files"]:
1210 if 'hd' in config["video"]["files"][codec_name]:
1211 files['hd'].append((codec_name, codec_extension, 'hd'))
1212 elif 'sd' in config["video"]["files"][codec_name]:
1213 files['sd'].append((codec_name, codec_extension, 'sd'))
1215 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# ...then take the first non-empty tier, best first.
1217 for quality in ('hd', 'sd', 'other'):
1218 if len(files[quality]) > 0:
1219 video_quality = files[quality][0][2]
1220 video_codec = files[quality][0][0]
1221 video_extension = files[quality][0][1]
1222 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1225 raise ExtractorError(u'No known codec found')
1227 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1228 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1233 'uploader': video_uploader,
1234 'uploader_id': video_uploader_id,
1235 'upload_date': video_upload_date,
1236 'title': video_title,
1237 'ext': video_extension,
1238 'thumbnail': video_thumbnail,
1239 'description': video_description,
# NOTE(review): truncated chunk -- the try: before urlopen, the info-dict
# initialization/return in grep_webpage, several grep_webpage argument lines
# (regex flags / matchTuples list brackets) and the final return of
# _real_extract are missing.
1243 class ArteTvIE(InfoExtractor):
1244 """arte.tv information extractor."""
1246 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1247 _LIVE_URL = r'index-[0-9]+\.html$'
1249 IE_NAME = u'arte.tv'
# Raw page fetch; wraps network errors in ExtractorError.
1251 def fetch_webpage(self, url):
1252 request = compat_urllib_request.Request(url)
1254 self.report_download_webpage(url)
1255 webpage = compat_urllib_request.urlopen(request).read()
1256 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1257 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1258 except ValueError as err:
1259 raise ExtractorError(u'Invalid URL: %s' % url)
# Fetches `url`, applies `regex`, and maps match groups to dict keys via
# matchTuples = [(group_index, key, error_message), ...].
1262 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1263 page = self.fetch_webpage(url)
1264 mobj = re.search(regex, page, regexFlags)
1268 raise ExtractorError(u'Invalid URL: %s' % url)
1270 for (i, key, err) in matchTuples:
1271 if mobj.group(i) is None:
1272 raise ExtractorError(err)
1274 info[key] = mobj.group(i)
# Live streams: locate videothek_js, then the rtmp path/player/url triple.
1278 def extractLiveStream(self, url):
1279 video_lang = url.split('/')[-4]
1280 info = self.grep_webpage(
1282 r'src="(.*?/videothek_js.*?\.js)',
1285 (1, 'url', u'Invalid URL: %s' % url)
1288 http_host = url.split('/')[2]
1289 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1290 info = self.grep_webpage(
1292 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1293 '(http://.*?\.swf).*?' +
1297 (1, 'path', u'could not extract video path: %s' % url),
1298 (2, 'player', u'could not extract video player: %s' % url),
1299 (3, 'url', u'could not extract video url: %s' % url)
1302 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Plus7 (catch-up) streams: follow two levels of referenced XML, then read
# id/name/date and the hd-quality URL.
1304 def extractPlus7Stream(self, url):
1305 video_lang = url.split('/')[-3]
1306 info = self.grep_webpage(
1308 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1311 (1, 'url', u'Invalid URL: %s' % url)
1314 next_url = compat_urllib_parse.unquote(info.get('url'))
1315 info = self.grep_webpage(
1317 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1320 (1, 'url', u'Could not find <video> tag: %s' % url)
1323 next_url = compat_urllib_parse.unquote(info.get('url'))
1325 info = self.grep_webpage(
1327 r'<video id="(.*?)".*?>.*?' +
1328 '<name>(.*?)</name>.*?' +
1329 '<dateVideo>(.*?)</dateVideo>.*?' +
1330 '<url quality="hd">(.*?)</url>',
1333 (1, 'id', u'could not extract video id: %s' % url),
1334 (2, 'title', u'could not extract video title: %s' % url),
1335 (3, 'date', u'could not extract video date: %s' % url),
1336 (4, 'url', u'could not extract video url: %s' % url)
1341 'id': info.get('id'),
1342 'url': compat_urllib_parse.unquote(info.get('url')),
1343 'uploader': u'arte.tv',
1344 'upload_date': unified_strdate(info.get('date')),
1345 'title': info.get('title').decode('utf-8'),
# Dispatch on URL shape: live stream vs. Plus7 catch-up page.
1351 def _real_extract(self, url):
1352 video_id = url.split('/')[-1]
1353 self.report_extraction(video_id)
1355 if re.search(self._LIVE_URL, video_id) is not None:
1356 self.extractLiveStream(url)
1359 info = self.extractPlus7Stream(url)
# NOTE(review): truncated chunk -- `_VALID_URL`, HeadRequest.get_method's
# return, several `if mobj is None:` fallthrough guards between the regex
# probes, the redirect comparison before report_following_redirect, and the
# returned info dict opener are missing.
1364 class GenericIE(InfoExtractor):
1365 """Generic last-resort information extractor."""
1368 IE_NAME = u'generic'
1370 def report_download_webpage(self, video_id):
1371 """Report webpage download."""
# Warn (outside tests) that we fell through to the generic extractor.
1372 if not self._downloader.params.get('test', False):
1373 self._downloader.report_warning(u'Falling back on generic information extractor.')
1374 super(GenericIE, self).report_download_webpage(video_id)
1376 def report_following_redirect(self, new_url):
1377 """Report information extraction."""
1378 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1380 def _test_redirect(self, url):
1381 """Check if it is a redirect, like url shorteners, in case return the new url."""
# HEAD request so shorteners are resolved without downloading bodies.
1382 class HeadRequest(compat_urllib_request.Request):
1383 def get_method(self):
1386 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1388 Subclass the HTTPRedirectHandler to make it use our
1389 HeadRequest also on the redirected URL
1391 def redirect_request(self, req, fp, code, msg, headers, newurl):
1392 if code in (301, 302, 303, 307):
1393 newurl = newurl.replace(' ', '%20')
1394 newheaders = dict((k,v) for k,v in req.headers.items()
1395 if k.lower() not in ("content-length", "content-type"))
1396 return HeadRequest(newurl,
1398 origin_req_host=req.get_origin_req_host(),
1401 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1403 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1405 Fallback to GET if HEAD is not allowed (405 HTTP error)
1407 def http_error_405(self, req, fp, code, msg, headers):
1411 newheaders = dict((k,v) for k,v in req.headers.items()
1412 if k.lower() not in ("content-length", "content-type"))
1413 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1415 origin_req_host=req.get_origin_req_host(),
# Build a bespoke opener with the HEAD-preserving handlers above.
1419 opener = compat_urllib_request.OpenerDirector()
1420 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1421 HTTPMethodFallback, HEADRedirectHandler,
1422 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1423 opener.add_handler(handler())
1425 response = opener.open(HeadRequest(url))
1426 if response is None:
1427 raise ExtractorError(u'Invalid URL protocol')
1428 new_url = response.geturl()
1433 self.report_following_redirect(new_url)
1436 def _real_extract(self, url):
1437 new_url = self._test_redirect(url)
1438 if new_url: return [self.url_result(new_url)]
1440 video_id = url.split('/')[-1]
1442 webpage = self._download_webpage(url, video_id)
1443 except ValueError as err:
1444 # since this is the last-resort InfoExtractor, if
1445 # this error is thrown, it'll be thrown here
1446 raise ExtractorError(u'Invalid URL: %s' % url)
1448 self.report_extraction(video_id)
# Probe a cascade of common embedding patterns, cheapest first.
1449 # Start with something easy: JW Player in SWFObject
1450 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1452 # Broaden the search a little bit
1453 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1455 # Broaden the search a little bit: JWPlayer JS loader
1456 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1458 # Try to find twitter cards info
1459 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1461 # We look for Open Graph info:
1462 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1463 m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1464 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1465 if m_video_type is not None:
1466 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
1468 raise ExtractorError(u'Invalid URL: %s' % url)
1470 # It's possible that one of the regexes
1471 # matched, but returned an empty group:
1472 if mobj.group(1) is None:
1473 raise ExtractorError(u'Invalid URL: %s' % url)
1475 video_url = compat_urllib_parse.unquote(mobj.group(1))
1476 video_id = os.path.basename(video_url)
1478 # here's a fun little line of code for you:
1479 video_extension = os.path.splitext(video_id)[1][1:]
1480 video_id = os.path.splitext(video_id)[0]
1482 # it's tempting to parse this further, but you would
1483 # have to take into account all the variations like
1484 # Video Title - Site Name
1485 # Site Name | Video Title
1486 # Video Title - Tagline | Site Name
1487 # and so on and so forth; it's just not practical
1488 video_title = self._html_search_regex(r'<title>(.*)</title>',
1489 webpage, u'video title')
1491 # video uploader is domain name
1492 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1493 url, u'video uploader')
1498 'uploader': video_uploader,
1499 'upload_date': None,
1500 'title': video_title,
1501 'ext': video_extension,
# NOTE(review): truncated chunk -- `_MAX_RESULTS`, the initialization of
# `video_ids`/`pagenum`/`limit` before the while loop, and the `try:` around
# urlopen are missing.
1505 class YoutubeSearchIE(SearchInfoExtractor):
1506 """Information Extractor for YouTube search queries."""
1507 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1509 IE_NAME = u'youtube:search'
1510 _SEARCH_KEY = 'ytsearch'
1512 def report_download_page(self, query, pagenum):
1513 """Report attempt to download search page with given number."""
1514 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1516 def _get_n_results(self, query, n):
1517 """Get a specified number of results for a query"""
# Page through the GData API (50 ids per page) until `limit` is covered.
1523 while (50 * pagenum) < limit:
1524 self.report_download_page(query, pagenum+1)
1525 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1526 request = compat_urllib_request.Request(result_url)
1528 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1529 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1530 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1531 api_response = json.loads(data)['data']
1533 if not 'items' in api_response:
1534 raise ExtractorError(u'[youtube] No video results')
1536 new_ids = list(video['id'] for video in api_response['items'])
1537 video_ids += new_ids
# Shrink the limit to what the API says is actually available.
1539 limit = min(n, api_response['totalItems'])
1542 if len(video_ids) > n:
1543 video_ids = video_ids[:n]
1544 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1545 return self.playlist_result(videos, query)
# NOTE(review): truncated chunk -- `_MAX_RESULTS`, the full `res` playlist
# dict literal, the entry-dict opener, the >=n break inside the entry loop,
# and the final `return res` are missing.
1548 class GoogleSearchIE(SearchInfoExtractor):
1549 """Information Extractor for Google Video search queries."""
1550 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1552 IE_NAME = u'video.google:search'
1553 _SEARCH_KEY = 'gvsearch'
1555 def _get_n_results(self, query, n):
1556 """Get a specified number of results for a query"""
1559 '_type': 'playlist',
# Scrape result pages (10 hits each) until n results or no "next" link.
1564 for pagenum in itertools.count(1):
1565 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1566 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1567 note='Downloading result page ' + str(pagenum))
1569 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1572 'url': mobj.group(1)
1574 res['entries'].append(e)
1576 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
# NOTE(review): truncated chunk -- `_MAX_RESULTS`, the `res` playlist dict
# literal, the `m = info[u'm']` assignment used on line 1608, loop breaks and
# the final `return res` are missing.
1579 class YahooSearchIE(SearchInfoExtractor):
1580 """Information Extractor for Yahoo! Video search queries."""
1583 IE_NAME = u'screen.yahoo:search'
1584 _SEARCH_KEY = 'yvsearch'
1586 def _get_n_results(self, query, n):
1587 """Get a specified number of results for a query"""
1590 '_type': 'playlist',
# The search endpoint returns JSON with 30 results per page.
1594 for pagenum in itertools.count(0):
1595 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1596 webpage = self._download_webpage(result_url, query,
1597 note='Downloading results page '+str(pagenum+1))
1598 info = json.loads(webpage)
1600 results = info[u'results']
1602 for (i, r) in enumerate(results):
1603 if (pagenum * 30) +i >= n:
1605 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1606 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1607 res['entries'].append(e)
1608 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
# NOTE(review): truncated chunk -- the middle of the VERBOSE _VALID_URL
# pattern, `_MAX_RESULTS`, the `@classmethod` decorator for suitable(), the
# `if mobj is None:` guard, the videos/page_num loop header, try:, and the
# loop `break`s are missing.
1614 class YoutubePlaylistIE(InfoExtractor):
1615 """Information Extractor for YouTube playlists."""
1617 _VALID_URL = r"""(?:
1622 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1623 \? (?:.*?&)*? (?:p|a|list)=
1626 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1629 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1631 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1633 IE_NAME = u'youtube:playlist'
1636 def suitable(cls, url):
1637 """Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is a VERBOSE pattern, so re.VERBOSE must be passed here.
1638 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1640 def _real_extract(self, url):
1641 # Extract playlist id
1642 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1644 raise ExtractorError(u'Invalid URL: %s' % url)
1646 # Download playlist videos from API
# Either the query-string group or the bare-id group matched.
1647 playlist_id = mobj.group(1) or mobj.group(2)
1652 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1653 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1656 response = json.loads(page)
1657 except ValueError as err:
1658 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1660 if 'feed' not in response:
1661 raise ExtractorError(u'Got a malformed response from YouTube API')
1662 playlist_title = response['feed']['title']['$t']
1663 if 'entry' not in response['feed']:
1664 # Number of videos is a multiple of self._MAX_RESULTS
1667 for entry in response['feed']['entry']:
1668 index = entry['yt$position']['$t']
1669 if 'media$group' in entry and 'media$player' in entry['media$group']:
1670 videos.append((index, entry['media$group']['media$player']['url']))
1672 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then drop the index.
1676 videos = [v[1] for v in sorted(videos)]
1678 url_results = [self.url_result(url, 'Youtube') for url in videos]
1679 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# NOTE(review): truncated chunk -- `ids_in_page = []` / `return ids_in_page`
# in extract_videos_from_page, the `if mobj is None:` guard, the
# `video_ids = []` / `pagenum = 1` initialization, the `while True:` loop
# header and its `break` are missing.
1682 class YoutubeChannelIE(InfoExtractor):
1683 """Information Extractor for YouTube channels."""
1685 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1686 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1687 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1688 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1689 IE_NAME = u'youtube:channel'
# Collect unique watch?v= ids from a channel HTML fragment, in page order.
1691 def extract_videos_from_page(self, page):
1693 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1694 if mobj.group(1) not in ids_in_page:
1695 ids_in_page.append(mobj.group(1))
1698 def _real_extract(self, url):
1699 # Extract channel id
1700 mobj = re.match(self._VALID_URL, url)
1702 raise ExtractorError(u'Invalid URL: %s' % url)
1704 # Download channel page
1705 channel_id = mobj.group(1)
1709 url = self._TEMPLATE_URL % (channel_id, pagenum)
1710 page = self._download_webpage(url, channel_id,
1711 u'Downloading page #%s' % pagenum)
1713 # Extract video identifiers
1714 ids_in_page = self.extract_videos_from_page(page)
1715 video_ids.extend(ids_in_page)
1717 # Download any subsequent channel pages using the json-based channel_ajax query
1718 if self._MORE_PAGES_INDICATOR in page:
1720 pagenum = pagenum + 1
1722 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1723 page = self._download_webpage(url, channel_id,
1724 u'Downloading page #%s' % pagenum)
# Subsequent pages arrive as JSON with the HTML under 'content_html'.
1726 page = json.loads(page)
1728 ids_in_page = self.extract_videos_from_page(page['content_html'])
1729 video_ids.extend(ids_in_page)
1731 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1734 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1736 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1737 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1738 return [self.playlist_result(url_entries, channel_id)]
# NOTE(review): truncated chunk -- the `if mobj is None:` guard, the
# `video_ids = []` / `pagenum = 0` initialization, the `while True:` header,
# `ids_in_page = []` before the finditer loop, the `break` and the
# `pagenum += 1` are missing.
1741 class YoutubeUserIE(InfoExtractor):
1742 """Information Extractor for YouTube users."""
1744 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1745 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1746 _GDATA_PAGE_SIZE = 50
1747 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1748 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1749 IE_NAME = u'youtube:user'
1751 def _real_extract(self, url):
1753 mobj = re.match(self._VALID_URL, url)
1755 raise ExtractorError(u'Invalid URL: %s' % url)
1757 username = mobj.group(1)
1759 # Download video ids using YouTube Data API. Result size per
1760 # query is limited (currently to 50 videos) so we need to query
1761 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1768 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1770 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1771 page = self._download_webpage(gdata_url, username,
1772 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1774 # Extract video identifiers
1777 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1778 if mobj.group(1) not in ids_in_page:
1779 ids_in_page.append(mobj.group(1))
1781 video_ids.extend(ids_in_page)
1783 # A little optimization - if current page is not
1784 # "full", ie. does not contain PAGE_SIZE video ids then
1785 # we can assume that this page is the last one - there
1786 # are no more ids on further pages - no need to query
1789 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1794 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1795 url_results = [self.url_result(url, 'Youtube') for url in urls]
1796 return [self.playlist_result(url_results, playlist_title = username)]
# NOTE(review): truncated chunk -- `_PAGE_SIZE` (referenced on line 1849),
# the `if mobj is None:` guard, `video_ids`/`pagenum` initialization, the
# `while True:` header, `ids_in_page = []`, the `break` and `pagenum += 1`
# are missing.
1799 class BlipTVUserIE(InfoExtractor):
1800 """Information Extractor for blip.tv users."""
1802 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1804 IE_NAME = u'blip.tv:user'
1806 def _real_extract(self, url):
1808 mobj = re.match(self._VALID_URL, url)
1810 raise ExtractorError(u'Invalid URL: %s' % url)
1812 username = mobj.group(1)
# AJAX endpoint keyed by the numeric users_id scraped from the user page.
1814 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1816 page = self._download_webpage(url, username, u'Downloading user page')
1817 mobj = re.search(r'data-users-id="([^"]+)"', page)
1818 page_base = page_base % mobj.group(1)
1821 # Download video ids using BlipTV Ajax calls. Result size per
1822 # query is limited (currently to 12 videos) so we need to query
1823 # page by page until there are no video ids - it means we got
1830 url = page_base + "&page=" + str(pagenum)
1831 page = self._download_webpage(url, username,
1832 u'Downloading video ids from page %d' % pagenum)
1834 # Extract video identifiers
1837 for mobj in re.finditer(r'href="/([^"]+)"', page):
1838 if mobj.group(1) not in ids_in_page:
1839 ids_in_page.append(unescapeHTML(mobj.group(1)))
1841 video_ids.extend(ids_in_page)
1843 # A little optimization - if current page is not
1844 # "full", ie. does not contain PAGE_SIZE video ids then
1845 # we can assume that this page is the last one - there
1846 # are no more ids on further pages - no need to query
1849 if len(ids_in_page) < self._PAGE_SIZE:
1854 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1855 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1856 return [self.playlist_result(url_entries, playlist_title = username)]
# NOTE(review): truncated chunk -- the `try:` before urlopen, the `else:` of
# the restriction-message branch, and the returned info dict opener are
# missing.  `.decode('utf-8')` on str results is Python-2-era code.
1859 class DepositFilesIE(InfoExtractor):
1860 """Information extractor for depositfiles.com"""
1862 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1864 def _real_extract(self, url):
1865 file_id = url.split('/')[-1]
1866 # Rebuild url in english locale
1867 url = 'http://depositfiles.com/en/files/' + file_id
1869 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
1870 free_download_indication = { 'gateway_result' : '1' }
1871 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1873 self.report_download_webpage(file_id)
1874 webpage = compat_urllib_request.urlopen(request).read()
1875 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1876 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1878 # Search for the real file URL
1879 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1880 if (mobj is None) or (mobj.group(1) is None):
1881 # Try to figure out reason of the error.
# Surface the site's own restriction notice when download is blocked.
1882 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1883 if (mobj is not None) and (mobj.group(1) is not None):
1884 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1885 raise ExtractorError(u'%s' % restriction_message)
1887 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1889 file_url = mobj.group(1)
1890 file_extension = os.path.splitext(file_url)[1][1:]
1892 # Search for file title
1893 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1896 'id': file_id.decode('utf-8'),
1897 'url': file_url.decode('utf-8'),
1899 'upload_date': None,
1900 'title': file_title,
1901 'ext': file_extension.decode('utf-8'),
# NOTE(review): truncated chunk -- the early-return of _real_initialize, the
# useremail/password initialization, the netrc try:, the login_form dict, the
# login try:/returns, the `if mobj is None:` guard, the `if video_url is
# None:` chain around sd_src, and the returned info dict opener are missing.
1905 class FacebookIE(InfoExtractor):
1906 """Information Extractor for Facebook"""
1908 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1909 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1910 _NETRC_MACHINE = 'facebook'
1911 IE_NAME = u'facebook'
1913 def report_login(self):
1914 """Report attempt to log in."""
1915 self.to_screen(u'Logging in')
# Optional login using --username/--password or the 'facebook' .netrc entry;
# login failures only warn, they do not abort extraction.
1917 def _real_initialize(self):
1918 if self._downloader is None:
1923 downloader_params = self._downloader.params
1925 # Attempt to use provided username and password or .netrc data
1926 if downloader_params.get('username', None) is not None:
1927 useremail = downloader_params['username']
1928 password = downloader_params['password']
1929 elif downloader_params.get('usenetrc', False):
1931 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1932 if info is not None:
1936 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1937 except (IOError, netrc.NetrcParseError) as err:
1938 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1941 if useremail is None:
1950 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1953 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> in the response means authentication was rejected.
1954 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1955 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1957 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1958 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1961 def _real_extract(self, url):
1962 mobj = re.match(self._VALID_URL, url)
1964 raise ExtractorError(u'Invalid URL: %s' % url)
1965 video_id = mobj.group('ID')
1967 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1968 webpage = self._download_webpage(url, video_id)
# Video params are sandwiched between two known SWF-setup JS fragments.
1970 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1971 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1972 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1974 raise ExtractorError(u'Cannot parse data')
1975 data = dict(json.loads(m.group(1)))
1976 params_raw = compat_urllib_parse.unquote(data['params'])
1977 params = json.loads(params_raw)
1978 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD.
1979 video_url = video_data.get('hd_src')
1981 video_url = video_data['sd_src']
1983 raise ExtractorError(u'Cannot find video URL')
1984 video_duration = int(video_data['video_duration'])
1985 thumbnail = video_data['thumbnail_src']
1987 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1992 'title': video_title,
1995 'duration': video_duration,
1996 'thumbnail': thumbnail,
# NOTE(review): lines below carry fused line numbers and the dump elides some
# source lines (numbering jumps); code kept byte-identical, comments only added.
2001 class BlipTVIE(InfoExtractor):
2002 """Information extractor for blip.tv"""
2004 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Used to pull the filename extension off the final media URL.
2005 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2006 IE_NAME = u'blip.tv'
2008 def report_direct_download(self, title):
2009 """Report information extraction."""
2010 self.to_screen(u'%s: Direct download detected' % title)
2012 def _real_extract(self, url):
2013 mobj = re.match(self._VALID_URL, url)
2015 raise ExtractorError(u'Invalid URL: %s' % url)
2017 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf#<id> URLs are rewritten to the /play/ form first.
2018 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
2019 if api_mobj is not None:
2020 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
2021 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; follow the redirect, read the real file id out of the
# URL fragment, and recurse with the canonical /a/a-<id> URL.
2022 if urlp.path.startswith('/play/'):
2023 request = compat_urllib_request.Request(url)
2024 response = compat_urllib_request.urlopen(request)
2025 redirecturl = response.geturl()
2026 rurlp = compat_urllib_parse_urlparse(redirecturl)
2027 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2028 url = 'http://blip.tv/a/a-' + file_id
2029 return self._real_extract(url)
# (elided: choice of '?' vs '&' separator stored in cchar)
# Ask the site for JSON metadata; the iTunes User-Agent is required by blip.tv.
2036 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2037 request = compat_urllib_request.Request(json_url)
2038 request.add_header('User-Agent', 'iTunes/10.6.1')
2039 self.report_extraction(mobj.group(1))
2042 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself, synthesize the info dict
# directly from the URL instead of parsing JSON.
2043 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2044 basename = url.split('/')[-1]
2045 title,ext = os.path.splitext(basename)
2046 title = title.decode('UTF-8')
2047 ext = ext.replace('.', '')
2048 self.report_direct_download(title)
# (elided: opening of the direct-download info dict)
2053 'upload_date': None,
2058 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2059 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2060 if info is None: # Regular URL
2062 json_code_bytes = urlh.read()
2063 json_code = json_code_bytes.decode('utf-8')
2064 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2065 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2068 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' key; unwrap when present.
2069 if 'Post' in json_data:
2070 data = json_data['Post']
# Site dates look like '10-31-12 01:23PM'; normalize to YYYYMMDD.
2074 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2075 video_url = data['media']['url']
2076 umobj = re.match(self._URL_EXT, video_url)
2078 raise ValueError('Can not determine filename extension')
2079 ext = umobj.group(1)
2082 'id': data['item_id'],
2084 'uploader': data['display_name'],
2085 'upload_date': upload_date,
2086 'title': data['title'],
2088 'format': data['media']['mimeType'],
2089 'thumbnail': data['thumbnailUrl'],
2090 'description': data['description'],
2091 'player_url': data['embedUrl'],
# Same UA must be used for the actual media download.
2092 'user_agent': 'iTunes/10.6.1',
2094 except (ValueError,KeyError) as err:
2095 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): fused line numbers and elided source lines throughout
# (numbering jumps); code kept byte-identical, comments only added.
2100 class MyVideoIE(InfoExtractor):
2101 """Information Extractor for myvideo.de."""
2103 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2104 IE_NAME = u'myvideo'
2106 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2107 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2108 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher used to decrypt the site's encrypted player XML.
2109 def __rc4crypt(self,data, key):
# Key-scheduling phase (KSA): x init elided above this line in the dump.
2111 box = list(range(256))
2112 for i in list(range(256)):
2113 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2114 box[i], box[x] = box[x], box[i]
# Stream-generation phase (PRGA); loop header elided in the dump.
2120 y = (y + box[x]) % 256
2121 box[x], box[y] = box[y], box[x]
2122 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# Helper producing a hex MD5 digest as bytes (def line elided in dump).
2126 return hashlib.md5(s).hexdigest().encode()
2128 def _real_extract(self,url):
2129 mobj = re.match(self._VALID_URL, url)
2131 raise ExtractorError(u'invalid URL: %s' % url)
2133 video_id = mobj.group(1)
# Double-base64-encoded key material (GK); assignment opening elided.
2136 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2137 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2138 b'TnpsbA0KTVRkbU1tSTRNdz09'
2142 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2143 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: some pages expose a plain <source src=...> tag directly.
2145 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2146 if mobj is not None:
2147 self.report_extraction(video_id)
2148 video_url = mobj.group(1) + '.flv'
2150 video_title = self._html_search_regex('<title>([^<]+)</title>',
2153 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
# (elided: info dict opening for the fast path)
2159 'upload_date': None,
2160 'title': video_title,
# Slow path: parameters live in a JS flashvars object.
2165 mobj = re.search('var flashvars={(.+?)}', webpage)
2167 raise ExtractorError(u'Unable to extract video')
# Split flashvars into params; '_encxml' holds the encrypted-XML endpoint.
2172 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2173 if not a == '_encxml':
2176 encxml = compat_urllib_parse.unquote(b)
2177 if not params.get('domain'):
2178 params['domain'] = 'www.myvideo.de'
2179 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV flash player variant is not supported; switch to the D player URL.
2180 if 'flash_playertype=MTV' in xmldata_url:
2181 self._downloader.report_warning(u'avoiding MTV player')
2183 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2184 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response is 'something=<hex>'; take the hex payload and decrypt it with RC4,
# keyed from the decoded GK plus the video id (key derivation partly elided).
2188 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2189 enc_data_b = binascii.unhexlify(enc_data)
2191 base64.b64decode(base64.b64decode(GK)) +
2193 str(video_id).encode('utf-8')
2196 dec_data = self.__rc4crypt(enc_data_b, sk)
2199 self.report_extraction(video_id)
# RTMP branch: connectionurl present in the decrypted XML.
2202 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2204 video_url = compat_urllib_parse.unquote(mobj.group(1))
2205 if 'myvideo2flash' in video_url:
2206 self._downloader.report_warning(u'forcing RTMPT ...')
2207 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2210 # extract non rtmp videos
2211 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2213 raise ExtractorError(u'unable to extract url')
2214 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2216 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2217 video_file = compat_urllib_parse.unquote(video_file)
# For non-f4m files build an RTMP play path 'ext:path'; for f4m derive the
# HLS playlist by swapping the extension.
2219 if not video_file.endswith('f4m'):
2220 ppath, prefix = video_file.split('.')
2221 video_playpath = '%s:%s' % (prefix, ppath)
2222 video_hls_playlist = ''
2225 video_hls_playlist = (
2226 video_filepath + video_file
2227 ).replace('.f4m', '.m3u8')
2229 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2230 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2232 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
# (elided: info dict opening)
2238 'tc_url': video_url,
2240 'upload_date': None,
2241 'title': video_title,
2243 'play_path': video_playpath,
2244 'video_file': video_file,
2245 'video_hls_playlist': video_hls_playlist,
2246 'player_url': video_swfobj,
# NOTE(review): fused line numbers and elided source lines throughout
# (numbering jumps); code kept byte-identical, comments only added.
2250 class ComedyCentralIE(InfoExtractor):
2251 """Information extractor for The Daily Show and Colbert Report """
2253 # urls can be abbreviations like :thedailyshow or :colbert
2254 # urls for episodes like:
2255 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2256 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2257 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: matched with re.VERBOSE everywhere (note the overridden
# suitable() below, needed because the base class does not pass that flag).
2258 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2259 |(https?://)?(www\.)?
2260 (?P<showname>thedailyshow|colbertnation)\.com/
2261 (full-episodes/(?P<episode>.*)|
2263 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2264 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest to highest; turls below preserves this order.
2267 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2269 _video_extensions = {
2277 _video_dimensions = {
# Overrides the base suitable() to add re.VERBOSE for the multi-line pattern.
2287 def suitable(cls, url):
2288 """Receives a URL and returns True if suitable for this IE."""
2289 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2291 def _print_formats(self, formats):
2292 print('Available formats:')
2294 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2297 def _real_extract(self, url):
2298 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2300 raise ExtractorError(u'Invalid URL: %s' % url)
# Expand :tds / :colbert style abbreviations to the full-episodes URL,
# then re-match so the named groups are populated.
2302 if mobj.group('shortname'):
2303 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2304 url = u'http://www.thedailyshow.com/full-episodes/'
2306 url = u'http://www.colbertnation.com/full-episodes/'
2307 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2308 assert mobj is not None
2310 if mobj.group('clip'):
2311 if mobj.group('showname') == 'thedailyshow':
2312 epTitle = mobj.group('tdstitle')
2314 epTitle = mobj.group('cntitle')
2317 dlNewest = not mobj.group('episode')
2319 epTitle = mobj.group('showname')
2321 epTitle = mobj.group('episode')
2323 self.report_extraction(epTitle)
# Follow redirects (e.g. to the newest episode) and re-validate the final URL.
2324 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2326 url = htmlHandle.geturl()
2327 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2329 raise ExtractorError(u'Invalid redirected URL: ' + url)
2330 if mobj.group('episode') == '':
2331 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2332 epTitle = mobj.group('episode')
# Locate the mtvnservices player URI embedded in the page.
2334 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2336 if len(mMovieParams) == 0:
2337 # The Colbert Report embeds the information in a without
2338 # a URL prefix; so extract the alternate reference
2339 # and then add the URL prefix manually.
2341 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2342 if len(altMovieParams) == 0:
2343 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2345 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2347 uri = mMovieParams[0][1]
# Episode index: an MRSS feed listing every part of the episode.
2348 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2349 indexXml = self._download_webpage(indexUrl, epTitle,
2350 u'Downloading show index',
2351 u'unable to download episode index')
2355 idoc = xml.etree.ElementTree.fromstring(indexXml)
2356 itemEls = idoc.findall('.//item')
# One iteration per episode part; each part has its own mediaGen config.
2357 for partNum,itemEl in enumerate(itemEls):
2358 mediaId = itemEl.findall('./guid')[0].text
2359 shortMediaId = mediaId.split(':')[-1]
2360 showId = mediaId.split(':')[-2].replace('.com', '')
2361 officialTitle = itemEl.findall('./title')[0].text
2362 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2364 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2365 compat_urllib_parse.urlencode({'uri': mediaId}))
2366 configXml = self._download_webpage(configUrl, epTitle,
2367 u'Downloading configuration for %s' % shortMediaId)
2369 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp_url) pairs for every rendition offered.
2371 for rendition in cdoc.findall('.//rendition'):
2372 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2376 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2379 if self._downloader.params.get('listformats', None):
2380 self._print_formats([i[0] for i in turls])
2383 # For now, just pick the highest bitrate
2384 format,rtmp_video_url = turls[-1]
2386 # Get the format arg from the arg stream
2387 req_format = self._downloader.params.get('format', None)
2389 # Select format if we can find one
2392 format, rtmp_video_url = f, v
# Translate the rtmp(e) URL into the equivalent HTTP download URL.
2395 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2397 raise ExtractorError(u'Cannot transform RTMP url')
2398 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2399 video_url = base + m.group('finalid')
2401 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
# (elided: info dict opening for this part)
2406 'upload_date': officialDate,
2411 'description': officialTitle,
2413 results.append(info)
# NOTE(review): fused line numbers and some elided source lines (numbering
# jumps); code kept byte-identical, comments only added.
2418 class EscapistIE(InfoExtractor):
2419 """Information extractor for The Escapist """
2421 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2422 IE_NAME = u'escapist'
2424 def _real_extract(self, url):
2425 mobj = re.match(self._VALID_URL, url)
2427 raise ExtractorError(u'Invalid URL: %s' % url)
2428 showName = mobj.group('showname')
2429 videoId = mobj.group('episode')
2431 self.report_extraction(videoId)
2432 webpage = self._download_webpage(url, videoId)
# Metadata comes from <meta> tags; description and thumbnail are optional.
2434 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2435 webpage, u'description', fatal=False)
2437 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2438 webpage, u'thumbnail', fatal=False)
2440 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2441 webpage, u'player url')
# Title tag has the form 'Show : Episode'; keep the part after ' : '.
# NOTE(review): the u'player url' label here looks like a copy-paste of the
# previous search's name — probably should read u'title'.
2443 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2444 webpage, u'player url').split(' : ')[-1]
# The player URL carries a config=... query pointing at the JSON-ish config.
2446 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2447 configUrl = compat_urllib_parse.unquote(configUrl)
2449 configJSON = self._download_webpage(configUrl, videoId,
2450 u'Downloading configuration',
2451 u'unable to download configuration')
2453 # Technically, it's JavaScript, not JSON
# Naive quote replacement to coerce the JS object into parseable JSON.
2454 configJSON = configJSON.replace("'", '"')
2457 config = json.loads(configJSON)
2458 except (ValueError,) as err:
2459 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2461 playlist = config['playlist']
# The actual media URL is the second playlist entry.
2462 videoUrl = playlist[1]['url']
# (elided: info dict opening)
2467 'uploader': showName,
2468 'upload_date': None,
2471 'thumbnail': imgUrl,
2472 'description': videoDesc,
2473 'player_url': playerUrl,
# NOTE(review): fused line numbers and some elided source lines (numbering
# jumps); code kept byte-identical, comments only added.
2478 class CollegeHumorIE(InfoExtractor):
2479 """Information extractor for collegehumor.com"""
2482 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2483 IE_NAME = u'collegehumor'
2485 def report_manifest(self, video_id):
2486 """Report information extraction."""
2487 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2489 def _real_extract(self, url):
2490 mobj = re.match(self._VALID_URL, url)
2492 raise ExtractorError(u'Invalid URL: %s' % url)
2493 video_id = mobj.group('videoid')
# (elided: opening of the info dict that these keys populate)
2498 'upload_date': None,
2501 self.report_extraction(video_id)
# Step 1: fetch the moogaloop metadata XML for title/description/manifest URL.
2502 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2504 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2505 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2506 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2508 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2510 videoNode = mdoc.findall('./video')[0]
2511 info['description'] = videoNode.findall('./description')[0].text
2512 info['title'] = videoNode.findall('./caption')[0].text
2513 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2514 manifest_url = videoNode.findall('./file')[0].text
2516 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: fetch the f4m manifest (hdcore param required by Adobe HDS servers).
2518 manifest_url += '?hdcore=2.10.3'
2519 self.report_manifest(video_id)
2521 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2522 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2523 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2525 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe F4M namespace; pull media url and id.
2527 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2528 node_id = media_node.attrib['url']
2529 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2530 except IndexError as err:
2531 raise ExtractorError(u'Invalid manifest file')
# Step 3: assemble the direct segment URL from the manifest's components.
2533 url_pr = compat_urllib_parse_urlparse(manifest_url)
2534 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): fused line numbers and some elided source lines (numbering
# jumps); code kept byte-identical, comments only added.
2541 class XVideosIE(InfoExtractor):
2542 """Information extractor for xvideos.com"""
2544 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2545 IE_NAME = u'xvideos'
2547 def _real_extract(self, url):
2548 mobj = re.match(self._VALID_URL, url)
2550 raise ExtractorError(u'Invalid URL: %s' % url)
2551 video_id = mobj.group(1)
2553 webpage = self._download_webpage(url, video_id)
2555 self.report_extraction(video_id)
# The media URL is URL-encoded inside a flv_url= query parameter on the page.
2558 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2559 webpage, u'video URL'))
# Title is the <title> tag with the trailing ' - XVID...' suffix stripped.
2562 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2565 # Extract video thumbnail
2566 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2567 webpage, u'thumbnail', fatal=False)
# (elided: info dict opening)
2573 'upload_date': None,
2574 'title': video_title,
2576 'thumbnail': video_thumbnail,
2577 'description': None,
# NOTE(review): fused line numbers and some elided source lines (numbering
# jumps); code kept byte-identical, comments only added.
2583 class SoundcloudIE(InfoExtractor):
2584 """Information extractor for soundcloud.com
2585 To access the media, the uid of the song and a stream token
2586 must be extracted from the page source and the script must make
2587 a request to media.soundcloud.com/crossdomain.xml. Then
2588 the media can be grabbed by requesting from an url composed
2589 of the stream token and uid
# URL shape: soundcloud.com/<uploader>/<slug>.
2592 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2593 IE_NAME = u'soundcloud'
2595 def report_resolve(self, video_id):
2596 """Report information extraction."""
2597 self.to_screen(u'%s: Resolving id' % video_id)
2599 def _real_extract(self, url):
2600 mobj = re.match(self._VALID_URL, url)
2602 raise ExtractorError(u'Invalid URL: %s' % url)
2604 # extract uploader (which is in the url)
2605 uploader = mobj.group(1)
2606 # extract simple title (uploader + slug of song title)
2607 slug_title = mobj.group(2)
2608 simple_title = uploader + u'-' + slug_title
2609 full_title = '%s/%s' % (uploader, slug_title)
2611 self.report_resolve(full_title)
# Step 1: resolve the page URL to the track's numeric id via the API.
# NOTE(review): client_id is hard-coded; rotates occasionally on the service.
2613 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2614 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2615 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2617 info = json.loads(info_json)
2618 video_id = info['id']
2619 self.report_extraction(full_title)
# Step 2: fetch the stream definitions and pick the 128kbps MP3 HTTP stream.
2621 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2622 stream_json = self._download_webpage(streams_url, full_title,
2623 u'Downloading stream definitions',
2624 u'unable to download stream definitions')
2626 streams = json.loads(stream_json)
2627 mediaURL = streams['http_mp3_128_url']
2628 upload_date = unified_strdate(info['created_at'])
# (elided: info dict opening)
2633 'uploader': info['user']['username'],
2634 'upload_date': upload_date,
2635 'title': info['title'],
2637 'description': info['description'],
# NOTE(review): fused line numbers and some elided source lines (numbering
# jumps); code kept byte-identical, comments only added. Mirrors SoundcloudIE
# but resolves a /sets/ playlist and yields one entry per track.
2640 class SoundcloudSetIE(InfoExtractor):
2641 """Information extractor for soundcloud.com sets
2642 To access the media, the uid of the song and a stream token
2643 must be extracted from the page source and the script must make
2644 a request to media.soundcloud.com/crossdomain.xml. Then
2645 the media can be grabbed by requesting from an url composed
2646 of the stream token and uid
2649 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2650 IE_NAME = u'soundcloud:set'
2652 def report_resolve(self, video_id):
2653 """Report information extraction."""
2654 self.to_screen(u'%s: Resolving id' % video_id)
2656 def _real_extract(self, url):
2657 mobj = re.match(self._VALID_URL, url)
2659 raise ExtractorError(u'Invalid URL: %s' % url)
2661 # extract uploader (which is in the url)
2662 uploader = mobj.group(1)
2663 # extract simple title (uploader + slug of song title)
2664 slug_title = mobj.group(2)
2665 simple_title = uploader + u'-' + slug_title
2666 full_title = '%s/sets/%s' % (uploader, slug_title)
2668 self.report_resolve(full_title)
# Resolve the set URL to its JSON description (track list included).
2670 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2671 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2672 info_json = self._download_webpage(resolv_url, full_title)
2675 info = json.loads(info_json)
# API-level failures arrive as an 'errors' list rather than an HTTP error.
2676 if 'errors' in info:
2677 for err in info['errors']:
2678 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2681 self.report_extraction(full_title)
# One stream-definition request per track in the set.
2682 for track in info['tracks']:
2683 video_id = track['id']
2685 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2686 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2688 self.report_extraction(video_id)
2689 streams = json.loads(stream_json)
2690 mediaURL = streams['http_mp3_128_url']
# (elided: per-track info dict opening / append)
2695 'uploader': track['user']['username'],
2696 'upload_date': unified_strdate(track['created_at']),
2697 'title': track['title'],
2699 'description': track['description'],
# NOTE(review): fused line numbers and some elided source lines (numbering
# jumps); code kept byte-identical, comments only added.
2704 class InfoQIE(InfoExtractor):
2705 """Information extractor for infoq.com"""
2706 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2708 def _real_extract(self, url):
2709 mobj = re.match(self._VALID_URL, url)
2711 raise ExtractorError(u'Invalid URL: %s' % url)
# No numeric id in the URL scheme; the URL itself doubles as the id here.
2713 webpage = self._download_webpage(url, video_id=url)
2714 self.report_extraction(url)
# The real media path is base64-encoded in the page's jsclassref variable.
2717 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2719 raise ExtractorError(u'Unable to extract video url')
2720 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2721 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2724 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2727 # Extract description
2728 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2729 webpage, u'description', fatal=False)
# Derive id and extension from the filename component of the media URL.
2731 video_filename = video_url.split('/')[-1]
2732 video_id, extension = video_filename.split('.')
# (elided: info dict opening)
2738 'upload_date': None,
2739 'title': video_title,
2740 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2742 'description': video_description,
# NOTE(review): fused line numbers and some elided source lines (numbering
# jumps); code kept byte-identical, comments only added.
2747 class MixcloudIE(InfoExtractor):
2748 """Information extractor for www.mixcloud.com"""
# Marked broken: the site moved to a new API (see _WORKING in the base class).
2750 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2751 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2752 IE_NAME = u'mixcloud'
2754 def report_download_json(self, file_id):
2755 """Report JSON download."""
2756 self.to_screen(u'Downloading json')
2758 def get_urls(self, jsonData, fmt, bitrate='best'):
2759 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] may be a bitrate->urls mapping or a flat url list; the
# TypeError branch below handles the flat case.
2762 bitrate_list = jsonData[fmt]
2763 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2764 bitrate = max(bitrate_list) # select highest
2766 url_list = jsonData[fmt][bitrate]
2767 except TypeError: # we have no bitrate info.
2768 url_list = jsonData[fmt]
2771 def check_urls(self, url_list):
2772 """Returns 1st active url from list"""
# Probe each candidate with a GET; the first that opens wins. The except body
# (continue to next url) is elided in this dump.
2773 for url in url_list:
2775 compat_urllib_request.urlopen(url)
2777 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2782 def _print_formats(self, formats):
2783 print('Available formats:')
2784 for fmt in formats.keys():
2785 for b in formats[fmt]:
2787 ext = formats[fmt][b][0]
2788 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2789 except TypeError: # we have no bitrate info
2790 ext = formats[fmt][0]
2791 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2794 def _real_extract(self, url):
2795 mobj = re.match(self._VALID_URL, url)
2797 raise ExtractorError(u'Invalid URL: %s' % url)
2798 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups is Python-2-only; under
# Python 3 these are str and have no .decode — confirm target interpreter.
2799 uploader = mobj.group(1).decode('utf-8')
2800 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2802 # construct API request
2803 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2804 # retrieve .json file with links to files
2805 request = compat_urllib_request.Request(file_url)
2807 self.report_download_json(file_url)
2808 jsonData = compat_urllib_request.urlopen(request).read()
2809 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2810 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2813 json_data = json.loads(jsonData)
2814 player_url = json_data['player_swf_url']
2815 formats = dict(json_data['audio_formats'])
2817 req_format = self._downloader.params.get('format', None)
2820 if self._downloader.params.get('listformats', None):
2821 self._print_formats(formats)
# Format selection: try every format for 'best', otherwise honour --format.
2824 if req_format is None or req_format == 'best':
2825 for format_param in formats.keys():
2826 url_list = self.get_urls(formats, format_param)
2828 file_url = self.check_urls(url_list)
2829 if file_url is not None:
2832 if req_format not in formats:
2833 raise ExtractorError(u'Format is not available')
2835 url_list = self.get_urls(formats, req_format)
2836 file_url = self.check_urls(url_list)
2837 format_param = req_format
# (elided: return statement / list opening around this info dict)
2840 'id': file_id.decode('utf-8'),
2841 'url': file_url.decode('utf-8'),
2842 'uploader': uploader.decode('utf-8'),
2843 'upload_date': None,
2844 'title': json_data['name'],
2845 'ext': file_url.split('.')[-1].decode('utf-8'),
2846 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2847 'thumbnail': json_data['thumbnail_url'],
2848 'description': json_data['description'],
2849 'player_url': player_url.decode('utf-8'),
# NOTE(review): fused line numbers and some elided source lines (numbering
# jumps); code kept byte-identical, comments only added.
2852 class StanfordOpenClassroomIE(InfoExtractor):
2853 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page (course
# only), and the site root — handled by the three branches below.
2855 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2856 IE_NAME = u'stanfordoc'
2858 def _real_extract(self, url):
2859 mobj = re.match(self._VALID_URL, url)
2861 raise ExtractorError(u'Invalid URL: %s' % url)
# Branch 1: a single video — download its metadata XML and resolve the file.
2863 if mobj.group('course') and mobj.group('video'): # A specific video
2864 course = mobj.group('course')
2865 video = mobj.group('video')
2867 'id': course + '_' + video,
2869 'upload_date': None,
2872 self.report_extraction(info['id'])
2873 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2874 xmlUrl = baseUrl + video + '.xml'
2876 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2877 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2878 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2879 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2881 info['title'] = mdoc.findall('./title')[0].text
2882 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2884 raise ExtractorError(u'Invalid metadata XML file')
2885 info['ext'] = info['url'].rpartition('.')[2]
# Branch 2: a course page — scrape the VideoPage links and recurse per video.
2887 elif mobj.group('course'): # A course page
2888 course = mobj.group('course')
2893 'upload_date': None,
2896 coursepage = self._download_webpage(url, info['id'],
2897 note='Downloading course info page',
2898 errnote='Unable to download course info page')
2900 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2902 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2903 coursepage, u'description', fatal=False)
# orderedSet de-duplicates while keeping first-seen link order.
2905 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2908 'type': 'reference',
2909 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursive extraction of every referenced video page.
2913 for entry in info['list']:
2914 assert entry['type'] == 'reference'
2915 results += self.extract(entry['url'])
# Branch 3: site root — enumerate every CoursePage link and recurse per course.
2919 'id': 'Stanford OpenClassroom',
2922 'upload_date': None,
2925 self.report_download_webpage(info['id'])
2926 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2928 rootpage = compat_urllib_request.urlopen(rootURL).read()
2929 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2930 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2932 info['title'] = info['id']
2934 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2937 'type': 'reference',
2938 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2943 for entry in info['list']:
2944 assert entry['type'] == 'reference'
2945 results += self.extract(entry['url'])
# NOTE(review): fused line numbers and some elided source lines (numbering
# jumps); code kept byte-identical, comments only added.
2948 class MTVIE(InfoExtractor):
2949 """Information extractor for MTV.com"""
2951 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2954 def _real_extract(self, url):
2955 mobj = re.match(self._VALID_URL, url)
2957 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme-less URLs are accepted by the regex; normalize before fetching.
2958 if not mobj.group('proto'):
2959 url = 'http://' + url
2960 video_id = mobj.group('videoid')
2962 webpage = self._download_webpage(url, video_id)
# Page metadata: song name, title, the mtvn URI and playlist/content id.
2964 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2965 webpage, u'song name', fatal=False)
2967 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2970 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2971 webpage, u'mtvn_uri', fatal=False)
2973 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2974 webpage, u'content id', fatal=False)
# mediaGen endpoint returns the rendition XML for this video.
2976 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2977 self.report_extraction(video_id)
2978 request = compat_urllib_request.Request(videogen_url)
2980 metadataXml = compat_urllib_request.urlopen(request).read()
2981 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2982 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2984 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2985 renditions = mdoc.findall('.//rendition')
2987 # For now, always pick the highest quality.
2988 rendition = renditions[-1]
# Format string combines ext, dimensions and bitrate, e.g. 'mp4-640x360_800'.
2991 _,_,ext = rendition.attrib['type'].partition('/')
2992 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2993 video_url = rendition.find('./src').text
2995 raise ExtractorError('Invalid rendition field.')
# (elided: info dict opening; 'performer' presumably derived from song_name —
# TODO confirm against full source)
3000 'uploader': performer,
3001 'upload_date': None,
3002 'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 downloads)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id (millisecond timestamp + two random numbers)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the source alphabet using Youku's LCG keyed by *seed*.

        Returns the shuffled character list used to decode file ids.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear congruential step; constants come from Youku's player.
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated '*'-separated *fileId* using the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Return one info dict per video segment for a Youku URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's requested format onto Youku's stream ids.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com."""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Player parameters are embedded in the page as query-string fragments.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail; returns a one-element list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        # The flv URL is percent-encoded inside the player parameters.
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract the highest-resolution video from a Google+ post."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title: get the first line of the description meta tag.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the last of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError:  # Python 3: str has no .decode
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages (CDN mp4 URLs)."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the Turner CDN mp4 URL from the page path; returns a one-element list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived from the page path, not scraped from the page.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and convert each clip with a usable URL to an info dict."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO 'YYYY-MM-DD...'; strip dashes for YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: whole channel (paged), single broadcast, or chapter."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #   youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the mp4 source, title and description; returns a one-element list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # Second <source> inside the <video> tag carries the mp4 URL.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with (?x)-style whitespace, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of all trailers found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Age-gated games redirect to a birth-date form; bypass with a canned date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # URLs, titles and thumbnails appear in the same order on the page.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN flv URL from the video id and scrape title/uploader/thumbnail."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The flv is addressable directly on the CDN by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        # NOTE(review): returns a bare dict rather than a list; the elided
        # original appears to do the same — verify against FileDownloader.
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the flash player's file URL and page title; returns a one-element list."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Infer the container from the URL; default to flv.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': ext,
            'thumbnail': thumbnail,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows (JSON embedded in the page)."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Parse the page's `gon.show` JSON blob and return a one-element list."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Force the 256 kbit/s stream.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' equals req_format, else None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract all download formats and apply the user's format selection."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Fifth path component is '<size>_<bitrate>_<id>'.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are listed best-first on the page.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract flv URL and upload date; title comes from the URL itself."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page referenced by the main page and extract the flv URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The numeric embed id supersedes the slug from the original URL.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the player setup call.
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one info dict per track)."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the playback API track by track until at_last_track is set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # Each playback session needs a random session id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build CDN video/thumbnail URLs from the id and scrape title/uploader."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Both the video and thumbnail are addressable on the CDN by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL = r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL type."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
            return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Delegate each talk to this same IE via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        info = {
            # Last htmlStream is assumed to be the best quality.
            'id': video_id,
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata served as XML)."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Resolve the video id from the URL path and read all fields from the metadata XML."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = 'mp4'
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos (stream list served as XML)."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Fetch the per-video XML and pick the last (assumed best) stream type."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element of the document is assumed to be the best type.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the player's file URL plus OpenGraph title/description."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site branding from the og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de / mediathek.daserste.de."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Pick the highest-quality default-type stream; RTMP and HTTP are both possible."""
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # A numeric documentId query parameter takes precedence over the path slug.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        # Use a distinct loop variable so the outer match object 'm' is not clobbered.
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek."""

    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Resolve a Mediathek page to an mms:// (or rtsp://) stream URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        stream_ = None
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The matched href points at a metafile that contains the real URL.
        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer mms://; fall back to rtsp:// if no mms link is present.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
        if mobj is None:
            raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the container extension from the stream URL's suffix.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr-hosted videos."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video file URL from a Tumblr post page."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded as an escaped string, hence the
        # literal \x22 sequences standing in for double quotes.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for freely downloadable Bandcamp tracks."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve a track page to its mp3-320 download URL.

        Raises ExtractorError when the track offers no free download.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # renamed from `id` to avoid shadowing the builtin
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }
        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the mp4 source URL and title from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The page embeds a plain HTML5 <source> tag with the file URL.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract the mp4 URL from the video's MRSS feed."""
        mobj = re.match(self._VALID_URL, url)
        # Guard kept consistent with the other extractors in this module.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The player's MRSS feed carries the real media URL and title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract the mobile mp4 URL and Open Graph metadata."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Normalize to the canonical page URL before downloading.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The meta tags use either single or double quotes, hence the
        # two alternative capture groups.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract the stream URL and metadata from a Vine page."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Normalize to the canonical https page URL.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # Vine exposes the raw stream via a Twitter player meta tag.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # Strip any query string from the thumbnail URL.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr photo page to its video stream URL.

        Flickr requires two intermediate XML requests: the first yields a
        node id, the second yields the streaming app/path pair.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Per-photo secret needed to authorize the video API calls below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # Final URL is APP + unescaped FULLPATH.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""

    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract the high-quality file URL from the CVP data feed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives in the page markup, not in the URL.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML feed carries the actual media file URLs.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL plus title/date/uploader/thumbnail."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # Without a server the 'file' field is a URL-encoded full URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Resolve a Hype Machine track page to its mp3 URL.

        The site requires the session cookie from the first request when
        fetching the serve/source metadata, so the cookie is forwarded
        manually.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Append a timestamped query so the page is served fresh.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    'mp3',
            'title':  title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Follow the JS redirect, then query the info endpoint for the
        final flv URL and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via JavaScript; extract the target path.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is a two-field query string: <k>=<media url>&<k>=<thumb url>
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com (MTV feed based)."""

    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the best-quality stream URL via the MRSS/mediagen feeds."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid differently from regular videos.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE | re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # fixed: was `raise ExtractError(u'Unable to extrat video url')` — a
        # NameError (no such class) plus a typo in the message
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url': video_url,
                'id': video_id,
                'title': video_title,
                # Videos are actually flv not mp4
                'ext': 'flv',
                'thumbnail': video_thumb,
                'description': video_description,
                }
class StatigramIE(InfoExtractor):
    """Information Extractor for Statigram (Instagram web viewer)."""

    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        """Extract the secure video URL and derive title/uploader."""
        mobj = re.match(self._VALID_URL, url)
        # Guard kept consistent with the other extractors in this module.
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)
        video_url = self._html_search_regex(
            r'<meta property="og:video:secure_url" content="(.+?)">',
            webpage, u'video URL')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)" />',
            webpage, u'thumbnail URL', fatal=False)
        html_title = self._html_search_regex(
            r'<title>(.+?)</title>',
            webpage, u'title')
        # Page titles end with ' | Statigram'; keep only the left part.
        title = html_title.rpartition(u' | Statigram')[0]
        # The uploader handle appears as @name inside the title.
        uploader_id = self._html_search_regex(
            r'@([^ ]+)', title, u'uploader name', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       title,
            'thumbnail':   thumbnail_url,
            'uploader_id': uploader_id
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): this excerpt elides most of the function body — only three
    # of the instantiated extractor entries are visible below, and the list
    # delimiters around them are not shown. The complete function returns
    # instances of every *IE class defined in this module, ordered so that
    # more specific extractors are tried before generic ones. Do not treat
    # the lines below as the full registry.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention; look the
    # class object up in this module's namespace.
    class_name = '%sIE' % ie_name
    return globals()[class_name]