2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this file is a damaged capture. Every line carries its
# original line number fused into the text, indentation is lost, and the
# gaps in the fused numbering show that many original lines are missing.
# Do not edit logic from this copy; restore from a pristine source first.
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
# NOTE(review): original lines 35-37 are missing here -- presumably the
# mandatory 'id' and 'url' fields; confirm against a pristine copy.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
# NOTE(review): original lines 62-63 (continuation of this sentence and
# the closing of the docstring region) are missing from the capture.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): original line 74 is absent from this capture; upstream
    # initializes the lazy-init flag consumed by initialize(). Restored
    # here -- TODO confirm against a pristine copy.
    self._ready = False
    self.set_downloader(downloader)
# NOTE(review): the decorator line (original 77) was lost in the capture;
# restored because the first parameter is named `cls`.
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # _VALID_URL is supplied by each concrete subclass.
    return re.match(cls._VALID_URL, url) is not None
def working(self):
    """Getter method for _WORKING."""
    # NOTE(review): the def line and the return (original lines 83 and 85)
    # were dropped by the capture; restored around the surviving docstring.
    # TODO confirm against a pristine copy.
    return self._WORKING
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # NOTE(review): the def line and the run-once guard (original lines
    # 86-91 minus the two surviving ones) were lost in the capture; upstream
    # guards on self._ready so _real_initialize runs at most once.
    # Restored -- TODO confirm against a pristine copy.
    if not self._ready:
        self._real_initialize()
        self._ready = True
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # NOTE(review): original line 95 (the self.initialize() call) is absent
    # from this capture; restored so auth/setup runs before extraction.
    self.initialize()
    return self._real_extract(url)
# Defect fixed: garbled capture formatting (fused line numbers, lost
# indentation); code itself unchanged.
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # NOTE(review): original line 104 is missing; upstream stubs are a
    # bare pass. Restored.
    pass
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    # NOTE(review): original line 108 is missing; upstream stubs are a
    # bare pass. Restored.
    pass
# NOTE(review): the @property/def lines (original 110-111) were dropped
# by the capture; restored around the surviving return statement.
@property
def IE_NAME(self):
    """Extractor name derived from the class name ("FooIE" -> "Foo")."""
    return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """ Returns the response handle """
    # NOTE(review): the 'if note is None:', 'try:' and 'if errnote is None:'
    # lines (original 116, 120, 123) were lost in the capture; restored --
    # they are forced by the surviving elif/except/assignment lines.
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen(u'%s: %s' % (video_id, note))
    try:
        return compat_urllib_request.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is None:
            errnote = u'Unable to download webpage'
        # Chain the original traceback onto the extractor-level error.
        raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
    """ Returns a tuple (page content as string, URL handle) """
    urlh = self._request_webpage(url_or_request, video_id, note, errnote)
    content_type = urlh.headers.get('Content-Type', '')
    # Pick up an explicit charset from the Content-Type header, if any.
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    # NOTE(review): the if/else around the charset match (original lines
    # 132, 134-135) was lost in the capture; restored with the upstream
    # utf-8 fallback -- TODO confirm against a pristine copy.
    if m:
        encoding = m.group(1)
    else:
        encoding = 'utf-8'
    webpage_bytes = urlh.read()
    if self._downloader.params.get('dump_intermediate_pages', False):
        # NOTE(review): 'try:' (138) and the AttributeError fallback body
        # (141) were lost; restored -- plain-string URLs have no
        # get_full_url(), so fall back to the string itself.
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen(u'Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    content = webpage_bytes.decode(encoding, 'replace')
    return (content, urlh)
148 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
149 """ Returns the data of the page as a string """
150 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
# Defect fixed: garbled capture formatting only; logic unchanged.
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_extraction(self, id_or_name):
    """Report information extraction."""
    self.to_screen(u'%s: Extracting information' % id_or_name)
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self.to_screen(u'%s: Downloading webpage' % video_id)
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen(u'Confirming age')
#Methods for following #608
#They set the correct value of the '_type' key
def video_result(self, video_info):
    """Returns a video"""
    video_info['_type'] = 'video'
    # NOTE(review): the return (original line 173) was lost in the capture;
    # restored -- callers use the returned dict.
    return video_info
def url_result(self, url, ie=None):
    """Returns a url that points to a page that should be processed"""
    #TODO: ie should be the class used for getting the info
    # NOTE(review): the dict tail and return (original lines 178-180) were
    # lost in the capture; restored with the conventional 'url'/'ie_key'
    # keys -- TODO confirm against a pristine copy.
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    return video_info
def playlist_result(self, entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    # NOTE(review): the 'entries' dict tail, the two 'if' guards and the
    # return (original lines 184-185, 187, 189) were lost in the capture;
    # restored -- the surviving conditional assignments force this shape.
    video_info = {'_type': 'playlist',
                  'entries': entries}
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    return video_info
# NOTE(review): class header region of YoutubeIE. Heavily damaged by the
# capture: the _VALID_URL triple-quote open/close (original lines 195-196,
# 213), one regex alternative (207-208), and nearly all entries of the
# _video_extensions / _video_dimensions dicts (223-227, 229-233, 235-252)
# are missing. Restore from a pristine copy before editing.
192 class YoutubeIE(InfoExtractor):
193 """Information extractor for youtube.com."""
# Verbose (re.VERBOSE) URL pattern; group 2 is the 11-char video ID.
197 (?:https?://)? # http(s):// (optional)
198 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
199 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
200 (?:.*?\#/)? # handle anchor (#/) redirect urls
201 (?: # the various things that can precede the ID:
202 (?:(?:v|embed|e)/) # v/ or embed/ or e/
203 |(?: # or the v= param in all its forms
204 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
205 (?:\?|\#!?) # the params delimiter ? or # or #!
206 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 )? # optional -> youtube.com/xxxx is OK
210 )? # all until now is optional -> you can pass the naked ID
211 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
212 (?(1).+)? # if we found the ID, everything can follow
# Endpoints used by _real_initialize (language, login, age gate).
214 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
215 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
216 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
218 _NETRC_MACHINE = 'youtube'
219 # Listed in order of quality
220 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
221 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map; only one entry survived the capture.
222 _video_extensions = {
228 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "HxW" dimension map; the whole body is missing from the capture.
234 _video_dimensions = {
# NOTE(review): the decorator line (original 252) was lost in the capture;
# restored because the first parameter is named `cls`.
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs also match the video regexp; defer to the playlist IE.
    if YoutubePlaylistIE.suitable(url): return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_lang(self):
    """Report attempt to set language."""
    self.to_screen(u'Setting language')
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_login(self):
    """Report attempt to log in."""
    self.to_screen(u'Logging in')
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self.to_screen(u'%s: Downloading video webpage' % video_id)
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self.to_screen(u'%s: Downloading video info webpage' % video_id)
# Defects fixed: garbled capture formatting; copy-pasted docstring said
# "download video info webpage" but the message checks subtitles.
def report_video_subtitles_download(self, video_id):
    """Report the check for available subtitles."""
    self.to_screen(u'%s: Checking available subtitles' % video_id)
# Defects fixed: garbled capture formatting; copy-pasted docstring said
# "download video info webpage" but this reports a subtitle request.
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report attempt to download subtitles for a language/format."""
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report available subtitles."""
    sub_lang = ",".join(list(sub_lang_list.keys()))
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self.to_screen(u'%s: Extracting video information' % video_id)
# Defects fixed: garbled capture formatting; copy-pasted docstring said
# "Report extracted video URL" but this reports an unavailable format.
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available."""
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
def _get_available_subtitles(self, video_id):
    """Return a {lang_code: lang_name} dict, or an (error_message, None)
    tuple on failure."""
    self.report_video_subtitles_download(video_id)
    request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
    # NOTE(review): the 'try:' (original 302) and the final
    # 'return sub_lang_list' (310-311) were lost in the capture; restored --
    # forced by the except clause and callers that index the dict.
    try:
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None)
    sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
    if not sub_lang_list:
        return (u'video doesn\'t have subtitles', None)
    return sub_lang_list
# Defect fixed: garbled capture formatting only; logic unchanged.
def _list_available_subtitles(self, video_id):
    """List the available subtitle languages on screen."""
    sub_lang_list = self._get_available_subtitles(video_id)
    # NOTE(review): if _get_available_subtitles returned an error *tuple*,
    # report_video_subtitles_available will fail on .keys() -- pre-existing
    # upstream behavior, left unchanged.
    self.report_video_subtitles_available(video_id, sub_lang_list)
def _request_subtitle(self, sub_lang, sub_name, video_id, format):
    """
    Return a tuple:
    (error_message, sub_lang, sub)
    """
    self.report_video_subtitles_request(video_id, sub_lang, format)
    # NOTE(review): the urlencode dict body (original 323-327), the 'try:'
    # (329) and the 'if not sub:' guard (333) were lost in the capture;
    # reconstructed from the surviving lines and the timedtext API's
    # conventional parameters -- TODO confirm against a pristine copy.
    params = compat_urllib_parse.urlencode({
        'lang': sub_lang,
        'name': sub_name,
        'v': video_id,
        'fmt': format,
    })
    url = 'http://www.youtube.com/api/timedtext?' + params
    try:
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
    if not sub:
        return (u'Did not fetch video subtitles', None, None)
    return (None, sub_lang, sub)
def _extract_subtitle(self, video_id):
    """
    Return a list with a tuple:
    [(error_message, sub_lang, sub)]
    """
    sub_lang_list = self._get_available_subtitles(video_id)
    sub_format = self._downloader.params.get('subtitlesformat')
    if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
        return [(sub_lang_list[0], None, None)]
    # Pick the language: explicit option > English > first available.
    if self._downloader.params.get('subtitleslang', False):
        sub_lang = self._downloader.params.get('subtitleslang')
    elif 'en' in sub_lang_list:
        # NOTE(review): this branch body and the 'else:' (original 349-350)
        # were lost in the capture; restored -- forced by the surrounding
        # elif/fallback structure.
        sub_lang = 'en'
    else:
        sub_lang = list(sub_lang_list.keys())[0]
    if not sub_lang in sub_lang_list:
        return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
    subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    # NOTE(review): the return (original 356) was lost; restored to match
    # the documented single-element-list contract.
    return [subtitle]
def _extract_all_subtitles(self, video_id):
    """Return a list of (error_message, sub_lang, sub) tuples, one per
    available subtitle language."""
    sub_lang_list = self._get_available_subtitles(video_id)
    sub_format = self._downloader.params.get('subtitlesformat')
    if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
        return [(sub_lang_list[0], None, None)]
    # NOTE(review): the accumulator init and final return (original 363,
    # 367-368) were lost in the capture; restored -- forced by the
    # surviving subtitles.append(...) line.
    subtitles = []
    for sub_lang in sub_lang_list:
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        subtitles.append(subtitle)
    return subtitles
def _print_formats(self, formats):
    """Print each itag with its extension and dimensions."""
    print('Available formats:')
    # NOTE(review): the for-loop header (original 371) was lost in the
    # capture; restored from the loop variable x used below.
    for x in formats:
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# NOTE(review): YoutubeIE._real_initialize -- sets interface language,
# optionally logs in (explicit credentials or .netrc), then confirms age.
# Heavily damaged capture: try/except scaffolding, several guards, and
# most of the login form dict (original lines 376-379, 387, 389-392,
# 396-398, 400-401, 405-410, 412, 416-419, 421, 423, 425-429, 431-433,
# 435, 439-443, 445, 447-449, 451, 455-456, 460, 463-476 partially) are
# missing. Do not edit logic from this copy.
374 def _real_initialize(self):
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
# .netrc lookup; the surrounding try and the credential unpacking are lost.
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set language -- failures only warn, they are not fatal.
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
# Fetch the login page and scrape the GALX / dsh hidden form tokens.
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Login form -- most keys (Email, Passwd, GALX, dsh, ...) are missing
# from this capture; only a handful of constant fields survived.
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
# A response still containing the login form means the login failed.
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm age -- unlike the steps above, a failure here is reported as an
# error, not a warning.
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
475 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
def _extract_id(self, url):
    """Extract the video ID (regexp group 2) from a watch URL."""
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    # NOTE(review): the 'if mobj is None:' guard and the final return
    # (original 480, 482, 484-485) were lost in the capture; restored --
    # forced by the surviving report_error / group(2) lines.
    if mobj is None:
        self._downloader.report_error(u'invalid URL: %s' % url)
        return
    video_id = mobj.group(2)
    return video_id
# NOTE(review): YoutubeIE._real_extract -- resolves next_url redirects,
# downloads the watch page and get_video_info, extracts metadata and the
# per-itag URL map, then builds one info dict per selected format.
# Damaged capture: guards, try/except lines, else-branches and the head of
# the results list/dict (original 489, 492-493, 497, 501-502, 504, 507,
# 509-512, 518, 522, 526, 528-529, 533-534, 537-538, 541, 543-544, 547,
# 549, 551-552, 555, 557-558, 561, 564-566, 568, 571-572, 576, 578, 580,
# 582-583, 585, 588, 590, 592, 597, 599, 602-603, 606-607, 609-610, 612,
# 615, 624, 629, 636, 643, 649, 651, 654, 656-657, 659, 661, 664-666,
# 679-682) are missing. Do not edit logic from this copy.
486 def _real_extract(self, url):
487 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
488 mobj = re.search(self._NEXT_URL_RE, url)
490 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
491 video_id = self._extract_id(url)
# Get video webpage (the surrounding try is lost in the capture).
494 self.report_video_webpage_download(video_id)
495 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
496 request = compat_urllib_request.Request(url)
498 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
499 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
500 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
503 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
505 # Attempt to extract SWF player URL
506 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
508 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try the get_video_info endpoint with several 'el' variants until one
# response carries a 'token'.
513 self.report_video_info_webpage_download(video_id)
514 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
515 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
516 % (video_id, el_type))
517 video_info_webpage = self._download_webpage(video_info_url, video_id,
519 errnote='unable to download video info webpage')
520 video_info = compat_parse_qs(video_info_webpage)
521 if 'token' in video_info:
523 if 'token' not in video_info:
524 if 'reason' in video_info:
525 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
527 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
530 # Check for "rental" videos
531 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
532 self._downloader.report_error(u'"rental" videos not supported')
535 # Start extracting information
536 self.report_information_extraction(video_id)
# uploader
539 if 'author' not in video_info:
540 self._downloader.report_error(u'unable to extract uploader name')
542 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id (scraped from the webpage, warning only on failure)
545 video_uploader_id = None
546 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
548 video_uploader_id = mobj.group(1)
550 self._downloader.report_warning(u'unable to extract uploader nickname')
# title
553 if 'title' not in video_info:
554 self._downloader.report_error(u'unable to extract video title')
556 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail
559 if 'thumbnail_url' not in video_info:
560 self._downloader.report_warning(u'unable to extract video thumbnail')
562 else: # don't panic if we can't find it
563 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date
567 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
569 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
570 upload_date = unified_strdate(upload_date)
# description: page element first, then the <meta> tag, else empty
573 video_description = get_element_by_id("eow-description", video_webpage)
574 if video_description:
575 video_description = clean_html(video_description)
577 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
579 video_description = unescapeHTML(fd_mobj.group(1))
581 video_description = u''
# subtitles, honoring writesubtitles / allsubtitles / listsubtitles
584 video_subtitles = None
586 if self._downloader.params.get('writesubtitles', False):
587 video_subtitles = self._extract_subtitle(video_id)
589 (sub_error, sub_lang, sub) = video_subtitles[0]
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('allsubtitles', False):
594 video_subtitles = self._extract_all_subtitles(video_id)
595 for video_subtitle in video_subtitles:
596 (sub_error, sub_lang, sub) = video_subtitle
598 self._downloader.report_error(sub_error)
600 if self._downloader.params.get('listsubtitles', False):
601 sub_lang_list = self._list_available_subtitles(video_id)
# duration
604 if 'length_seconds' not in video_info:
605 self._downloader.report_warning(u'unable to extract video duration')
608 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token
611 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
613 # Decide which formats to download
614 req_format = self._downloader.params.get('format', None)
616 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
617 self.report_rtmp_download()
618 video_url_list = [(None, video_info['conn'][0])]
619 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
620 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
621 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
622 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
623 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
625 format_limit = self._downloader.params.get('format_limit', None)
626 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
627 if format_limit is not None and format_limit in available_formats:
628 format_list = available_formats[available_formats.index(format_limit):]
630 format_list = available_formats
631 existing_formats = [x for x in format_list if x in url_map]
632 if len(existing_formats) == 0:
633 raise ExtractorError(u'no known formats available for video')
634 if self._downloader.params.get('listformats', None):
635 self._print_formats(existing_formats)
637 if req_format is None or req_format == 'best':
638 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
639 elif req_format == 'worst':
640 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
641 elif req_format in ('-1', 'all'):
642 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
644 # Specific formats. We pick the first in a slash-delimeted sequence.
645 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
646 req_formats = req_format.split('/')
647 video_url_list = None
648 for rf in req_formats:
650 video_url_list = [(rf, url_map[rf])]
652 if video_url_list is None:
653 raise ExtractorError(u'requested format not available')
655 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dict per selected (format, url) pair; the results-list
# head and dict-literal open (original 656-657, 664-666) are missing.
658 for format_param, video_real_url in video_url_list:
660 video_extension = self._video_extensions.get(format_param, 'flv')
662 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
663 self._video_dimensions.get(format_param, '???'))
667 'url': video_real_url,
668 'uploader': video_uploader,
669 'uploader_id': video_uploader_id,
670 'upload_date': upload_date,
671 'title': video_title,
672 'ext': video_extension,
673 'format': video_format,
674 'thumbnail': video_thumbnail,
675 'description': video_description,
676 'player_url': player_url,
677 'subtitles': video_subtitles,
678 'duration': video_duration
# Defect fixed: garbled capture formatting (fused line numbers, lost
# indentation); declarations unchanged.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page fetched once during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint used to POST the age confirmation form.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
# Defect fixed: garbled capture formatting only; logic unchanged.
def report_disclaimer(self):
    """Report disclaimer retrieval."""
    self.to_screen(u'Retrieving disclaimer')
def _real_initialize(self):
    """Fetch the family-filter disclaimer, then POST the age confirmation."""
    # Retrieve disclaimer
    request = compat_urllib_request.Request(self._DISCLAIMER)
    # NOTE(review): both 'try:' lines, the early returns on error, and the
    # disclaimer_form head (original 698, 703-707, 709, 711, 716-717) were
    # lost in the capture; reconstructed -- the 'filters' field value is an
    # assumption, TODO confirm against a pristine copy.
    try:
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
        return

    # Confirm age
    disclaimer_form = {
        'filters': '0',
        'submit': "Continue - I'm over 18",
    }
    request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
    try:
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
        return
# NOTE(review): MetacafeIE._real_extract -- extracts the media URL either
# from the raw page (&mediaURL=... plus optional gdaKey) or from the
# flashvars mediaData JSON, then title and uploader. Damaged capture:
# guards, else-branches, early returns and the result-dict head (original
# 721, 723-724, 726, 731, 734, 738, 741, 744-746, 749, 751, 753, 757,
# 759, 761, 765, 767, 769, 771, 773, 775, 777-778, 782, 785-786) are
# missing. Do not edit logic from this copy.
718 def _real_extract(self, url):
719 # Extract id and simplified title from URL
720 mobj = re.match(self._VALID_URL, url)
722 self._downloader.report_error(u'invalid URL: %s' % url)
725 video_id = mobj.group(1)
727 # Check if video comes from YouTube
728 mobj2 = re.match(r'^yt-(.*)$', video_id)
729 if mobj2 is not None:
730 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
732 # Retrieve video webpage to extract further information
733 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
735 # Extract URL, uploader and title from webpage
736 self.report_extraction(video_id)
737 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
739 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
740 video_extension = mediaURL[-3:]
742 # Extract gdaKey if available
743 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
747 gdaKey = mobj.group(1)
748 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars form value for mediaData.
750 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
752 self._downloader.report_error(u'unable to extract media URL')
754 vardict = compat_parse_qs(mobj.group(1))
755 if 'mediaData' not in vardict:
756 self._downloader.report_error(u'unable to extract media URL')
758 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
760 self._downloader.report_error(u'unable to extract media URL')
762 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
763 video_extension = mediaURL[-3:]
764 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
# title and uploader, then the (truncated) result dict.
766 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
768 self._downloader.report_error(u'unable to extract title')
770 video_title = mobj.group(1).decode('utf-8')
772 mobj = re.search(r'submitter=(.*?);', webpage)
774 self._downloader.report_error(u'unable to extract uploader nickname')
776 video_uploader = mobj.group(1)
779 'id': video_id.decode('utf-8'),
780 'url': video_url.decode('utf-8'),
781 'uploader': video_uploader.decode('utf-8'),
783 'title': video_title,
784 'ext': video_extension.decode('utf-8'),
# Defect fixed: garbled capture formatting (fused line numbers, lost
# indentation); declarations unchanged.
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
# NOTE(review): DailymotionIE._real_extract -- scrapes flashvars for the
# best-quality stream URL, then title, uploader and upload date. Damaged
# capture: guards, returns, the quality-selection loop body and the
# result-dict head (original 796, 798-799, 801, 803, 808, 812, 814, 816,
# 818-819, 821-822, 824-825, 827, 829-830, 832, 834, 836, 838, 840, 843,
# 848, 850, 852, 855, 857-860, 865-866) are missing. Do not edit logic
# from this copy.
793 def _real_extract(self, url):
794 # Extract id and simplified title from URL
795 mobj = re.match(self._VALID_URL, url)
797 self._downloader.report_error(u'invalid URL: %s' % url)
800 video_id = mobj.group(1).split('_')[0].split('?')[0]
802 video_extension = 'mp4'
804 # Retrieve video webpage to extract further information
805 request = compat_urllib_request.Request(url)
806 request.add_header('Cookie', 'family_filter=off')
807 webpage = self._download_webpage(request, video_id)
809 # Extract URL, uploader and title from webpage
810 self.report_extraction(video_id)
811 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
813 self._downloader.report_error(u'unable to extract media URL')
815 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Pick the first quality key present in flashvars (best first); the loop
# body that sets max_quality is missing from the capture.
817 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
820 self.to_screen(u'Using %s' % key)
823 self._downloader.report_error(u'unable to extract video URL')
826 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
828 self._downloader.report_error(u'unable to extract video URL')
831 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
833 # TODO: support choosing qualities
835 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
837 self._downloader.report_error(u'unable to extract title')
839 video_title = unescapeHTML(mobj.group('title'))
# uploader: owner span first, then the rel="author" span as fallback.
841 video_uploader = None
842 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
844 # lookin for official user
845 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
846 if mobj_official is None:
847 self._downloader.report_warning(u'unable to extract uploader nickname')
849 video_uploader = mobj_official.group(1)
851 video_uploader = mobj.group(1)
# upload date (DD-MM-YYYY on the page, stored as YYYYMMDD).
853 video_upload_date = None
854 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
856 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
861 'uploader': video_uploader,
862 'upload_date': video_upload_date,
863 'title': video_title,
864 'ext': video_extension,
# Defect fixed: garbled capture formatting (fused line numbers, lost
# indentation); declarations unchanged.
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 is the .flv filename from the ?current= query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
# NOTE(review): PhotobucketIE._real_extract -- downloads the page, pulls
# the media URL from the video_src link tag, and title/uploader from the
# <title> tag. Damaged capture: guards, returns, the video_url assignment
# and the result-dict head (original 877, 879-880, 882, 884, 887, 892-893,
# 897, 899, 901-903, 905, 907, 909, 911-912, 916, 919-920) are missing.
# Do not edit logic from this copy.
874 def _real_extract(self, url):
875 # Extract id from URL
876 mobj = re.match(self._VALID_URL, url)
878 self._downloader.report_error(u'Invalid URL: %s' % url)
881 video_id = mobj.group(1)
883 video_extension = 'flv'
885 # Retrieve video webpage to extract further information
886 request = compat_urllib_request.Request(url)
888 self.report_download_webpage(video_id)
889 webpage = compat_urllib_request.urlopen(request).read()
890 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
891 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
894 # Extract URL, uploader, and title from webpage
895 self.report_extraction(video_id)
896 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
898 self._downloader.report_error(u'unable to extract media URL')
900 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# The video_url assignment (original 901-902) is missing here.
904 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
906 self._downloader.report_error(u'unable to extract title')
908 video_title = mobj.group(1).decode('utf-8')
910 video_uploader = mobj.group(2).decode('utf-8')
913 'id': video_id.decode('utf-8'),
914 'url': video_url.decode('utf-8'),
915 'uploader': video_uploader,
917 'title': video_title,
918 'ext': video_extension.decode('utf-8'),
# Defect fixed: garbled capture formatting (fused line numbers, lost
# indentation); declarations unchanged.
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
# YahooIE._real_extract: resolve a video.yahoo.com URL to a single info dict.
# Non-/watch/ URLs are first rewritten to a canonical English /watch/ URL and
# re-extracted with new_video=False (guards against infinite recursion).
# NOTE(review): original line numbers are non-contiguous here — `try:`,
# `if mobj is None:` guards and `return` lines are elided from this view;
# visible tokens are kept verbatim.
932 def _real_extract(self, url, new_video=True):
933 # Extract ID from URL
934 mobj = re.match(self._VALID_URL, url)
936 self._downloader.report_error(u'Invalid URL: %s' % url)
939 video_id = mobj.group(2)
940 video_extension = 'flv'
942 # Rewrite valid but non-extractable URLs as
943 # extractable English language /watch/ URLs
944 if re.match(self._VPAGE_URL, url) is None:
945 request = compat_urllib_request.Request(url)
947 webpage = compat_urllib_request.urlopen(request).read()
948 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
949 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
# Scrape the numeric id/vid pair needed to build the canonical /watch/ URL.
952 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
954 self._downloader.report_error(u'Unable to extract id field')
956 yahoo_id = mobj.group(1)
958 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
960 self._downloader.report_error(u'Unable to extract vid field')
962 yahoo_vid = mobj.group(1)
964 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
965 return self._real_extract(url, new_video=False)
967 # Retrieve video webpage to extract further information
968 request = compat_urllib_request.Request(url)
970 self.report_download_webpage(video_id)
971 webpage = compat_urllib_request.urlopen(request).read()
972 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
973 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
976 # Extract uploader and title from webpage
977 self.report_extraction(video_id)
978 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
980 self._downloader.report_error(u'unable to extract video title')
982 video_title = mobj.group(1).decode('utf-8')
984 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
986 self._downloader.report_error(u'unable to extract video uploader')
# NOTE(review): group(1) of the regex above is the literal alternation
# (people|profile); the uploader name is in group(2). This looks like a bug —
# confirm against a live page before changing.
988 video_uploader = mobj.group(1).decode('utf-8')
990 # Extract video thumbnail
991 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
993 self._downloader.report_error(u'unable to extract video thumbnail')
995 video_thumbnail = mobj.group(1).decode('utf-8')
997 # Extract video description
998 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1000 self._downloader.report_error(u'unable to extract video description')
1002 video_description = mobj.group(1).decode('utf-8')
1003 if not video_description:
1004 video_description = 'No description available.'
1006 # Extract video height and width
1007 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1009 self._downloader.report_error(u'unable to extract video height')
1011 yv_video_height = mobj.group(1)
1013 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1015 self._downloader.report_error(u'unable to extract video width')
1017 yv_video_width = mobj.group(1)
1019 # Retrieve video playlist to extract media URL
1020 # I'm not completely sure what all these options are, but we
1021 # seem to need most of them, otherwise the server sends a 401.
1022 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1023 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1024 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1025 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1026 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1028 self.report_download_webpage(video_id)
1029 webpage = compat_urllib_request.urlopen(request).read()
1030 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1031 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1034 # Extract media URL from playlist XML
1035 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1037 self._downloader.report_error(u'Unable to extract media URL')
1039 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1040 video_url = unescapeHTML(video_url)
# Result dict follows the InfoExtractor contract documented on the base class.
1043 'id': video_id.decode('utf-8'),
1045 'uploader': video_uploader,
1046 'upload_date': None,
1047 'title': video_title,
1048 'ext': video_extension.decode('utf-8'),
1049 'thumbnail': video_thumbnail.decode('utf-8'),
1050 'description': video_description,
# VimeoIE: extracts metadata and a play_redirect download URL from vimeo.com
# pages by parsing the inline `{config: ...}` JSON blob embedded in the page.
# NOTE(review): this view is sparse (original line numbers skip) — try/except
# headers and the final `return` around the info dict are elided.
1054 class VimeoIE(InfoExtractor):
1055 """Information extractor for vimeo.com."""
1057 # _VALID_URL matches Vimeo URLs
1058 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1061 def _real_extract(self, url, new_video=True):
1062 # Extract ID from URL
1063 mobj = re.match(self._VALID_URL, url)
1065 self._downloader.report_error(u'Invalid URL: %s' % url)
1068 video_id = mobj.group('id')
# Normalize scheme-less and direct-link (play_redirect_hls) URLs to a
# canonical https://vimeo.com/<id> page URL before downloading.
1069 if not mobj.group('proto'):
1070 url = 'https://' + url
1071 if mobj.group('direct_link'):
1072 url = 'https://vimeo.com/' + video_id
1074 # Retrieve video webpage to extract further information
1075 request = compat_urllib_request.Request(url, None, std_headers)
1076 webpage = self._download_webpage(request, video_id)
1078 # Now we begin extracting as much information as we can from what we
1079 # retrieved. First we extract the information common to all extractors,
1080 # and latter we extract those that are Vimeo specific.
1081 self.report_extraction(video_id)
1083 # Extract the config JSON
# Brittle string-splitting on ' = {config:' / ',assets:' rather than a real
# parser; breaks if Vimeo reorders the page JS.
1085 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1086 config = json.loads(config)
1088 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1089 self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
1091 self._downloader.report_error(u'unable to extract info section')
1095 video_title = config["video"]["title"]
1097 # Extract uploader and uploader_id
1098 video_uploader = config["video"]["owner"]["name"]
1099 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1101 # Extract video thumbnail
1102 video_thumbnail = config["video"]["thumbnail"]
1104 # Extract video description
1105 video_description = get_element_by_attribute("itemprop", "description", webpage)
1106 if video_description: video_description = clean_html(video_description)
1107 else: video_description = u''
1109 # Extract upload date
1110 video_upload_date = None
1111 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1112 if mobj is not None:
# Collapse ISO date parts into the YYYYMMDD form the base class documents.
1113 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1115 # Vimeo specific: extract request signature and timestamp
1116 sig = config['request']['signature']
1117 timestamp = config['request']['timestamp']
1119 # Vimeo specific: extract video codec and quality information
1120 # First consider quality, then codecs, then take everything
1121 # TODO bind to format param
1122 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1123 files = { 'hd': [], 'sd': [], 'other': []}
1124 for codec_name, codec_extension in codecs:
1125 if codec_name in config["video"]["files"]:
1126 if 'hd' in config["video"]["files"][codec_name]:
1127 files['hd'].append((codec_name, codec_extension, 'hd'))
1128 elif 'sd' in config["video"]["files"][codec_name]:
1129 files['sd'].append((codec_name, codec_extension, 'sd'))
1131 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first available (codec, ext, quality) triple in hd > sd > other order.
1133 for quality in ('hd', 'sd', 'other'):
1134 if len(files[quality]) > 0:
1135 video_quality = files[quality][0][2]
1136 video_codec = files[quality][0][0]
1137 video_extension = files[quality][0][1]
1138 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1141 self._downloader.report_error(u'no known codec found')
1144 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1145 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1150 'uploader': video_uploader,
1151 'uploader_id': video_uploader_id,
1152 'upload_date': video_upload_date,
1153 'title': video_title,
1154 'ext': video_extension,
1155 'thumbnail': video_thumbnail,
1156 'description': video_description,
# ArteTvIE: extractor for videos.arte.tv (fr/de). Distinguishes live streams
# (URL ending in index-<n>.html) from regular "Plus 7" catch-up videos and
# follows a chain of intermediate pages/XML documents via grep_webpage().
1160 class ArteTvIE(InfoExtractor):
1161 """arte.tv information extractor."""
1163 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1164 _LIVE_URL = r'index-[0-9]+\.html$'
1166 IE_NAME = u'arte.tv'
# Download `url` and return the raw page body; reports (rather than raises)
# network and URL errors through the downloader.
1168 def fetch_webpage(self, url):
1169 request = compat_urllib_request.Request(url)
1171 self.report_download_webpage(url)
1172 webpage = compat_urllib_request.urlopen(request).read()
1173 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1174 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1176 except ValueError as err:
1177 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch `url`, apply `regex` with `regexFlags`, and build an info dict from
# matchTuples: each (group_index, key, error_message) maps regex group i to
# info[key], reporting `err` if that group is missing.
1181 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1182 page = self.fetch_webpage(url)
1183 mobj = re.search(regex, page, regexFlags)
1187 self._downloader.report_error(u'Invalid URL: %s' % url)
1190 for (i, key, err) in matchTuples:
1191 if mobj.group(i) is None:
1192 self._downloader.report_error(err)
1195 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then the geo-gated stream
# descriptor (path/player/url) for the language taken from the URL.
1199 def extractLiveStream(self, url):
1200 video_lang = url.split('/')[-4]
1201 info = self.grep_webpage(
1203 r'src="(.*?/videothek_js.*?\.js)',
1206 (1, 'url', u'Invalid URL: %s' % url)
1209 http_host = url.split('/')[2]
1210 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1211 info = self.grep_webpage(
1213 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1214 '(http://.*?\.swf).*?' +
1218 (1, 'path', u'could not extract video path: %s' % url),
1219 (2, 'player', u'could not extract video player: %s' % url),
1220 (3, 'url', u'could not extract video url: %s' % url)
1223 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Catch-up ("Plus 7") path: movie param -> language-specific <video ref>
# -> final XML with id/name/date and the hd-quality URL.
1225 def extractPlus7Stream(self, url):
1226 video_lang = url.split('/')[-3]
1227 info = self.grep_webpage(
1229 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1232 (1, 'url', u'Invalid URL: %s' % url)
1235 next_url = compat_urllib_parse.unquote(info.get('url'))
1236 info = self.grep_webpage(
1238 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1241 (1, 'url', u'Could not find <video> tag: %s' % url)
1244 next_url = compat_urllib_parse.unquote(info.get('url'))
1246 info = self.grep_webpage(
1248 r'<video id="(.*?)".*?>.*?' +
1249 '<name>(.*?)</name>.*?' +
1250 '<dateVideo>(.*?)</dateVideo>.*?' +
1251 '<url quality="hd">(.*?)</url>',
1254 (1, 'id', u'could not extract video id: %s' % url),
1255 (2, 'title', u'could not extract video title: %s' % url),
1256 (3, 'date', u'could not extract video date: %s' % url),
1257 (4, 'url', u'could not extract video url: %s' % url)
1262 'id': info.get('id'),
1263 'url': compat_urllib_parse.unquote(info.get('url')),
1264 'uploader': u'arte.tv',
1265 'upload_date': info.get('date'),
1266 'title': info.get('title').decode('utf-8'),
# Dispatch on URL shape: live streams vs Plus 7 catch-up videos.
# NOTE(review): extractLiveStream's result is not captured here, while the
# Plus 7 branch assigns `info` — asymmetry worth confirming upstream.
1272 def _real_extract(self, url):
1273 video_id = url.split('/')[-1]
1274 self.report_extraction(video_id)
1276 if re.search(self._LIVE_URL, video_id) is not None:
1277 self.extractLiveStream(url)
1280 info = self.extractPlus7Stream(url)
# GenericIE: last-resort extractor. First resolves URL-shortener style
# redirects via a HEAD request (with a GET fallback on 405), then scrapes the
# page for common embedded-player patterns (JW Player flashvars, file=/source=
# parameters) and derives id/title/uploader heuristically.
1285 class GenericIE(InfoExtractor):
1286 """Generic last-resort information extractor."""
1289 IE_NAME = u'generic'
1291 def report_download_webpage(self, video_id):
1292 """Report webpage download."""
# Warn (outside test mode) that we fell through to the generic extractor.
1293 if not self._downloader.params.get('test', False):
1294 self._downloader.report_warning(u'Falling back on generic information extractor.')
1295 super(GenericIE, self).report_download_webpage(video_id)
1297 def report_following_redirect(self, new_url):
1298 """Report information extraction."""
1299 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1301 def _test_redirect(self, url):
1302 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass that issues HEAD instead of GET.
1303 class HeadRequest(compat_urllib_request.Request):
1304 def get_method(self):
1307 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1309 Subclass the HTTPRedirectHandler to make it use our
1310 HeadRequest also on the redirected URL
1312 def redirect_request(self, req, fp, code, msg, headers, newurl):
1313 if code in (301, 302, 303, 307):
1314 newurl = newurl.replace(' ', '%20')
# Strip body-related headers: a HEAD request carries no payload.
1315 newheaders = dict((k,v) for k,v in req.headers.items()
1316 if k.lower() not in ("content-length", "content-type"))
1317 return HeadRequest(newurl,
1319 origin_req_host=req.get_origin_req_host(),
1322 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1324 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1326 Fallback to GET if HEAD is not allowed (405 HTTP error)
1328 def http_error_405(self, req, fp, code, msg, headers):
1332 newheaders = dict((k,v) for k,v in req.headers.items()
1333 if k.lower() not in ("content-length", "content-type"))
1334 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1336 origin_req_host=req.get_origin_req_host(),
# Hand-assembled opener: deliberately omits cookie/auth handlers; order of
# add_handler calls does not imply handler priority (urllib sorts by type).
1340 opener = compat_urllib_request.OpenerDirector()
1341 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1342 HTTPMethodFallback, HEADRedirectHandler,
1343 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1344 opener.add_handler(handler())
1346 response = opener.open(HeadRequest(url))
1347 new_url = response.geturl()
1352 self.report_following_redirect(new_url)
1355 def _real_extract(self, url):
1356 new_url = self._test_redirect(url)
1357 if new_url: return [self.url_result(new_url)]
1359 video_id = url.split('/')[-1]
1361 webpage = self._download_webpage(url, video_id)
1362 except ValueError as err:
1363 # since this is the last-resort InfoExtractor, if
1364 # this error is thrown, it'll be thrown here
1365 self._downloader.report_error(u'Invalid URL: %s' % url)
1368 self.report_extraction(video_id)
1369 # Start with something easy: JW Player in SWFObject
1370 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1372 # Broaden the search a little bit
1373 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1375 # Broaden the search a little bit: JWPlayer JS loader
1376 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1378 self._downloader.report_error(u'Invalid URL: %s' % url)
1381 # It's possible that one of the regexes
1382 # matched, but returned an empty group:
1383 if mobj.group(1) is None:
1384 self._downloader.report_error(u'Invalid URL: %s' % url)
1387 video_url = compat_urllib_parse.unquote(mobj.group(1))
1388 video_id = os.path.basename(video_url)
1390 # here's a fun little line of code for you:
# Derive extension and id from the media URL's basename.
1391 video_extension = os.path.splitext(video_id)[1][1:]
1392 video_id = os.path.splitext(video_id)[0]
1394 # it's tempting to parse this further, but you would
1395 # have to take into account all the variations like
1396 # Video Title - Site Name
1397 # Site Name | Video Title
1398 # Video Title - Tagline | Site Name
1399 # and so on and so forth; it's just not practical
1400 mobj = re.search(r'<title>(.*)</title>', webpage)
1402 self._downloader.report_error(u'unable to extract title')
1404 video_title = mobj.group(1)
1406 # video uploader is domain name
1407 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): failure here reports "unable to extract title" although it is
# the uploader (domain) extraction that failed — misleading error message.
1409 self._downloader.report_error(u'unable to extract title')
1411 video_uploader = mobj.group(1)
1416 'uploader': video_uploader,
1417 'upload_date': None,
1418 'title': video_title,
1419 'ext': video_extension,
# YoutubeSearchIE: handles `ytsearch[N|all]:<query>` pseudo-URLs by paging the
# GData v2 JSON-C API (50 results per page, hard cap _max_youtube_results).
1423 class YoutubeSearchIE(InfoExtractor):
1424 """Information Extractor for YouTube search queries."""
1425 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1426 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1427 _max_youtube_results = 1000
1428 IE_NAME = u'youtube:search'
1430 def report_download_page(self, query, pagenum):
1431 """Report attempt to download search page with given number."""
# Python-2 era: query is a byte string here and is decoded for display only.
1432 query = query.decode(preferredencoding())
1433 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (empty -> 1 result, 'all' -> max, digits -> that many)
# and delegate to _get_n_results.
1435 def _real_extract(self, query):
1436 mobj = re.match(self._VALID_URL, query)
1438 self._downloader.report_error(u'invalid search query "%s"' % query)
1441 prefix, query = query.split(':')
1443 query = query.encode('utf-8')
1445 return self._get_n_results(query, 1)
1446 elif prefix == 'all':
# NOTE(review): this branch lacks a `return`, unlike the others — the caller
# would receive None for `ytsearchall:`; confirm intent.
1447 self._get_n_results(query, self._max_youtube_results)
1452 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1454 elif n > self._max_youtube_results:
1455 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1456 n = self._max_youtube_results
1457 return self._get_n_results(query, n)
1458 except ValueError: # parsing prefix as integer fails
1459 return self._get_n_results(query, 1)
1461 def _get_n_results(self, query, n):
1462 """Get a specified number of results for a query"""
# Page through the API 50 ids at a time until `limit` (min of n and the
# API-reported total) is reached, then truncate and wrap as url_results.
1468 while (50 * pagenum) < limit:
1469 self.report_download_page(query, pagenum+1)
1470 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1471 request = compat_urllib_request.Request(result_url)
1473 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1475 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1477 api_response = json.loads(data)['data']
1479 if not 'items' in api_response:
1480 self._downloader.report_error(u'[youtube] No video results')
1483 new_ids = list(video['id'] for video in api_response['items'])
1484 video_ids += new_ids
1486 limit = min(n, api_response['totalItems'])
1489 if len(video_ids) > n:
1490 video_ids = video_ids[:n]
1491 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
# GoogleSearchIE: handles `gvsearch[N|all]:<query>` pseudo-URLs by scraping
# Google Video search result pages and returning a playlist-type dict.
1495 class GoogleSearchIE(InfoExtractor):
1496 """Information Extractor for Google Video search queries."""
1497 _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
1498 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1499 _max_google_results = 1000
1500 IE_NAME = u'video.google:search'
# Prefix semantics mirror YoutubeSearchIE: '' -> 1, 'all' -> max, digits -> n
# (clamped to _max_google_results). Note this one raises ExtractorError on a
# bad number instead of calling report_error.
1502 def _real_extract(self, query):
1503 mobj = re.match(self._VALID_URL, query)
1505 prefix = mobj.group('prefix')
1506 query = mobj.group('query')
1508 return self._download_n_results(query, 1)
1509 elif prefix == 'all':
1510 return self._download_n_results(query, self._max_google_results)
1514 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
1515 elif n > self._max_google_results:
1516 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1517 n = self._max_google_results
1518 return self._download_n_results(query, n)
1520 def _download_n_results(self, query, n):
1521 """Downloads a specified number of results for a query"""
1524 '_type': 'playlist',
# Scrape result pages until n results are collected or the "next" paginator
# marker disappears; each <h3 class="r"> link becomes a playlist entry.
1529 for pagenum in itertools.count(1):
1530 result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1531 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1532 note='Downloading result page ' + str(pagenum))
1534 # Extract video identifiers
1535 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1538 'url': mobj.group(1)
1540 res['entries'].append(e)
1542 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
# YahooSearchIE: handles `yvsearch[N|all]:<query>` by scraping Yahoo! Video
# search pages. Unlike the YouTube/Google variants it does not return results;
# it schedules each found watch-URL directly via self._downloader.download().
1545 class YahooSearchIE(InfoExtractor):
1546 """Information Extractor for Yahoo! Video search queries."""
1549 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1550 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1551 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1552 _MORE_PAGES_INDICATOR = r'\s*Next'
1553 _max_yahoo_results = 1000
1554 IE_NAME = u'video.yahoo:search'
1556 def report_download_page(self, query, pagenum):
1557 """Report attempt to download playlist page with given number."""
# Python-2 era: query is a byte string, decoded for display only.
1558 query = query.decode(preferredencoding())
1559 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# Same prefix parsing as the other search IEs ('' -> 1, 'all' -> max,
# digits -> n clamped to _max_yahoo_results).
1561 def _real_extract(self, query):
1562 mobj = re.match(self._VALID_URL, query)
1564 self._downloader.report_error(u'invalid search query "%s"' % query)
1567 prefix, query = query.split(':')
1569 query = query.encode('utf-8')
1571 self._download_n_results(query, 1)
1573 elif prefix == 'all':
1574 self._download_n_results(query, self._max_yahoo_results)
1580 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1582 elif n > self._max_yahoo_results:
1583 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1584 n = self._max_yahoo_results
1585 self._download_n_results(query, n)
1587 except ValueError: # parsing prefix as integer fails
1588 self._download_n_results(query, 1)
1591 def _download_n_results(self, query, n):
1592 """Downloads a specified number of results for a query"""
# Dedupe across pages with `already_seen`; stop early once n unique ids are
# collected or the "Next" paginator marker disappears.
1595 already_seen = set()
1599 self.report_download_page(query, pagenum)
1600 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1601 request = compat_urllib_request.Request(result_url)
1603 page = compat_urllib_request.urlopen(request).read()
1604 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1605 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1608 # Extract video identifiers
1609 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1610 video_id = mobj.group(1)
1611 if video_id not in already_seen:
1612 video_ids.append(video_id)
1613 already_seen.add(video_id)
1614 if len(video_ids) == n:
1615 # Specified n videos reached
1616 for id in video_ids:
1617 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1620 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1621 for id in video_ids:
1622 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1625 pagenum = pagenum + 1
# YoutubePlaylistIE: resolves playlist/course/artist/user-uploads URLs (and
# bare PL/EC/UU ids) via the GData playlists API, collecting (position, url)
# pairs page by page, then emitting a sorted playlist_result.
1628 class YoutubePlaylistIE(InfoExtractor):
1629 """Information Extractor for YouTube playlists."""
# Verbose-mode regex: matched with re.VERBOSE (see suitable()), so whitespace
# in the pattern is insignificant.
1631 _VALID_URL = r"""(?:
1636 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1637 \? (?:.*?&)*? (?:p|a|list)=
1640 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1643 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1645 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1647 IE_NAME = u'youtube:playlist'
1650 def suitable(cls, url):
1651 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
1652 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1654 def _real_extract(self, url):
1655 # Extract playlist id
1656 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1658 self._downloader.report_error(u'invalid url: %s' % url)
1661 # Download playlist videos from API
# Either capture group may have matched, depending on URL form.
1662 playlist_id = mobj.group(1) or mobj.group(2)
1667 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1668 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1671 response = json.loads(page)
1672 except ValueError as err:
1673 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1676 if 'feed' not in response:
1677 self._downloader.report_error(u'Got a malformed response from YouTube API')
1679 playlist_title = response['feed']['title']['$t']
1680 if 'entry' not in response['feed']:
1681 # Number of videos is a multiple of self._MAX_RESULTS
# Keep (position, src) so the final list can be ordered by playlist position
# regardless of page arrival order.
1684 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1685 for entry in response['feed']['entry']
1686 if 'content' in entry ]
1688 if len(response['feed']['entry']) < self._MAX_RESULTS:
1692 videos = [v[1] for v in sorted(videos)]
1694 url_results = [self.url_result(url, 'Youtube') for url in videos]
1695 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# YoutubeChannelIE: lists every video of a channel. First page comes from the
# HTML channel view; subsequent pages from the channel_ajax JSON endpoint,
# paging while the 'yt-uix-load-more' marker is present.
1698 class YoutubeChannelIE(InfoExtractor):
1699 """Information Extractor for YouTube channels."""
1701 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1702 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1703 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1704 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1705 IE_NAME = u'youtube:channel'
# Pull unique watch?v= ids out of a page fragment, preserving first-seen order.
1707 def extract_videos_from_page(self, page):
1709 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1710 if mobj.group(1) not in ids_in_page:
1711 ids_in_page.append(mobj.group(1))
1714 def _real_extract(self, url):
1715 # Extract channel id
1716 mobj = re.match(self._VALID_URL, url)
1718 self._downloader.report_error(u'invalid url: %s' % url)
1721 # Download channel page
1722 channel_id = mobj.group(1)
1726 url = self._TEMPLATE_URL % (channel_id, pagenum)
1727 page = self._download_webpage(url, channel_id,
1728 u'Downloading page #%s' % pagenum)
1730 # Extract video identifiers
1731 ids_in_page = self.extract_videos_from_page(page)
1732 video_ids.extend(ids_in_page)
1734 # Download any subsequent channel pages using the json-based channel_ajax query
1735 if self._MORE_PAGES_INDICATOR in page:
1737 pagenum = pagenum + 1
1739 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1740 page = self._download_webpage(url, channel_id,
1741 u'Downloading page #%s' % pagenum)
# The AJAX endpoint returns JSON; the HTML fragment lives under
# 'content_html' and the pager state under 'load_more_widget_html'.
1743 page = json.loads(page)
1745 ids_in_page = self.extract_videos_from_page(page['content_html'])
1746 video_ids.extend(ids_in_page)
1748 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1751 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1753 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1754 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1755 return [self.playlist_result(url_entries, channel_id)]
# YoutubeUserIE: lists a user's uploads via the GData uploads feed, paging
# _GDATA_PAGE_SIZE ids at a time; a short page signals the last one.
1758 class YoutubeUserIE(InfoExtractor):
1759 """Information Extractor for YouTube users."""
1761 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1762 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1763 _GDATA_PAGE_SIZE = 50
1764 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1765 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1766 IE_NAME = u'youtube:user'
1768 def _real_extract(self, url):
1770 mobj = re.match(self._VALID_URL, url)
1772 self._downloader.report_error(u'invalid url: %s' % url)
1775 username = mobj.group(1)
1777 # Download video ids using YouTube Data API. Result size per
1778 # query is limited (currently to 50 videos) so we need to query
1779 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1786 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1788 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1789 page = self._download_webpage(gdata_url, username,
1790 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1792 # Extract video identifiers
# Dedupe within the page only; the feed itself provides overall ordering.
1795 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1796 if mobj.group(1) not in ids_in_page:
1797 ids_in_page.append(mobj.group(1))
1799 video_ids.extend(ids_in_page)
1801 # A little optimization - if current page is not
1802 # "full", ie. does not contain PAGE_SIZE video ids then
1803 # we can assume that this page is the last one - there
1804 # are no more ids on further pages - no need to query
1807 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1812 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1813 url_results = [self.url_result(url, 'Youtube') for url in urls]
1814 return [self.playlist_result(url_results, playlist_title = username)]
# BlipTVUserIE: lists a blip.tv user's videos. Scrapes the user page for the
# numeric data-users-id, then pages the mobile full-episode-list endpoint.
1817 class BlipTVUserIE(InfoExtractor):
1818 """Information Extractor for blip.tv users."""
1820 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1822 IE_NAME = u'blip.tv:user'
1824 def _real_extract(self, url):
1826 mobj = re.match(self._VALID_URL, url)
1828 self._downloader.report_error(u'invalid url: %s' % url)
1831 username = mobj.group(1)
1833 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1835 page = self._download_webpage(url, username, u'Downloading user page')
1836 mobj = re.search(r'data-users-id="([^"]+)"', page)
# NOTE(review): no visible None-check on this re.search — a page without
# data-users-id would raise AttributeError on the next line; confirm whether
# an elided guard exists in the full file.
1837 page_base = page_base % mobj.group(1)
1840 # Download video ids using BlipTV Ajax calls. Result size per
1841 # query is limited (currently to 12 videos) so we need to query
1842 # page by page until there are no video ids - it means we got
1849 url = page_base + "&page=" + str(pagenum)
1850 page = self._download_webpage(url, username,
1851 u'Downloading video ids from page %d' % pagenum)
1853 # Extract video identifiers
# Dedupe test uses the raw group but appends the unescaped form.
1856 for mobj in re.finditer(r'href="/([^"]+)"', page):
1857 if mobj.group(1) not in ids_in_page:
1858 ids_in_page.append(unescapeHTML(mobj.group(1)))
1860 video_ids.extend(ids_in_page)
1862 # A little optimization - if current page is not
1863 # "full", ie. does not contain PAGE_SIZE video ids then
1864 # we can assume that this page is the last one - there
1865 # are no more ids on further pages - no need to query
1868 if len(ids_in_page) < self._PAGE_SIZE:
1873 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1874 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1875 return [self.playlist_result(url_entries, playlist_title = username)]
# DepositFilesIE: resolves a depositfiles.com file page to a direct download
# URL by POSTing the "Free download" form and scraping the fileshare form
# action. On failure, tries to surface the site's restriction message.
1878 class DepositFilesIE(InfoExtractor):
1879 """Information extractor for depositfiles.com"""
1881 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1883 def _real_extract(self, url):
1884 file_id = url.split('/')[-1]
1885 # Rebuild url in english locale
1886 url = 'http://depositfiles.com/en/files/' + file_id
1888 # Retrieve file webpage with 'Free download' button pressed
# Supplying POST data simulates pressing the free-download button.
1889 free_download_indication = { 'gateway_result' : '1' }
1890 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1892 self.report_download_webpage(file_id)
1893 webpage = compat_urllib_request.urlopen(request).read()
1894 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1895 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
1898 # Search for the real file URL
1899 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1900 if (mobj is None) or (mobj.group(1) is None):
1901 # Try to figure out reason of the error.
1902 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1903 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse runs of whitespace so the site message fits on one line.
1904 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1905 self._downloader.report_error(u'%s' % restriction_message)
1907 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
1910 file_url = mobj.group(1)
1911 file_extension = os.path.splitext(file_url)[1][1:]
1913 # Search for file title
1914 mobj = re.search(r'<b title="(.*?)">', webpage)
1916 self._downloader.report_error(u'unable to extract title')
1918 file_title = mobj.group(1).decode('utf-8')
# Result dict follows the InfoExtractor contract (Python-2 byte strings are
# decoded to unicode as required).
1921 'id': file_id.decode('utf-8'),
1922 'url': file_url.decode('utf-8'),
1924 'upload_date': None,
1925 'title': file_title,
1926 'ext': file_extension.decode('utf-8'),
# FacebookIE: extracts Facebook videos. _real_initialize performs an optional
# login (CLI credentials or .netrc); _real_extract parses the swf param blob
# embedded in the page JS, preferring the HD source over SD.
1930 class FacebookIE(InfoExtractor):
1931 """Information Extractor for Facebook"""
1933 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1934 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1935 _NETRC_MACHINE = 'facebook'
1936 IE_NAME = u'facebook'
1938 def report_login(self):
1939 """Report attempt to log in."""
1940 self.to_screen(u'Logging in')
# Optional login step; login failures are warnings, not hard errors, so
# public videos still work without credentials.
1942 def _real_initialize(self):
1943 if self._downloader is None:
1948 downloader_params = self._downloader.params
1950 # Attempt to use provided username and password or .netrc data
1951 if downloader_params.get('username', None) is not None:
1952 useremail = downloader_params['username']
1953 password = downloader_params['password']
1954 elif downloader_params.get('usenetrc', False):
1956 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1957 if info is not None:
1961 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1962 except (IOError, netrc.NetrcParseError) as err:
1963 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# NOTE(review): if neither branch above ran, `useremail` is referenced here
# without a visible assignment — confirm an elided default exists upstream.
1966 if useremail is None:
1975 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1978 login_results = compat_urllib_request.urlopen(request).read()
1979 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1980 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1982 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1983 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1986 def _real_extract(self, url):
1987 mobj = re.match(self._VALID_URL, url)
1989 self._downloader.report_error(u'invalid URL: %s' % url)
1991 video_id = mobj.group('ID')
1993 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1994 webpage = self._download_webpage(url, video_id)
# Locate the JSON array sandwiched between the swf.addParam loop and the
# swf.addVariable loop in the page's inline JS.
1996 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1997 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1998 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2000 raise ExtractorError(u'Cannot parse data')
2001 data = dict(json.loads(m.group(1)))
2002 params_raw = compat_urllib_parse.unquote(data['params'])
2003 params = json.loads(params_raw)
2004 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD; error if neither exists.
2005 video_url = video_data.get('hd_src')
2007 video_url = video_data['sd_src']
2009 raise ExtractorError(u'Cannot find video URL')
2010 video_duration = int(video_data['video_duration'])
2011 thumbnail = video_data['thumbnail_src']
2013 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2015 raise ExtractorError(u'Cannot find title in webpage')
2016 video_title = unescapeHTML(m.group(1))
2020 'title': video_title,
2023 'duration': video_duration,
2024 'thumbnail': thumbnail,
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2029 class BlipTVIE(InfoExtractor):
2030 """Information extractor for blip.tv"""
2032 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the file extension off a media URL.
2033 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2034 IE_NAME = u'blip.tv'
# Announce that the URL points straight at a media file.
2036 def report_direct_download(self, title):
2037 """Report information extraction."""
2038 self.to_screen(u'%s: Direct download detected' % title)
2040 def _real_extract(self, url):
2041 mobj = re.match(self._VALID_URL, url)
2043 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a player whose fragment carries the file id;
# resolve it and recurse on the canonical /a/a-<id> URL.
2046 urlp = compat_urllib_parse_urlparse(url)
2047 if urlp.path.startswith('/play/'):
2048 request = compat_urllib_request.Request(url)
2049 response = compat_urllib_request.urlopen(request)
2050 redirecturl = response.geturl()
2051 rurlp = compat_urllib_parse_urlparse(redirecturl)
2052 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2053 url = 'http://blip.tv/a/a-' + file_id
2054 return self._real_extract(url)
# Ask blip.tv for JSON metadata; the iTunes UA is required by the API.
2061 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2062 request = compat_urllib_request.Request(json_url)
2063 request.add_header('User-Agent', 'iTunes/10.6.1')
2064 self.report_extraction(mobj.group(1))
2067 urlh = compat_urllib_request.urlopen(request)
2068 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2069 basename = url.split('/')[-1]
2070 title,ext = os.path.splitext(basename)
# NOTE(review): str.decode does not exist on Python 3 text strings —
# this line looks Python-2-only; confirm against the compat layer.
2071 title = title.decode('UTF-8')
2072 ext = ext.replace('.', '')
2073 self.report_direct_download(title)
2078 'upload_date': None,
2083 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2084 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2085 if info is None: # Regular URL
2087 json_code_bytes = urlh.read()
2088 json_code = json_code_bytes.decode('utf-8')
2089 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2090 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2094 json_data = json.loads(json_code)
# The API wraps the payload either in 'Post' or (presumably, elided)
# returns it bare — TODO confirm the else branch.
2095 if 'Post' in json_data:
2096 data = json_data['Post']
2100 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2101 video_url = data['media']['url']
2102 umobj = re.match(self._URL_EXT, video_url)
2104 raise ValueError('Can not determine filename extension')
2105 ext = umobj.group(1)
2108 'id': data['item_id'],
2110 'uploader': data['display_name'],
2111 'upload_date': upload_date,
2112 'title': data['title'],
2114 'format': data['media']['mimeType'],
2115 'thumbnail': data['thumbnailUrl'],
2116 'description': data['description'],
2117 'player_url': data['embedUrl'],
# The downloader must keep using the same UA the metadata request used.
2118 'user_agent': 'iTunes/10.6.1',
2120 except (ValueError,KeyError) as err:
2121 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2127 class MyVideoIE(InfoExtractor):
2128 """Information Extractor for myvideo.de."""
2130 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2131 IE_NAME = u'myvideo'
2133 def _real_extract(self,url):
2134 mobj = re.match(self._VALID_URL, url)
2136 self._download.report_error(u'invalid URL: %s' % url)
2139 video_id = mobj.group(1)
2142 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2143 webpage = self._download_webpage(webpage_url, video_id)
2145 self.report_extraction(video_id)
2146 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2149 self._downloader.report_error(u'unable to extract media URL')
2151 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2153 mobj = re.search('<title>([^<]+)</title>', webpage)
2155 self._downloader.report_error(u'unable to extract title')
2158 video_title = mobj.group(1)
2164 'upload_date': None,
2165 'title': video_title,
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2169 class ComedyCentralIE(InfoExtractor):
2170 """Information extractor for The Daily Show and Colbert Report """
2172 # urls can be abbreviations like :thedailyshow or :colbert
2173 # urls for episodes like:
2174 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2175 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2176 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex — note suitable() below must pass re.VERBOSE too.
2177 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2178 |(https?://)?(www\.)?
2179 (?P<showname>thedailyshow|colbertnation)\.com/
2180 (full-episodes/(?P<episode>.*)|
2182 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2183 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates offered by the mediaGen feed, lowest last.
2186 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
# Elided: the bodies of both lookup tables.
2188 _video_extensions = {
2196 _video_dimensions = {
# Overrides the base class because this regex needs re.VERBOSE.
# NOTE(review): the @classmethod decorator line appears elided above.
2206 def suitable(cls, url):
2207 """Receives a URL and returns True if suitable for this IE."""
2208 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2210 def _print_formats(self, formats):
2211 print('Available formats:')
2213 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2216 def _real_extract(self, url):
2217 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2219 self._downloader.report_error(u'invalid URL: %s' % url)
# :tds / :colbert style shortcuts expand to the full-episodes index.
2222 if mobj.group('shortname'):
2223 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2224 url = u'http://www.thedailyshow.com/full-episodes/'
2226 url = u'http://www.colbertnation.com/full-episodes/'
2227 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2228 assert mobj is not None
2230 if mobj.group('clip'):
2231 if mobj.group('showname') == 'thedailyshow':
2232 epTitle = mobj.group('tdstitle')
2234 epTitle = mobj.group('cntitle')
2237 dlNewest = not mobj.group('episode')
2239 epTitle = mobj.group('showname')
2241 epTitle = mobj.group('episode')
2243 self.report_extraction(epTitle)
2244 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Follow the redirect to the concrete episode page.
2246 url = htmlHandle.geturl()
2247 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2249 raise ExtractorError(u'Invalid redirected URL: ' + url)
2250 if mobj.group('episode') == '':
2251 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2252 epTitle = mobj.group('episode')
# Locate the mtvnservices player URI embedded in the page.
2254 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2256 if len(mMovieParams) == 0:
2257 # The Colbert Report embeds the information in a without
2258 # a URL prefix; so extract the alternate reference
2259 # and then add the URL prefix manually.
2261 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2262 if len(altMovieParams) == 0:
2263 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2265 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2267 uri = mMovieParams[0][1]
# RSS index of the episode's parts.
2268 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2269 indexXml = self._download_webpage(indexUrl, epTitle,
2270 u'Downloading show index',
2271 u'unable to download episode index')
2275 idoc = xml.etree.ElementTree.fromstring(indexXml)
2276 itemEls = idoc.findall('.//item')
# One RSS <item> per multi-part segment of the episode.
2277 for partNum,itemEl in enumerate(itemEls):
2278 mediaId = itemEl.findall('./guid')[0].text
2279 shortMediaId = mediaId.split(':')[-1]
2280 showId = mediaId.split(':')[-2].replace('.com', '')
2281 officialTitle = itemEl.findall('./title')[0].text
2282 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2284 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2285 compat_urllib_parse.urlencode({'uri': mediaId}))
2286 configXml = self._download_webpage(configUrl, epTitle,
2287 u'Downloading configuration for %s' % shortMediaId)
2289 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) pairs for each rendition.
2291 for rendition in cdoc.findall('.//rendition'):
2292 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2296 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2299 if self._downloader.params.get('listformats', None):
2300 self._print_formats([i[0] for i in turls])
2303 # For now, just pick the highest bitrate
2304 format,rtmp_video_url = turls[-1]
2306 # Get the format arg from the arg stream
2307 req_format = self._downloader.params.get('format', None)
2309 # Select format if we can find one
2312 format, rtmp_video_url = f, v
# Translate the rtmp URL into a plain-HTTP mirror on llnwd.net.
2315 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2317 raise ExtractorError(u'Cannot transform RTMP url')
2318 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2319 video_url = base + m.group('finalid')
2321 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2326 'upload_date': officialDate,
2331 'description': officialTitle,
# One info dict per part; the whole list is the extraction result.
2333 results.append(info)
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2338 class EscapistIE(InfoExtractor):
2339 """Information extractor for The Escapist """
2341 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2342 IE_NAME = u'escapist'
2344 def _real_extract(self, url):
2345 mobj = re.match(self._VALID_URL, url)
2347 self._downloader.report_error(u'invalid URL: %s' % url)
2349 showName = mobj.group('showname')
2350 videoId = mobj.group('episode')
2352 self.report_extraction(showName)
2353 webPage = self._download_webpage(url, showName)
# Metadata lives in standard <meta> / OpenGraph tags.
2355 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2356 description = unescapeHTML(descMatch.group(1))
2357 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2358 imgUrl = unescapeHTML(imgMatch.group(1))
2359 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2360 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a config=<url-encoded-json> query parameter.
2361 configUrlMatch = re.search('config=(.*)$', playerUrl)
2362 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2364 configJSON = self._download_webpage(configUrl, showName,
2365 u'Downloading configuration',
2366 u'unable to download configuration')
2368 # Technically, it's JavaScript, not JSON
# Crude quote normalization so json.loads can parse it; breaks if the
# payload ever contains embedded apostrophes.
2369 configJSON = configJSON.replace("'", '"')
2372 config = json.loads(configJSON)
2373 except (ValueError,) as err:
2374 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2377 playlist = config['playlist']
# Index 1 is presumably the feature entry (0 being an ad/intro) —
# TODO confirm against the live config format.
2378 videoUrl = playlist[1]['url']
2383 'uploader': showName,
2384 'upload_date': None,
2387 'thumbnail': imgUrl,
2388 'description': description,
2389 'player_url': playerUrl,
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2394 class CollegeHumorIE(InfoExtractor):
2395 """Information extractor for collegehumor.com"""
2398 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2399 IE_NAME = u'collegehumor'
2401 def report_manifest(self, video_id):
2402 """Report information extraction."""
2403 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2405 def _real_extract(self, url):
2406 mobj = re.match(self._VALID_URL, url)
2408 self._downloader.report_error(u'invalid URL: %s' % url)
2410 video_id = mobj.group('videoid')
# Elided: the start of the info dict this entry belongs to.
2415 'upload_date': None,
2418 self.report_extraction(video_id)
# First request: moogaloop metadata XML for the video.
2419 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2421 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2422 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2423 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2426 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2428 videoNode = mdoc.findall('./video')[0]
2429 info['description'] = videoNode.findall('./description')[0].text
2430 info['title'] = videoNode.findall('./caption')[0].text
2431 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2432 manifest_url = videoNode.findall('./file')[0].text
2434 self._downloader.report_error(u'Invalid metadata XML file')
# Second request: Adobe HDS (f4m) manifest; hdcore param is required.
2437 manifest_url += '?hdcore=2.10.3'
2438 self.report_manifest(video_id)
2440 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2441 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2442 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2445 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m namespace-qualified lookups; media/@url is the node id.
2447 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2448 node_id = media_node.attrib['url']
2449 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2450 except IndexError as err:
2451 self._downloader.report_error(u'Invalid manifest file')
# Rebuild the fragment URL from the manifest host + ids.
2454 url_pr = compat_urllib_parse_urlparse(manifest_url)
2455 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2462 class XVideosIE(InfoExtractor):
2463 """Information extractor for xvideos.com"""
2465 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2466 IE_NAME = u'xvideos'
2468 def _real_extract(self, url):
2469 mobj = re.match(self._VALID_URL, url)
2471 self._downloader.report_error(u'invalid URL: %s' % url)
2473 video_id = mobj.group(1)
2475 webpage = self._download_webpage(url, video_id)
2477 self.report_extraction(video_id)
# The player flashvars carry a URL-encoded flv_url parameter.
2481 mobj = re.search(r'flv_url=(.+?)&', webpage)
2483 self._downloader.report_error(u'unable to extract video url')
2485 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text up to the " - XVID..." suffix.
2489 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2491 self._downloader.report_error(u'unable to extract video title')
2493 video_title = mobj.group(1)
2496 # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail address.
2497 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2499 self._downloader.report_error(u'unable to extract video thumbnail')
2501 video_thumbnail = mobj.group(0)
# Elided: the start of the info dict these entries belong to.
2507 'upload_date': None,
2508 'title': video_title,
2510 'thumbnail': video_thumbnail,
2511 'description': None,
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2517 class SoundcloudIE(InfoExtractor):
2518 """Information extractor for soundcloud.com
2519 To access the media, the uid of the song and a stream token
2520 must be extracted from the page source and the script must make
2521 a request to media.soundcloud.com/crossdomain.xml. Then
2522 the media can be grabbed by requesting from an url composed
2523 of the stream token and uid
# group(1) = uploader slug, group(2) = track slug.
2526 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2527 IE_NAME = u'soundcloud'
2529 def report_resolve(self, video_id):
2530 """Report information extraction."""
2531 self.to_screen(u'%s: Resolving id' % video_id)
2533 def _real_extract(self, url):
2534 mobj = re.match(self._VALID_URL, url)
2536 self._downloader.report_error(u'invalid URL: %s' % url)
2539 # extract uploader (which is in the url)
2540 uploader = mobj.group(1)
2541 # extract simple title (uploader + slug of song title)
2542 slug_title = mobj.group(2)
2543 simple_title = uploader + u'-' + slug_title
2544 full_title = '%s/%s' % (uploader, slug_title)
2546 self.report_resolve(full_title)
# resolve.json maps the public page URL to the API track record.
# NOTE(review): hard-coded client_id — likely long expired.
2548 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2549 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2550 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2552 info = json.loads(info_json)
2553 video_id = info['id']
2554 self.report_extraction(full_title)
# streams endpoint yields the concrete media URLs for the track id.
2556 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2557 stream_json = self._download_webpage(streams_url, full_title,
2558 u'Downloading stream definitions',
2559 u'unable to download stream definitions')
2561 streams = json.loads(stream_json)
2562 mediaURL = streams['http_mp3_128_url']
2563 upload_date = unified_strdate(info['created_at'])
2568 'uploader': info['user']['username'],
2569 'upload_date': upload_date,
2570 'title': info['title'],
2572 'description': info['description'],
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2575 class SoundcloudSetIE(InfoExtractor):
2576 """Information extractor for soundcloud.com sets
2577 To access the media, the uid of the song and a stream token
2578 must be extracted from the page source and the script must make
2579 a request to media.soundcloud.com/crossdomain.xml. Then
2580 the media can be grabbed by requesting from an url composed
2581 of the stream token and uid
# Same shape as SoundcloudIE but matches /sets/ playlist URLs.
2584 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2585 IE_NAME = u'soundcloud:set'
2587 def report_resolve(self, video_id):
2588 """Report information extraction."""
2589 self.to_screen(u'%s: Resolving id' % video_id)
2591 def _real_extract(self, url):
2592 mobj = re.match(self._VALID_URL, url)
2594 self._downloader.report_error(u'invalid URL: %s' % url)
2597 # extract uploader (which is in the url)
2598 uploader = mobj.group(1)
2599 # extract simple title (uploader + slug of song title)
2600 slug_title = mobj.group(2)
2601 simple_title = uploader + u'-' + slug_title
2602 full_title = '%s/sets/%s' % (uploader, slug_title)
2604 self.report_resolve(full_title)
# Resolve the set page to its API record (same client_id caveat as
# the single-track extractor above).
2606 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2607 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2608 info_json = self._download_webpage(resolv_url, full_title)
2611 info = json.loads(info_json)
2612 if 'errors' in info:
2613 for err in info['errors']:
2614 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2617 self.report_extraction(full_title)
# One stream lookup per track in the set.
2618 for track in info['tracks']:
2619 video_id = track['id']
2621 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2622 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2624 self.report_extraction(video_id)
2625 streams = json.loads(stream_json)
2626 mediaURL = streams['http_mp3_128_url']
2631 'uploader': track['user']['username'],
2632 'upload_date': unified_strdate(track['created_at']),
2633 'title': track['title'],
2635 'description': track['description'],
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2640 class InfoQIE(InfoExtractor):
2641 """Information extractor for infoq.com"""
2642 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2644 def _real_extract(self, url):
2645 mobj = re.match(self._VALID_URL, url)
2647 self._downloader.report_error(u'invalid URL: %s' % url)
# No numeric id in the URL; the URL itself doubles as the id here.
2650 webpage = self._download_webpage(url, video_id=url)
2651 self.report_extraction(url)
# The media path is base64- and URL-encoded in a JS variable.
2654 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2656 self._downloader.report_error(u'unable to extract video url')
2658 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2659 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2662 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2664 self._downloader.report_error(u'unable to extract video title')
2666 video_title = mobj.group(1)
2668 # Extract description
2669 video_description = u'No description available.'
2670 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2671 if mobj is not None:
2672 video_description = mobj.group(1)
2674 video_filename = video_url.split('/')[-1]
# NOTE(review): split('.') raises ValueError if the filename has more
# or fewer than one dot — rsplit('.', 1) would be safer; confirm.
2675 video_id, extension = video_filename.split('.')
2681 'upload_date': None,
2682 'title': video_title,
2683 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2685 'description': video_description,
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2690 class MixcloudIE(InfoExtractor):
2691 """Information extractor for www.mixcloud.com"""
# Marked broken — tests are skipped for this IE.
2693 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2694 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2695 IE_NAME = u'mixcloud'
2697 def report_download_json(self, file_id):
2698 """Report JSON download."""
2699 self.to_screen(u'Downloading json')
# Pick the URL list for a format, preferring the highest bitrate when
# none (or 'best') is requested.
2701 def get_urls(self, jsonData, fmt, bitrate='best'):
2702 """Get urls from 'audio_formats' section in json"""
2705 bitrate_list = jsonData[fmt]
2706 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2707 bitrate = max(bitrate_list) # select highest
2709 url_list = jsonData[fmt][bitrate]
2710 except TypeError: # we have no bitrate info.
2711 url_list = jsonData[fmt]
2714 def check_urls(self, url_list):
2715 """Returns 1st active url from list"""
# Probes each candidate with a live request; elided lines presumably
# return url on success and continue on failure — TODO confirm.
2716 for url in url_list:
2718 compat_urllib_request.urlopen(url)
2720 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2725 def _print_formats(self, formats):
2726 print('Available formats:')
2727 for fmt in formats.keys():
2728 for b in formats[fmt]:
2730 ext = formats[fmt][b][0]
2731 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2732 except TypeError: # we have no bitrate info
2733 ext = formats[fmt][0]
2734 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2737 def _real_extract(self, url):
2738 mobj = re.match(self._VALID_URL, url)
2740 self._downloader.report_error(u'invalid URL: %s' % url)
2742 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on a match group is Python-2-only;
# on Python 3 str has no decode — consistent with _WORKING = False.
2743 uploader = mobj.group(1).decode('utf-8')
2744 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2746 # construct API request
2747 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2748 # retrieve .json file with links to files
2749 request = compat_urllib_request.Request(file_url)
2751 self.report_download_json(file_url)
2752 jsonData = compat_urllib_request.urlopen(request).read()
2753 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2754 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2758 json_data = json.loads(jsonData)
2759 player_url = json_data['player_swf_url']
2760 formats = dict(json_data['audio_formats'])
2762 req_format = self._downloader.params.get('format', None)
2765 if self._downloader.params.get('listformats', None):
2766 self._print_formats(formats)
# 'best': scan formats until one yields a live URL.
2769 if req_format is None or req_format == 'best':
2770 for format_param in formats.keys():
2771 url_list = self.get_urls(formats, format_param)
2773 file_url = self.check_urls(url_list)
2774 if file_url is not None:
2777 if req_format not in formats:
2778 self._downloader.report_error(u'format is not available')
2781 url_list = self.get_urls(formats, req_format)
2782 file_url = self.check_urls(url_list)
2783 format_param = req_format
2786 'id': file_id.decode('utf-8'),
2787 'url': file_url.decode('utf-8'),
2788 'uploader': uploader.decode('utf-8'),
2789 'upload_date': None,
2790 'title': json_data['name'],
2791 'ext': file_url.split('.')[-1].decode('utf-8'),
2792 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2793 'thumbnail': json_data['thumbnail_url'],
2794 'description': json_data['description'],
2795 'player_url': player_url.decode('utf-8'),
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2798 class StanfordOpenClassroomIE(InfoExtractor):
2799 """Information extractor for Stanford's Open ClassRoom"""
# Matches three levels: root page, course page, and a single video.
2801 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2802 IE_NAME = u'stanfordoc'
2804 def _real_extract(self, url):
2805 mobj = re.match(self._VALID_URL, url)
2807 raise ExtractorError(u'Invalid URL: %s' % url)
2809 if mobj.group('course') and mobj.group('video'): # A specific video
2810 course = mobj.group('course')
2811 video = mobj.group('video')
2813 'id': course + '_' + video,
2815 'upload_date': None,
2818 self.report_extraction(info['id'])
# Per-video metadata XML lives next to the media files.
2819 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2820 xmlUrl = baseUrl + video + '.xml'
2822 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2823 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2824 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2826 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2828 info['title'] = mdoc.findall('./title')[0].text
2829 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2831 self._downloader.report_error(u'Invalid metadata XML file')
2833 info['ext'] = info['url'].rpartition('.')[2]
2835 elif mobj.group('course'): # A course page
# Course pages expand to a playlist of VideoPage references,
# recursively extracted below.
2836 course = mobj.group('course')
2841 'upload_date': None,
2844 coursepage = self._download_webpage(url, info['id'],
2845 note='Downloading course info page',
2846 errnote='Unable to download course info page')
2848 m = re.search('<h1>([^<]+)</h1>', coursepage)
2850 info['title'] = unescapeHTML(m.group(1))
2852 info['title'] = info['id']
2854 m = re.search('<description>([^<]+)</description>', coursepage)
2856 info['description'] = unescapeHTML(m.group(1))
2858 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2861 'type': 'reference',
2862 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse into each referenced video page.
2866 for entry in info['list']:
2867 assert entry['type'] == 'reference'
2868 results += self.extract(entry['url'])
# Fallthrough: the site root — expand to every course page.
2872 'id': 'Stanford OpenClassroom',
2875 'upload_date': None,
2878 self.report_download_webpage(info['id'])
2879 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2881 rootpage = compat_urllib_request.urlopen(rootURL).read()
2882 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2883 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
2886 info['title'] = info['id']
2888 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2891 'type': 'reference',
2892 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2897 for entry in info['list']:
2898 assert entry['type'] == 'reference'
2899 results += self.extract(entry['url'])
2902 class MTVIE(InfoExtractor):
2903 """Information extractor for MTV.com"""
2905 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2908 def _real_extract(self, url):
2909 mobj = re.match(self._VALID_URL, url)
2911 self._downloader.report_error(u'invalid URL: %s' % url)
2913 if not mobj.group('proto'):
2914 url = 'http://' + url
2915 video_id = mobj.group('videoid')
2917 webpage = self._download_webpage(url, video_id)
2919 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2921 self._downloader.report_error(u'unable to extract song name')
2923 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2924 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2926 self._downloader.report_error(u'unable to extract performer')
2928 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2929 video_title = performer + ' - ' + song_name
2931 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2933 self._downloader.report_error(u'unable to mtvn_uri')
2935 mtvn_uri = mobj.group(1)
2937 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2939 self._downloader.report_error(u'unable to extract content id')
2941 content_id = mobj.group(1)
2943 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2944 self.report_extraction(video_id)
2945 request = compat_urllib_request.Request(videogen_url)
2947 metadataXml = compat_urllib_request.urlopen(request).read()
2948 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2949 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
2952 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2953 renditions = mdoc.findall('.//rendition')
2955 # For now, always pick the highest quality.
2956 rendition = renditions[-1]
2959 _,_,ext = rendition.attrib['type'].partition('/')
2960 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2961 video_url = rendition.find('./src').text
2963 self._downloader.report_error('Invalid rendition field.')
2969 'uploader': performer,
2970 'upload_date': None,
2971 'title': video_title,
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
2979 class YoukuIE(InfoExtractor):
2980 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Elided: the def line of the session-id generator these lines belong
# to (presumably _gen_sid, used in _real_extract below).
2983 nowTime = int(time.time() * 1000)
2984 random1 = random.randint(1000,1998)
2985 random2 = random.randint(1000,9999)
2986 # sid = millisecond timestamp + two random components.
2987 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the alphabet keyed by the server 'seed';
# reproduces the player's file-id obfuscation.
2989 def _get_file_ID_mix_string(self, seed):
2991 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2993 for i in range(len(source)):
2994 seed = (seed * 211 + 30031 ) % 65536
2995 index = math.floor(seed / 65536 * len(source) )
2996 mixed.append(source[int(index)])
2997 source.remove(source[int(index)])
2998 #return ''.join(mixed)
# Decode '*'-separated indices through the mixed alphabet.
3001 def _get_file_id(self, fileId, seed):
3002 mixed = self._get_file_ID_mix_string(seed)
3003 ids = fileId.split('*')
3007 realId.append(mixed[int(ch)])
3008 return ''.join(realId)
3010 def _real_extract(self, url):
3011 mobj = re.match(self._VALID_URL, url)
3013 self._downloader.report_error(u'invalid URL: %s' % url)
3015 video_id = mobj.group('ID')
3017 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3019 jsondata = self._download_webpage(info_url, video_id)
3021 self.report_extraction(video_id)
3023 config = json.loads(jsondata)
3025 video_title = config['data'][0]['title']
3026 seed = config['data'][0]['seed']
3028 format = self._downloader.params.get('format', None)
3029 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format selection: prefer hd2 for 'best'; elided branches handle
# the remaining choices.
3031 if format is None or format == 'best':
3032 if 'hd2' in supported_format:
3037 elif format == 'worst':
3045 fileid = config['data'][0]['streamfileids'][format]
3046 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3047 except (UnicodeDecodeError, ValueError, KeyError):
3048 self._downloader.report_error(u'unable to extract info section')
3052 sid = self._gen_sid()
3053 fileid = self._get_file_id(fileid, seed)
3055 #column 8,9 of fileid represent the segment number
3056 #fileid[7:9] should be changed
# One download URL per segment, keyed per-segment by 'k'.
3057 for index, key in enumerate(keys):
3059 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3060 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3063 'id': '%s_part%02d' % (video_id, index),
3064 'url': download_url,
3066 'upload_date': None,
3067 'title': video_title,
3070 files_info.append(info)
# NOTE(review): partial excerpt (interior lines elided; leading numerals
# are paste residue). Comments only; code untouched.
3075 class XNXXIE(InfoExtractor):
3076 """Information extractor for xnxx.com"""
3078 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Patterns for the player flashvars and page title.
3080 VIDEO_URL_RE = r'flv_url=(.*?)&'
3081 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3082 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3084 def _real_extract(self, url):
3085 mobj = re.match(self._VALID_URL, url)
3087 self._downloader.report_error(u'invalid URL: %s' % url)
3089 video_id = mobj.group(1)
3091 # Get webpage content
3092 webpage = self._download_webpage(url, video_id)
3094 result = re.search(self.VIDEO_URL_RE, webpage)
3096 self._downloader.report_error(u'unable to extract video url')
# flv_url is percent-encoded in the flashvars.
3098 video_url = compat_urllib_parse.unquote(result.group(1))
3100 result = re.search(self.VIDEO_TITLE_RE, webpage)
3102 self._downloader.report_error(u'unable to extract video title')
3104 video_title = result.group(1)
3106 result = re.search(self.VIDEO_THUMB_RE, webpage)
3108 self._downloader.report_error(u'unable to extract video thumbnail')
3110 video_thumbnail = result.group(1)
# Elided: the start of the info dict these entries belong to.
3116 'upload_date': None,
3117 'title': video_title,
3119 'thumbnail': video_thumbnail,
3120 'description': None,
# GooglePlusIE: extracts a video embedded in a Google+ post. Step 1 reads
# the post page for date/uploader/title; step 2 follows the photo/video
# page and picks the highest-resolution googlevideo link.
3124 class GooglePlusIE(InfoExtractor):
3125 """Information extractor for plus.google.com."""
3127 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3128 IE_NAME = u'plus.google'
3130 def report_extract_entry(self, url):
3131 """Report downloading extry"""
3132 self.to_screen(u'Downloading entry: %s' % url)
3134 def report_date(self, upload_date):
3135 """Report downloading extry"""
3136 self.to_screen(u'Entry date: %s' % upload_date)
3138 def report_uploader(self, uploader):
3139 """Report downloading extry"""
3140 self.to_screen(u'Uploader: %s' % uploader)
3142 def report_title(self, video_title):
3143 """Report downloading extry"""
3144 self.to_screen(u'Title: %s' % video_title)
3146 def report_extract_vid_page(self, video_page):
3147 """Report information extraction."""
3148 self.to_screen(u'Extracting video page: %s' % video_page)
3150 def _real_extract(self, url):
3151 # Extract id from URL
3152 mobj = re.match(self._VALID_URL, url)
# (missing `if mobj is None:` guard)
3154 self._downloader.report_error(u'Invalid URL: %s' % url)
3157 post_url = mobj.group(0)
3158 video_id = mobj.group(1)
3160 video_extension = 'flv'
3162 # Step 1, Retrieve post webpage to extract further information
3163 self.report_extract_entry(post_url)
3164 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3166 # Extract update date
3168 pattern = 'title="Timestamp">(.*?)</a>'
3169 mobj = re.search(pattern, webpage)
3171 upload_date = mobj.group(1)
3172 # Convert timestring to a format suitable for filename
3173 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3174 upload_date = upload_date.strftime('%Y%m%d')
3175 self.report_date(upload_date)
3179 pattern = r'rel\="author".*?>(.*?)</a>'
3180 mobj = re.search(pattern, webpage)
3182 uploader = mobj.group(1)
3183 self.report_uploader(uploader)
3186 # Get the first line for title
3188 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3189 mobj = re.search(pattern, webpage)
3191 video_title = mobj.group(1)
3192 self.report_title(video_title)
3194 # Step 2, Stimulate clicking the image box to launch video
3195 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3196 mobj = re.search(pattern, webpage)
3198 self._downloader.report_error(u'unable to extract video page URL')
3200 video_page = mobj.group(1)
3201 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3202 self.report_extract_vid_page(video_page)
3205 # Extract video links on video page
3206 """Extract video links of all sizes"""
3207 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3208 mobj = re.findall(pattern, webpage)
3210 self._downloader.report_error(u'unable to extract video links')
3212 # Sort in resolution
3213 links = sorted(mobj)
3215 # Choose the lowest of the sort, i.e. highest resolution
3216 video_url = links[-1]
3217 # Only get the url. The resolution part in the tuple has no use anymore
3218 video_url = video_url[-1]
3219 # Treat escaped \u0026 style hex
# Python 2 strings have .decode; on Python 3 the AttributeError branch
# re-encodes to bytes first so unicode-escape can be applied.
3221 video_url = video_url.decode("unicode_escape")
3222 except AttributeError: # Python 3
3223 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3229 'uploader': uploader,
3230 'upload_date': upload_date,
3231 'title': video_title,
3232 'ext': video_extension,
# NBAIE: nba.com video pages — the mp4 URL is derived directly from the
# path id; title/date/description come from og/meta tags on the page.
3235 class NBAIE(InfoExtractor):
3236 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3239 def _real_extract(self, url):
3240 mobj = re.match(self._VALID_URL, url)
# (missing `if mobj is None:` guard)
3242 self._downloader.report_error(u'invalid URL: %s' % url)
3245 video_id = mobj.group(1)
3246 if video_id.endswith('/index.html'):
3247 video_id = video_id[:-len('/index.html')]
3249 webpage = self._download_webpage(url, video_id)
3251 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# _findProp: search the page with `rexp`; unescape group 1, else `default`.
3252 def _findProp(rexp, default=None):
3253 m = re.search(rexp, webpage)
3255 return unescapeHTML(m.group(1))
3259 shortened_video_id = video_id.rpartition('/')[2]
3260 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3262 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date'
# (cf. the optional-fields list in the class docstring) — confirm and fix
# once the full dict literal is visible.
3266 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3267 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# JustinTVIE: justin.tv / twitch.tv. Three URL shapes: a whole channel
# (paged archive API), a single broadcast (/b/), or a chapter (/c/) which
# needs an extra XML lookup to map the chapter onto its archive file.
3271 class JustinTVIE(InfoExtractor):
3272 """Information extractor for justin.tv and twitch.tv"""
3273 # TODO: One broadcast may be split into multiple videos. The key
3274 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3275 # starts at 1 and increases. Can we treat all parts as one video?
3277 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3279 (?P<channelid>[^/]+)|
3280 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3281 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3285 _JUSTIN_PAGE_LIMIT = 100
3286 IE_NAME = u'justin.tv'
3288 def report_download_page(self, channel, offset):
3289 """Report attempt to download a single page of videos."""
3290 self.to_screen(u'%s: Downloading video information from %d to %d' %
3291 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3293 # Return count of items, list of *valid* items
3294 def _parse_page(self, url, video_id):
3295 webpage = self._download_webpage(url, video_id,
3296 u'Downloading video info JSON',
3297 u'unable to download video info JSON')
3299 response = json.loads(webpage)
# The API returns a JSON object (not a list) on error; surface its message.
3300 if type(response) != list:
3301 error_text = response.get('error', 'unknown error')
3302 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3304 for clip in response:
3305 video_url = clip['video_file_url']
3307 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from YYYY-MM-DD -> YYYYMMDD.
3308 video_date = re.sub('-', '', clip['start_time'][:10])
3309 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3310 video_id = clip['id']
3311 video_title = clip.get('title', video_id)
3315 'title': video_title,
3316 'uploader': clip.get('channel_name', video_uploader_id),
3317 'uploader_id': video_uploader_id,
3318 'upload_date': video_date,
3319 'ext': video_extension,
3321 return (len(response), info)
3323 def _real_extract(self, url):
3324 mobj = re.match(self._VALID_URL, url)
# (missing `if mobj is None:` guard)
3326 raise ExtractorError(u'invalid URL: %s' % url)
3328 api_base = 'http://api.justin.tv'
3330 if mobj.group('channelid'):
3332 video_id = mobj.group('channelid')
3333 api = api_base + '/channel/archives/%s.json' % video_id
3334 elif mobj.group('chapterid'):
3335 chapter_id = mobj.group('chapterid')
3337 webpage = self._download_webpage(url, chapter_id)
3338 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3340 raise ExtractorError(u'Cannot find archive of a chapter')
3341 archive_id = m.group(1)
3343 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3344 chapter_info_xml = self._download_webpage(api, chapter_id,
3345 note=u'Downloading chapter information',
3346 errnote=u'Chapter information download failed')
3347 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Find the <archive> element matching the archive id from the page.
3348 for a in doc.findall('.//archive'):
3349 if archive_id == a.find('./id').text:
3352 raise ExtractorError(u'Could not find chapter in chapter information')
3354 video_url = a.find('./video_file_url').text
3355 video_ext = video_url.rpartition('.')[2] or u'flv'
3357 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3358 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3359 note='Downloading chapter metadata',
3360 errnote='Download of chapter metadata failed')
3361 chapter_info = json.loads(chapter_info_json)
3363 bracket_start = int(doc.find('.//bracket_start').text)
3364 bracket_end = int(doc.find('.//bracket_end').text)
3366 # TODO determine start (and probably fix up file)
3367 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3368 #video_url += u'?start=' + TODO:start_timestamp
3369 # bracket_start is 13290, but we want 51670615
3370 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3371 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3374 'id': u'c' + chapter_id,
3377 'title': chapter_info['title'],
3378 'thumbnail': chapter_info['preview'],
3379 'description': chapter_info['description'],
3380 'uploader': chapter_info['channel']['display_name'],
3381 'uploader_id': chapter_info['channel']['name'],
3385 video_id = mobj.group('videoid')
3386 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3388 self.report_extraction(video_id)
# Paged fetch loop: `paged`, `offset` and the loop header are among the
# elided lines; stop when a page comes back short of the limit.
3392 limit = self._JUSTIN_PAGE_LIMIT
3395 self.report_download_page(video_id, offset)
3396 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3397 page_count, page_info = self._parse_page(page_url, video_id)
3398 info.extend(page_info)
3399 if not paged or page_count != limit:
# FunnyOrDieIE: pulls the <source> URL out of the <video> tag, with a
# two-step title fallback (player h1, then <title>).
3404 class FunnyOrDieIE(InfoExtractor):
3405 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3407 def _real_extract(self, url):
3408 mobj = re.match(self._VALID_URL, url)
# (missing `if mobj is None:` guard)
3410 raise ExtractorError(u'invalid URL: %s' % url)
3412 video_id = mobj.group('id')
3413 webpage = self._download_webpage(url, video_id)
3415 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3417 self._downloader.report_error(u'unable to find video information')
3418 video_url = unescapeHTML(m.group('url'))
3420 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback when the player h1 is absent: use the page <title>.
3422 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3424 self._downloader.report_error(u'Cannot find video title')
3425 title = clean_html(m.group('title'))
3427 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3429 desc = unescapeHTML(m.group('desc'))
3438 'description': desc,
# SteamIE: store.steampowered.com game/video pages. Bypasses the age gate
# via the agecheck URL, then zips together movie URLs, titles and thumbs
# scraped with three parallel finditer passes.
3442 class SteamIE(InfoExtractor):
3443 _VALID_URL = r"""http://store\.steampowered\.com/
3445 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3447 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is a verbose regex and the
# base-class match (see head) does not pass re.VERBOSE.
3451 def suitable(cls, url):
3452 """Receives a URL and returns True if suitable for this IE."""
3453 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3455 def _real_extract(self, url):
3456 m = re.match(self._VALID_URL, url, re.VERBOSE)
3457 gameID = m.group('gameID')
3458 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3459 self.report_age_confirmation()
3460 webpage = self._download_webpage(videourl, gameID)
3461 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3463 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3464 mweb = re.finditer(urlRE, webpage)
3465 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3466 titles = re.finditer(namesRE, webpage)
3467 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3468 thumbs = re.finditer(thumbsRE, webpage)
# NOTE(review): zip of three independent scrapes assumes the page lists
# them in the same order and count — fragile but intentional here.
3470 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3471 video_id = vid.group('videoID')
3472 title = vtitle.group('videoName')
3473 video_url = vid.group('videoURL')
3474 video_thumb = thumb.group('thumbnail')
3476 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3481 'title': unescapeHTML(title),
3482 'thumbnail': video_thumb
3485 return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: recorded ustream.tv videos — direct CDN URL from the video id,
# title and uploader scraped from data- attributes on the page.
3487 class UstreamIE(InfoExtractor):
3488 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3489 IE_NAME = u'ustream'
3491 def _real_extract(self, url):
3492 m = re.match(self._VALID_URL, url)
3493 video_id = m.group('videoID')
3494 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3495 webpage = self._download_webpage(url, video_id)
3496 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3497 title = m.group('title')
3498 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3499 uploader = m.group('uploader')
3505 'uploader': uploader
# WorldStarHipHopIE: worldstarhiphop/worldstarcandy video pages. Extracts
# the flash player's file variable; candy pages get a corrected title.
3509 class WorldStarHipHopIE(InfoExtractor):
3510 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3511 IE_NAME = u'WorldStarHipHop'
3513 def _real_extract(self, url):
3514 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3516 m = re.match(self._VALID_URL, url)
3517 video_id = m.group('id')
3519 webpage_src = self._download_webpage(url, video_id)
3521 mobj = re.search(_src_url, webpage_src)
3523 if mobj is not None:
3524 video_url = mobj.group(1)
# Extension selection on the URL ('mp4' vs flv) — branch bodies elided.
3525 if 'mp4' in video_url:
3530 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3532 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3535 raise ExtractorError(u'Cannot determine title')
3536 title = mobj.group(1)
3538 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3539 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3540 if mobj is not None:
3541 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span instead.
3543 _title = r"""candytitles.*>(.*)</span>"""
3544 mobj = re.search(_title, webpage_src)
3545 if mobj is not None:
3546 title = mobj.group(1)
3553 'thumbnail' : thumbnail,
# RBMARadioIE: rbmaradio.com shows — metadata lives in an inline
# `gon.show=<json>` blob; the audio URL is the akamai_url plus a fixed
# 256 kbps cbr parameter.
3558 class RBMARadioIE(InfoExtractor):
3559 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3561 def _real_extract(self, url):
3562 m = re.match(self._VALID_URL, url)
3563 video_id = m.group('videoID')
3565 webpage = self._download_webpage(url, video_id)
3566 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3568 raise ExtractorError(u'Cannot find metadata')
3569 json_data = m.group(1)
# (the `try:` for this handler is among the elided lines)
3572 data = json.loads(json_data)
3573 except ValueError as e:
3574 raise ExtractorError(u'Invalid JSON: ' + str(e))
3576 video_url = data['akamai_url'] + '&cbr=256'
3577 url_parts = compat_urllib_parse_urlparse(video_url)
3578 video_ext = url_parts.path.rpartition('.')[2]
3583 'title': data['title'],
3584 'description': data.get('teaser_text'),
3585 'location': data.get('country_of_origin'),
3586 'uploader': data.get('host', {}).get('name'),
3587 'uploader_id': data.get('host', {}).get('slug'),
3588 'thumbnail': data.get('image', {}).get('large_url_2x'),
3589 'duration': data.get('duration'),
# YouPornIE: builds one format entry per download link found in the page's
# downloadList; supports --list-formats, best/worst/all and a specific
# requested format via _specific().
3594 class YouPornIE(InfoExtractor):
3595 """Information extractor for youporn.com."""
3596 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3598 def _print_formats(self, formats):
3599 """Print all available formats"""
3600 print(u'Available formats:')
3601 print(u'ext\t\tformat')
3602 print(u'---------------------------------')
3603 for format in formats:
3604 print(u'%s\t\t%s' % (format['ext'], format['format']))
# _specific: return the entry whose 'format' equals req_format (loop
# header and return lines elided).
3606 def _specific(self, req_format, formats):
3608 if(x["format"]==req_format):
3612 def _real_extract(self, url):
3613 mobj = re.match(self._VALID_URL, url)
# (missing `if mobj is None:` guard)
3615 self._downloader.report_error(u'invalid URL: %s' % url)
3618 video_id = mobj.group('videoid')
# The site requires an age_verified cookie to serve the page.
3620 req = compat_urllib_request.Request(url)
3621 req.add_header('Cookie', 'age_verified=1')
3622 webpage = self._download_webpage(req, video_id)
3624 # Get the video title
3625 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3627 raise ExtractorError(u'Unable to extract video title')
3628 video_title = result.group('title').strip()
3630 # Get the video date
3631 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3633 self._downloader.report_warning(u'unable to extract video date')
3636 upload_date = unified_strdate(result.group('date').strip())
3638 # Get the video uploader
3639 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3641 self._downloader.report_warning(u'unable to extract uploader')
3642 video_uploader = None
3644 video_uploader = result.group('uploader').strip()
3645 video_uploader = clean_html( video_uploader )
3647 # Get all of the formats available
3648 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3649 result = re.search(DOWNLOAD_LIST_RE, webpage)
3651 raise ExtractorError(u'Unable to extract download list')
3652 download_list_html = result.group('download_list').strip()
3654 # Get all of the links from the page
3655 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3656 links = re.findall(LINK_RE, download_list_html)
3657 if(len(links) == 0):
3658 raise ExtractorError(u'ERROR: no known formats available for video')
3660 self.to_screen(u'Links found: %d' % len(links))
3665 # A link looks like this:
3666 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3667 # A path looks like this:
3668 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3669 video_url = unescapeHTML( link )
3670 path = compat_urllib_parse_urlparse( video_url ).path
3671 extension = os.path.splitext( path )[1][1:]
# format is e.g. ['480p', '370k']; size/bitrate unpacking lines elided.
3672 format = path.split('/')[4].split('_')[:2]
3675 format = "-".join( format )
3676 title = u'%s-%s-%s' % (video_title, size, bitrate)
3681 'uploader': video_uploader,
3682 'upload_date': upload_date,
3687 'description': None,
3691 if self._downloader.params.get('listformats', None):
3692 self._print_formats(formats)
3695 req_format = self._downloader.params.get('format', None)
3696 self.to_screen(u'Format: %s' % req_format)
# Formats are assumed sorted best-first: [0] = best, [-1] = worst.
3698 if req_format is None or req_format == 'best':
3700 elif req_format == 'worst':
3701 return [formats[-1]]
3702 elif req_format in ('-1', 'all'):
3705 format = self._specific( req_format, formats )
3707 self._downloader.report_error(u'requested format not available')
# PornotubeIE: video URL and upload date scraped with two regexes; the
# title comes straight from the URL's <title> path segment.
3713 class PornotubeIE(InfoExtractor):
3714 """Information extractor for pornotube.com."""
3715 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3717 def _real_extract(self, url):
3718 mobj = re.match(self._VALID_URL, url)
# (missing `if mobj is None:` guard)
3720 self._downloader.report_error(u'invalid URL: %s' % url)
3723 video_id = mobj.group('videoid')
3724 video_title = mobj.group('title')
3726 # Get webpage content
3727 webpage = self._download_webpage(url, video_id)
3730 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3731 result = re.search(VIDEO_URL_RE, webpage)
3733 self._downloader.report_error(u'unable to extract video url')
3735 video_url = compat_urllib_parse.unquote(result.group('url'))
3737 #Get the uploaded date
3738 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3739 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): wrong error text — this failure is about the upload date,
# not the title; should read 'unable to extract video date'. Left as-is
# (runtime string; fixing it needs the elided `if result is None:` context).
3741 self._downloader.report_error(u'unable to extract video title')
3743 upload_date = unified_strdate(result.group('date'))
3745 info = {'id': video_id,
3748 'upload_date': upload_date,
3749 'title': video_title,
# YouJizzIE: two-step extraction — find the embed page in the watch page,
# then read the player's encoded file variable from the embed page.
3755 class YouJizzIE(InfoExtractor):
3756 """Information extractor for youjizz.com."""
3757 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3759 def _real_extract(self, url):
3760 mobj = re.match(self._VALID_URL, url)
# (missing `if mobj is None:` guard)
3762 self._downloader.report_error(u'invalid URL: %s' % url)
3765 video_id = mobj.group('videoid')
3767 # Get webpage content
3768 webpage = self._download_webpage(url, video_id)
3770 # Get the video title
3771 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3773 raise ExtractorError(u'ERROR: unable to extract video title')
3774 video_title = result.group('title').strip()
3776 # Get the embed page
3777 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3779 raise ExtractorError(u'ERROR: unable to extract embed page')
3781 embed_page_url = result.group(0).strip()
3782 video_id = result.group('videoid')
3784 webpage = self._download_webpage(embed_page_url, video_id)
3787 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3789 raise ExtractorError(u'ERROR: unable to extract video url')
3790 video_url = result.group('source')
3792 info = {'id': video_id,
3794 'title': video_title,
# player_url is the SWF/embed URL used for rtmpdump-style downloads.
3797 'player_url': embed_page_url}
# EightTracksIE: 8tracks.com mixes — parses the PAGE.mix JSON blob, then
# walks the play/next API with a random session id until at_last_track.
3801 class EightTracksIE(InfoExtractor):
3803 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3805 def _real_extract(self, url):
3806 mobj = re.match(self._VALID_URL, url)
# (missing `if mobj is None:` guard)
3808 raise ExtractorError(u'Invalid URL: %s' % url)
3809 playlist_id = mobj.group('id')
3811 webpage = self._download_webpage(url, playlist_id)
3813 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3815 raise ExtractorError(u'Cannot find trax information')
3816 json_like = m.group(1)
3817 data = json.loads(json_like)
# NOTE(review): `mix_id` is used below but its assignment (presumably
# mix_id = data['id']) is among the elided lines — confirm against the
# full file.
3819 session = str(random.randint(0, 1000000000))
3821 track_count = data['tracks_count']
3822 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3823 next_url = first_url
3825 for i in itertools.count():
3826 api_json = self._download_webpage(next_url, playlist_id,
3827 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3828 errnote=u'Failed to download song information')
3829 api_data = json.loads(api_json)
3830 track_data = api_data[u'set']['track']
3832 'id': track_data['id'],
3833 'url': track_data['track_file_stream_url'],
3834 'title': track_data['performer'] + u' - ' + track_data['name'],
3835 'raw_title': track_data['name'],
3836 'uploader_id': data['user']['login'],
3840 if api_data['set']['at_last_track']:
3842 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: keek.com — video and thumbnail URLs are derived directly from
# the id; title/uploader come from og:title and the user bio block.
3845 class KeekIE(InfoExtractor):
3846 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3849 def _real_extract(self, url):
3850 m = re.match(self._VALID_URL, url)
3851 video_id = m.group('videoID')
3852 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3853 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3854 webpage = self._download_webpage(url, video_id)
3855 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3856 title = unescapeHTML(m.group('title'))
3857 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3858 uploader = clean_html(m.group('uploader'))
3864 'thumbnail': thumbnail,
3865 'uploader': uploader
# TEDIE: ted.com talks and playlists. A talk URL yields one video; a
# playlist URL is expanded into url_results pointing back at this IE.
3869 class TEDIE(InfoExtractor):
3870 _VALID_URL=r'''http://www\.ted\.com/
3872 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3874 ((?P<type_talk>talks)) # We have a simple talk
3876 (/lang/(.*?))? # The url may contain the language
3877 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL is a verbose regex (needs re.VERBOSE).
3881 def suitable(cls, url):
3882 """Receives a URL and returns True if suitable for this IE."""
3883 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3885 def _real_extract(self, url):
3886 m=re.match(self._VALID_URL, url, re.VERBOSE)
3887 if m.group('type_talk'):
3888 return [self._talk_info(url)]
3890 playlist_id=m.group('playlist_id')
3891 name=m.group('name')
3892 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3893 return [self._playlist_videos_info(url,name,playlist_id)]
3895 def _talk_video_link(self,mediaSlug):
3896 '''Returns the video link for that mediaSlug'''
3897 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3899 def _playlist_videos_info(self,url,name,playlist_id=0):
3900 '''Returns the videos of the playlist'''
3902 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3903 ([.\s]*?)data-playlist_item_id="(\d+)"
3904 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3906 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3907 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3908 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3909 m_names=re.finditer(video_name_RE,webpage)
3911 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3912 m_playlist = re.search(playlist_RE, webpage)
3913 playlist_title = m_playlist.group('playlist_title')
3915 playlist_entries = []
3916 for m_video, m_name in zip(m_videos,m_names):
3917 video_id=m_video.group('video_id')
# Delegate each talk back to this extractor via a url_result entry.
3918 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3919 playlist_entries.append(self.url_result(talk_url, 'TED'))
3920 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3922 def _talk_info(self, url, video_id=0):
3923 """Return the video for the talk in the url"""
3924 m=re.match(self._VALID_URL, url,re.VERBOSE)
3925 videoName=m.group('name')
3926 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3927 # If the url includes the language we get the title translated
3928 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3929 title=re.search(title_RE, webpage).group('title')
3930 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3931 "id":(?P<videoID>[\d]+).*?
3932 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3933 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3934 thumb_match=re.search(thumb_RE,webpage)
3935 info_match=re.search(info_RE,webpage,re.VERBOSE)
3936 video_id=info_match.group('videoID')
3937 mediaSlug=info_match.group('mediaSlug')
3938 video_url=self._talk_video_link(mediaSlug)
3944 'thumbnail': thumb_match.group('thumbnail')
# MySpassIE: myspass.de — the video id is the last (or second-to-last,
# given a trailing slash) URL path element; all metadata comes from the
# site's XML metadata endpoint.
3948 class MySpassIE(InfoExtractor):
3949 _VALID_URL = r'http://www.myspass.de/.*'
3951 def _real_extract(self, url):
3952 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3954 # video id is the last path element of the URL
3955 # usually there is a trailing slash, so also try the second but last
3956 url_path = compat_urllib_parse_urlparse(url).path
3957 url_parent_path, video_id = os.path.split(url_path)
# (the `if not video_id:` guard for the trailing-slash case is elided)
3959 _, video_id = os.path.split(url_parent_path)
3962 metadata_url = META_DATA_URL_TEMPLATE % video_id
3963 metadata_text = self._download_webpage(metadata_url, video_id)
3964 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3966 # extract values from metadata
3967 url_flv_el = metadata.find('url_flv')
3968 if url_flv_el is None:
3969 self._downloader.report_error(u'unable to extract download url')
3971 video_url = url_flv_el.text
3972 extension = os.path.splitext(video_url)[1][1:]
3973 title_el = metadata.find('title')
3974 if title_el is None:
3975 self._downloader.report_error(u'unable to extract title')
3977 title = title_el.text
3978 format_id_el = metadata.find('format_id')
3979 if format_id_el is None:
3982 format = format_id_el.text
3983 description_el = metadata.find('description')
3984 if description_el is not None:
3985 description = description_el.text
3988 imagePreview_el = metadata.find('imagePreview')
3989 if imagePreview_el is not None:
3990 thumbnail = imagePreview_el.text
3999 'thumbnail': thumbnail,
4000 'description': description
# SpiegelIE: spiegel.de videos — title from the page, then an XML manifest
# whose last entry is taken as the best available variant.
4004 class SpiegelIE(InfoExtractor):
4005 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4007 def _real_extract(self, url):
4008 m = re.match(self._VALID_URL, url)
4009 video_id = m.group('videoID')
4011 webpage = self._download_webpage(url, video_id)
4012 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4014 raise ExtractorError(u'Cannot find title')
4015 video_title = unescapeHTML(m.group(1))
4017 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4018 xml_code = self._download_webpage(xml_url, video_id,
4019 note=u'Downloading XML', errnote=u'Failed to download XML')
4021 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The manifest lists variants in ascending quality; take the last one.
4022 last_type = idoc[-1]
4023 filename = last_type.findall('./filename')[0].text
4024 duration = float(last_type.findall('./duration')[0].text)
4026 video_url = 'http://video2.spiegel.de/flash/' + filename
4027 video_ext = filename.rpartition('.')[2]
4032 'title': video_title,
4033 'duration': duration,
# LiveLeakIE: liveleak.com view pages — player file URL plus og: metadata;
# title has the site prefix stripped, uploader is best-effort.
4037 class LiveLeakIE(InfoExtractor):
4039 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4040 IE_NAME = u'liveleak'
4042 def _real_extract(self, url):
4043 mobj = re.match(self._VALID_URL, url)
# (missing `if mobj is None:` guard)
4045 self._downloader.report_error(u'invalid URL: %s' % url)
4048 video_id = mobj.group('video_id')
4050 webpage = self._download_webpage(url, video_id)
4052 m = re.search(r'file: "(.*?)",', webpage)
4054 self._downloader.report_error(u'unable to find video url')
4056 video_url = m.group(1)
4058 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4060 self._downloader.report_error(u'Cannot find video title')
4061 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4063 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4065 desc = unescapeHTML(m.group('desc'))
4069 m = re.search(r'By:.*?(\w+)</a>', webpage)
4071 uploader = clean_html(m.group(1))
4080 'description': desc,
4081 'uploader': uploader
# ARDIE: ARD Mediathek / daserste.de — collects all addMediaStream calls,
# keeps the default media type, picks the highest quality, and handles
# both RTMP and plain HTTP delivery.
4086 class ARDIE(InfoExtractor):
4087 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4088 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4089 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4091 def _real_extract(self, url):
4092 # determine video id from url
4093 m = re.match(self._VALID_URL, url)
# Prefer the explicit documentId query parameter when present.
4095 numid = re.search(r'documentId=([0-9]+)', url)
4097 video_id = numid.group(1)
4099 video_id = m.group('video_id')
4101 # determine title and media streams from webpage
4102 html = self._download_webpage(url, video_id)
4103 title = re.search(self._TITLE, html).group('title')
4104 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams found => the page is age-restricted ("fsk") until 8 pm.
4106 assert '"fsk"' in html
4107 self._downloader.report_error(u'this video is only available after 8:00 pm')
4110 # choose default media type and highest quality for now
4111 stream = max([s for s in streams if int(s["media_type"]) == 0],
4112 key=lambda s: int(s["quality"]))
4114 # there's two possibilities: RTMP stream or HTTP download
4115 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4116 if stream['rtmp_url']:
4117 self.to_screen(u'RTMP download detected')
4118 assert stream['video_url'].startswith('mp4:')
4119 info["url"] = stream["rtmp_url"]
4120 info["play_path"] = stream['video_url']
4122 assert stream["video_url"].endswith('.mp4')
4123 info["url"] = stream["video_url"]
# TumblrIE: tumblr posts — rebuilds the canonical post URL, then reads the
# escaped (\x22-quoted) video_file URL, first poster thumb, and <title>.
4126 class TumblrIE(InfoExtractor):
4127 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4129 def _real_extract(self, url):
4130 m_url = re.match(self._VALID_URL, url)
4131 video_id = m_url.group('id')
4132 blog = m_url.group('blog_name')
4134 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4135 webpage = self._download_webpage(url, video_id)
# The embedded player markup is JS-escaped, hence the \x22 quote marks.
4137 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4138 video = re.search(re_video, webpage)
# NOTE(review): "founded" should read "found" — runtime string, so it is
# left unchanged in this documentation-only pass.
4140 self.to_screen("No video founded")
4142 video_url = video.group('video_url')
4143 ext = video.group('ext')
4145 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4146 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4148 # The only place where you can get a title, it's not complete,
4149 # but searching in other places doesn't work for all videos
4150 re_title = r'<title>(?P<title>.*?)</title>'
4151 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4153 return [{'id': video_id,
# Extractor for free track downloads on Bandcamp artist pages.
4160 class BandcampIE(InfoExtractor):
4161 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4163 def _real_extract(self, url):
4164 mobj = re.match(self._VALID_URL, url)
4165 title = mobj.group('title')
4166 webpage = self._download_webpage(url, title)
4167 # We get the link to the free download page
4168 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4169 if m_download is None:
# NOTE(review): message has a typo ("founded" -> "found"); left unchanged
# because this excerpt is incomplete.
4170 self._downloader.report_error('No free songs founded')
4172 download_link = m_download.group(1)
# Numeric track id scraped from the TralbumData JS object.
# NOTE(review): local name shadows the builtin `id`; fixing it would require
# touching lines not visible here.
4173 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4174 webpage, re.MULTILINE|re.DOTALL).group('id')
4176 download_webpage = self._download_webpage(download_link, id,
4177 'Downloading free downloads page')
4178 # We get the dictionary of the track from some javascrip code
4179 info = re.search(r'items: (.*?),$',
4180 download_webpage, re.MULTILINE).group(1)
4181 info = json.loads(info)[0]
4182 # We pick mp3-320 for now, until format selection can be easily implemented.
4183 mp3_info = info[u'downloads'][u'mp3-320']
4184 # If we try to use this url it says the link has expired
4185 initial_url = mp3_info[u'url']
# Tear the expired URL apart to rebuild a working one below.
4186 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4187 m_url = re.match(re_url, initial_url)
4188 #We build the url we will use to get the final track url
4189 # This url is build in Bandcamp in the script download_bunde_*.js
# The .rand value is hard-coded; because of that the server answers with a
# "retry_url" instead of "download_url" (see comment below).
4190 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4191 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4192 # If we could correctly generate the .rand field the url would be
4193 #in the "download_url" key
4194 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
# Assemble the info dict (closing of this literal is outside this excerpt).
4196 track_info = {'id':id,
4197 'title' : info[u'title'],
4200 'thumbnail' : info[u'thumb_url'],
4201 'uploader' : info[u'artist']
4206 class RedTubeIE(InfoExtractor):
4207 """Information Extractor for redtube"""
# Numeric video id is the only path component.
4208 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4210 def _real_extract(self,url):
4211 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match _VALID_URL (guard line elided in excerpt).
4213 raise ExtractorError(u'Invalid URL: %s' % url)
4215 video_id = mobj.group('id')
# Site serves plain mp4; extension is fixed rather than detected.
4216 video_extension = 'mp4'
4217 webpage = self._download_webpage(url, video_id)
4218 self.report_extraction(video_id)
# Direct media URL from the HTML5 <source> tag.
4219 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4222 raise ExtractorError(u'Unable to extract media URL')
4224 video_url = mobj.group(1)
# Title from the page headline element.
4225 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4227 raise ExtractorError(u'Unable to extract title')
4228 video_title = mobj.group(1)
# Remaining lines belong to the returned info dict (frame elided in excerpt).
4233 'ext': video_extension,
4234 'title': video_title,
4238 def gen_extractors():
4239 """ Return a list of an instance of every supported extractor.
4240 The order does matter; the first extractor matched is the one handling the URL.
# The full list of instantiated extractors is elided in this excerpt; only a
# few representative entries are visible below.
4243 YoutubePlaylistIE(),
4268 StanfordOpenClassroomIE(),
4278 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up the extractor class named ``<ie_name>IE`` in this module.

    Raises KeyError if no such class is defined at module level.
    """
    return globals()['%sIE' % ie_name]