2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Store the downloader (may be None) via the shared setter so that
        # subclasses always find it under self._downloader.
        self.set_downloader(downloader)
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # True iff the extractor's _VALID_URL regex matches at the start of
        # the URL.  _VALID_URL is defined by each subclass.
        return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates the actual work to the subclass hook _real_extract().
        return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Hook overridden by subclasses that need setup (see
        # YoutubeIE._real_initialize for an example).
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Hook overridden by subclasses; per the class docstring it must
        # return a list of information dictionaries.
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 note = u'Downloading video webpage'
118 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
120 return compat_urllib_request.urlopen(url_or_request)
121 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
123 errnote = u'Unable to download webpage'
124 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
126 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
127 """ Returns the data of the page as a string """
128 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
129 content_type = urlh.headers.get('Content-Type', '')
130 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
132 encoding = m.group(1)
135 webpage_bytes = urlh.read()
136 return webpage_bytes.decode(encoding, 'replace')
139 class YoutubeIE(InfoExtractor):
140 """Information extractor for youtube.com."""
144 (?:https?://)? # http(s):// (optional)
145 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
146 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
147 (?:.*?\#/)? # handle anchor (#/) redirect urls
148 (?: # the various things that can precede the ID:
149 (?:(?:v|embed|e)/) # v/ or embed/ or e/
150 |(?: # or the v= param in all its forms
151 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
152 (?:\?|\#!?) # the params delimiter ? or # or #!
153 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
156 )? # optional -> youtube.com/xxxx is OK
157 )? # all until now is optional -> you can pass the naked ID
158 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
159 (?(1).+)? # if we found the ID, everything can follow
161 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
162 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
163 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
164 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
165 _NETRC_MACHINE = 'youtube'
166 # Listed in order of quality
167 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
168 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
169 _video_extensions = {
175 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
181 _video_dimensions = {
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs are claimed by YoutubePlaylistIE, never by this IE.
        if YoutubePlaylistIE.suitable(url): return False
        # _VALID_URL is written with re.VERBOSE (inline comments/whitespace).
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
205 def report_lang(self):
206 """Report attempt to set language."""
207 self._downloader.to_screen(u'[youtube] Setting language')
209 def report_login(self):
210 """Report attempt to log in."""
211 self._downloader.to_screen(u'[youtube] Logging in')
213 def report_age_confirmation(self):
214 """Report attempt to confirm age."""
215 self._downloader.to_screen(u'[youtube] Confirming age')
217 def report_video_webpage_download(self, video_id):
218 """Report attempt to download video webpage."""
219 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
221 def report_video_info_webpage_download(self, video_id):
222 """Report attempt to download video info webpage."""
223 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        # Docstring fixed: it previously duplicated the "video info webpage"
        # text from the method above, but this reports the subtitle check.
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (language + format)."""
        # Docstring fixed: it previously duplicated the "video info webpage"
        # text; this method announces a subtitle download.
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
233 def report_video_subtitles_available(self, video_id, sub_lang_list):
234 """Report available subtitles."""
235 sub_lang = ",".join(list(sub_lang_list.keys()))
236 self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
238 def report_information_extraction(self, video_id):
239 """Report attempt to extract video information."""
240 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # Docstring fixed: it previously said "Report extracted video URL",
        # which does not match the message emitted below.
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
246 def report_rtmp_download(self):
247 """Indicate the download will use the RTMP protocol."""
248 self._downloader.to_screen(u'[youtube] RTMP download detected')
250 def _get_available_subtitles(self, video_id):
251 self.report_video_subtitles_download(video_id)
252 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
254 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
255 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
256 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
257 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
258 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
259 if not sub_lang_list:
260 return (u'WARNING: video doesn\'t have subtitles', None)
263 def _list_available_subtitles(self, video_id):
264 sub_lang_list = self._get_available_subtitles(video_id)
265 self.report_video_subtitles_available(video_id, sub_lang_list)
267 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
268 self.report_video_subtitles_request(video_id, sub_lang, format)
269 params = compat_urllib_parse.urlencode({
275 url = 'http://www.youtube.com/api/timedtext?' + params
277 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
278 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
279 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
281 return (u'WARNING: Did not fetch video subtitles', None)
282 return (None, sub_lang, sub)
284 def _extract_subtitle(self, video_id):
286 Return a list with a tuple:
287 [(error_message, sub_lang, sub)]
289 sub_lang_list = self._get_available_subtitles(video_id)
290 sub_format = self._downloader.params.get('subtitlesformat')
291 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
292 return [(sub_lang_list[0], None, None)]
293 if self._downloader.params.get('subtitleslang', False):
294 sub_lang = self._downloader.params.get('subtitleslang')
295 elif 'en' in sub_lang_list:
298 sub_lang = list(sub_lang_list.keys())[0]
299 if not sub_lang in sub_lang_list:
300 return [(u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None, None)]
302 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
305 def _extract_all_subtitles(self, video_id):
306 sub_lang_list = self._get_available_subtitles(video_id)
307 sub_format = self._downloader.params.get('subtitlesformat')
309 for sub_lang in sub_lang_list:
310 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
311 subtitles.append(subtitle)
314 def _print_formats(self, formats):
315 print('Available formats:')
317 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        # Session setup: force the English site, optionally log in with
        # configured credentials, then confirm the age gate.  Failures are
        # reported through the downloader as warnings/errors, not raised.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                # Credentials come from the _NETRC_MACHINE ('youtube') entry.
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force the site language (via _LANG_URL) so scraping is stable.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX / dsh tokens the Google login form requires.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Form fields mimic the browser login form (unicode at this point).
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode input.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm the age gate by POSTing the verify_age form (_AGE_URL).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
423 def _extract_id(self, url):
424 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
426 self._downloader.report_error(u'invalid URL: %s' % url)
428 video_id = mobj.group(2)
    def _real_extract(self, url):
        # Builds one info dict per selected format for a single video.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page (used below for uploader id, upload date,
        # description and the SWF player URL).
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several get_video_info variants until one yields a 'token'.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader name (required field)
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader id (optional; scraped from the watch page)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title (required field)
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped text normalised to YYYYMMDD via strptime.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description (optional; HTML cleaned)
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # subtitles (driven by downloader params)
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        # duration (optional)
        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Comma-separated list of querystring-encoded per-format dicts.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # One result dict per selected (format, url) pair.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
636 class MetacafeIE(InfoExtractor):
637 """Information Extractor for metacafe.com."""
639 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
640 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
641 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
642 IE_NAME = u'metacafe'
    def __init__(self, downloader=None):
        # Delegates straight to the base class; only stores the downloader.
        InfoExtractor.__init__(self, downloader)
647 def report_disclaimer(self):
648 """Report disclaimer retrieval."""
649 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
651 def report_age_confirmation(self):
652 """Report attempt to confirm age."""
653 self._downloader.to_screen(u'[metacafe] Confirming age')
655 def report_download_webpage(self, video_id):
656 """Report webpage download."""
657 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
659 def report_extraction(self, video_id):
660 """Report information extraction."""
661 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
663 def _real_initialize(self):
664 # Retrieve disclaimer
665 request = compat_urllib_request.Request(self._DISCLAIMER)
667 self.report_disclaimer()
668 disclaimer = compat_urllib_request.urlopen(request).read()
669 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
670 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
676 'submit': "Continue - I'm over 18",
678 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
680 self.report_age_confirmation()
681 disclaimer = compat_urllib_request.urlopen(request).read()
682 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
683 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # 'yt-' prefixed ids are delegated to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            # Direct mediaURL present in the page.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: parse the mediaData out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.report_error(u'unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.report_error(u'unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the .decode('utf-8') calls assume byte strings
        # (Python 2 semantics) — confirm before running under Python 3.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
763 class DailymotionIE(InfoExtractor):
764 """Information Extractor for Dailymotion"""
766 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
767 IE_NAME = u'dailymotion'
    def __init__(self, downloader=None):
        # Delegates straight to the base class; only stores the downloader.
        InfoExtractor.__init__(self, downloader)
773 def report_extraction(self, video_id):
774 """Report information extraction."""
775 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # Video id is the path component before any '_' suffix / querystring.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos render normally.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality key, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder the groups to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
852 class PhotobucketIE(InfoExtractor):
853 """Information extractor for photobucket.com."""
855 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
856 IE_NAME = u'photobucket'
    def __init__(self, downloader=None):
        # Delegates straight to the base class; only stores the downloader.
        InfoExtractor.__init__(self, downloader)
861 def report_download_webpage(self, video_id):
862 """Report webpage download."""
863 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
865 def report_extraction(self, video_id):
866 """Report information extraction."""
867 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # The <title> tag carries both the video title and the uploader name.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): .decode('utf-8') assumes byte strings (Python 2
        # semantics) — confirm before running under Python 3.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
917 class YahooIE(InfoExtractor):
918 """Information extractor for video.yahoo.com."""
921 # _VALID_URL matches all Yahoo! Video URLs
922 # _VPAGE_URL matches only the extractable '/watch/' URLs
923 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
924 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
925 IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        # Delegates straight to the base class; only stores the downloader.
        InfoExtractor.__init__(self, downloader)
930 def report_download_webpage(self, video_id):
931 """Report webpage download."""
932 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
934 def report_extraction(self, video_id):
935 """Report information extraction."""
936 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
938 def _real_extract(self, url, new_video=True):
939 # Extract ID from URL
940 mobj = re.match(self._VALID_URL, url)
942 self._downloader.report_error(u'Invalid URL: %s' % url)
945 video_id = mobj.group(2)
946 video_extension = 'flv'
948 # Rewrite valid but non-extractable URLs as
949 # extractable English language /watch/ URLs
950 if re.match(self._VPAGE_URL, url) is None:
951 request = compat_urllib_request.Request(url)
953 webpage = compat_urllib_request.urlopen(request).read()
954 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
955 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
958 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
960 self._downloader.report_error(u'Unable to extract id field')
962 yahoo_id = mobj.group(1)
964 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
966 self._downloader.report_error(u'Unable to extract vid field')
968 yahoo_vid = mobj.group(1)
970 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
971 return self._real_extract(url, new_video=False)
973 # Retrieve video webpage to extract further information
974 request = compat_urllib_request.Request(url)
976 self.report_download_webpage(video_id)
977 webpage = compat_urllib_request.urlopen(request).read()
978 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
979 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
982 # Extract uploader and title from webpage
983 self.report_extraction(video_id)
984 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
986 self._downloader.report_error(u'unable to extract video title')
988 video_title = mobj.group(1).decode('utf-8')
990 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
992 self._downloader.report_error(u'unable to extract video uploader')
994 video_uploader = mobj.group(1).decode('utf-8')
996 # Extract video thumbnail
997 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
999 self._downloader.report_error(u'unable to extract video thumbnail')
1001 video_thumbnail = mobj.group(1).decode('utf-8')
1003 # Extract video description
1004 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1006 self._downloader.report_error(u'unable to extract video description')
1008 video_description = mobj.group(1).decode('utf-8')
1009 if not video_description:
1010 video_description = 'No description available.'
1012 # Extract video height and width
1013 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1015 self._downloader.report_error(u'unable to extract video height')
1017 yv_video_height = mobj.group(1)
1019 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1021 self._downloader.report_error(u'unable to extract video width')
1023 yv_video_width = mobj.group(1)
1025 # Retrieve video playlist to extract media URL
1026 # I'm not completely sure what all these options are, but we
1027 # seem to need most of them, otherwise the server sends a 401.
1028 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1029 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1030 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1031 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1032 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1034 self.report_download_webpage(video_id)
1035 webpage = compat_urllib_request.urlopen(request).read()
1036 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1037 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1040 # Extract media URL from playlist XML
1041 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1043 self._downloader.report_error(u'Unable to extract media URL')
1045 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1046 video_url = unescapeHTML(video_url)
1049 'id': video_id.decode('utf-8'),
1051 'uploader': video_uploader,
1052 'upload_date': None,
1053 'title': video_title,
1054 'ext': video_extension.decode('utf-8'),
1055 'thumbnail': video_thumbnail.decode('utf-8'),
1056 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs, including player/group/album variants;
    # the named groups drive URL normalization in _real_extract.
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        """Constructor; delegates to InfoExtractor to store the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract a single Vimeo video.

        Normalizes the URL, downloads the page, parses the embedded config
        JSON and picks the best available codec/quality pair.  Returns a
        list with one info dictionary, or None after reporting an error.
        """
        # NOTE(review): guard/try/break lines elided in the damaged source
        # were restored conservatively — verify against upstream history.

        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD) from the page metadata, if present
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # A trailing 'index-<n>.html' page id marks a live-stream URL.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Constructor; delegates to InfoExtractor to store the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body; reports and returns None on error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and collect the groups listed in
        *matchTuples* as (group_index, key, error_message) into a dict."""
        # NOTE(review): the empty-dict init and return lines were elided in
        # the damaged source and restored conservatively.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp path/player for a live-stream page (no return value)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # NOTE(review): the third capture group line was elided in the
        # damaged source; restored as the rtmp URL per the match tuples below.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Arte+7 indirection chain and return one info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to live-stream or Arte+7 extraction based on the URL tail."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): the _VALID_URL attribute line was elided in the damaged
    # source chunk — confirm against upstream (historically r'.*').
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Constructor; delegates to InfoExtractor to store the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our own opener so the custom handlers above take effect.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Last-resort extraction: look for common embedded-player patterns."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API; pages 50 results at a time.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Constructor; delegates to InfoExtractor to store the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'ytsearch[N|all]:terms' query and trigger the downloads."""
        # NOTE(review): guard/branch lines elided in the damaged source were
        # restored conservatively — verify against upstream history.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'ytsearch' keyword, keep N/'all'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never ask for more than exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next page" control in the result HTML.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Constructor; delegates to InfoExtractor to store the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearch[N|all]:terms' query and trigger the downloads."""
        # NOTE(review): guard/branch lines elided in the damaged source were
        # restored conservatively — verify against upstream history.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'gvsearch' keyword, keep N/'all'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Presence of the "Next" link in the result HTML.
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Constructor; delegates to InfoExtractor to store the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearch[N|all]:terms' query and trigger the downloads."""
        # NOTE(review): guard/branch lines elided in the damaged source were
        # restored conservatively — verify against upstream history.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'yvsearch' keyword, keep N/'all'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): several lines of this verbose regex were elided in the
    # damaged source and restored from the visible fragments — verify
    # against upstream history.  Group 1 or 2 yields the playlist id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    # GData caps playlist feeds at 50 entries per request.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Constructor; delegates to InfoExtractor to store the downloader."""
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the playlist feed and queue each entry's video URL."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so the final order matches the playlist.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means we just fetched the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # The "Next »" pager text; its absence marks the last page.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Page through the channel listing and queue every video found."""
        # NOTE(review): loop-setup and guard lines elided in the damaged
        # source were restored conservatively — verify against upstream.

        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps upload feeds at 50 entries per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Constructor; delegates to InfoExtractor to store the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids for a user via GData and queue downloads."""
        # NOTE(review): loop-setup and guard lines elided in the damaged
        # source were restored conservatively — verify against upstream.

        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): the page-size attribute line was elided in the damaged
    # source; restored as 12 per the Ajax comment below — verify upstream.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        """Constructor; delegates to InfoExtractor to store the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id, page through the Ajax episode
        list and queue every video found."""

        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # The page carries a human-readable restriction notice;
                # surface it verbatim (collapsed whitespace).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials, if any.

        Login is best-effort: failures emit a warning and extraction
        proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; continue anonymously.
            return

        # Log in via the mobile login form.
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains the login form, the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two literal JS fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment names the real file;
        # rewrite to the canonical form and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Ask for the JSON description of the video.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle so the payload is not
                    # fetched twice.
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: the attribute is self._downloader, not self._download
            # (the latter raised AttributeError on any invalid URL).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; the FLV lives
        # next to it under the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the bodies of these two tables were garbled in the
    # original; values reconstructed as all-mp4 with the conventional
    # bitrate->dimension mapping — confirm against upstream.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Override: _VALID_URL is a verbose regex, so the base-class
        # suitable() (plain re.match) would not work.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to the full-episode URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the server redirects to the episode-specific URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # Each <item> is one part of the episode; collect them all.
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain HTTP one on the CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the declared charset; fall back to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Pull metadata from the OpenGraph/meta tags; the player URL
        # carries the config URL in its query string.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the metadata and manifest XML.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the direct fragment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink into track metadata via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint maps the track id to a direct MP3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Consistency fix: use report_error like the sibling
            # SoundcloudIE instead of the deprecated trouble() API.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the set permalink into its track list via the public API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded, URL-quoted
        # RTMP stream id in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            # Recursively extract every referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            # Recursively extract every referenced course page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented FLV/MP4 downloads)."""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'youku'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-random session id expected by the CDN."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seeded character permutation used to decode file ids."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        # Linear-congruential shuffle of the source alphabet.
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated '*'-separated file id using the seeded alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            return
        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages (direct CDN mp4)."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first unescaped capture of rexp in webpage, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date'; the documented
            # optional field consumed downstream is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 alone means a whole channel (paged archive listing);
        # group 2 means a single broadcast.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose mode, so the base-class suitable() would not work.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three regexes iterate the page in lockstep: one match per trailer.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return videos
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Recorded videos are served directly off the CDN by numeric id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy) videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Extension is inferred from the matched CDN URL.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages (JSON metadata in page)."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        # Extension comes from the URL path, ignoring the query string.
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None,
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this used to test the stale 'result' regex match
            # instead of the looked-up format, so a bad --format value
            # was never reported.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: error message previously said 'unable to extract video
            # title' although this branch fails on the upload date.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the player configuration
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
# Information extractor for 8tracks.com mixes (playlists of audio tracks).
# NOTE(review): `mix_id` is used below but its assignment is on an elided line
# (around 4095, presumably `mix_id = data['id']`) — confirm against full source.
4076 class EightTracksIE(InfoExtractor):
4078 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4080 def _real_extract(self, url):
4081 mobj = re.match(self._VALID_URL, url)
4083 raise ExtractorError(u'Invalid URL: %s' % url)
4084 playlist_id = mobj.group('id')
4086 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded in the page as a JS assignment; grab the JSON blob.
4088 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4090 raise ExtractorError(u'Cannot find trax information')
4091 json_like = m.group(1)
4092 data = json.loads(json_like)
# Random session id for the play API; the server tracks position per session.
4094 session = str(random.randint(0, 1000000000))
4096 track_count = data['tracks_count']
4097 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4098 next_url = first_url
# The API only reveals one track at a time; iterate until at_last_track.
4100 for i in itertools.count():
4101 api_json = self._download_webpage(next_url, playlist_id,
4102 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4103 errnote=u'Failed to download song information')
4104 api_data = json.loads(api_json)
4105 track_data = api_data[u'set']['track']
# Per-track info dict; appended to a result list on an elided line.
4107 'id': track_data['id'],
4108 'url': track_data['track_file_stream_url'],
4109 'title': track_data['performer'] + u' - ' + track_data['name'],
4110 'raw_title': track_data['name'],
4111 'uploader_id': data['user']['login'],
# Stop once the API flags the final track; otherwise request the next one,
# passing the current track id so the server advances the session.
4115 if api_data['set']['at_last_track']:
4117 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Information extractor for keek.com.
# NOTE(review): no `if m is None:` guards are visible here (lines 4122-4123,
# 4134-4138 elided) — a non-matching page would raise AttributeError on
# m.group(); confirm against full source.
4120 class KeekIE(InfoExtractor):
4121 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4124 def _real_extract(self, url):
4125 m = re.match(self._VALID_URL, url)
4126 video_id = m.group('videoID')
# Video and thumbnail URLs are derived directly from the id on the CDN.
4127 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4128 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4129 webpage = self._download_webpage(url, video_id)
# Title from the OpenGraph meta tag; uploader from the profile section.
4130 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
4131 title = unescapeHTML(m.group('title'))
4132 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4133 uploader = clean_html(m.group('uploader'))
4139 'thumbnail': thumbnail,
4140 'uploader': uploader
# Information extractor for ted.com: handles both single talks and playlists.
# NOTE(review): _VALID_URL is a re.VERBOSE pattern whose alternation lines
# (4146, 4148, 4150, ...) are elided in this listing.
4144 class TEDIE(InfoExtractor):
4145 _VALID_URL=r'''http://www.ted.com/
4147 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4149 ((?P<type_talk>talks)) # We have a simple talk
4151 /(?P<name>\w+) # Here goes the name and then ".html"
# Overrides the base suitable() (see HEAD) because the pattern needs
# re.VERBOSE to compile its commented, multi-line form.
4155 def suitable(cls, url):
4156 """Receives a URL and returns True if suitable for this IE."""
4157 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4159 def _real_extract(self, url):
# Dispatch on which named group matched: single talk vs playlist.
4160 m=re.match(self._VALID_URL, url, re.VERBOSE)
4161 if m.group('type_talk'):
4162 return [self._talk_info(url)]
4164 playlist_id=m.group('playlist_id')
4165 name=m.group('name')
4166 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4167 return self._playlist_videos_info(url,name,playlist_id)
4169 def _talk_video_link(self,mediaSlug):
4170 '''Returns the video link for that mediaSlug'''
4171 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4173 def _playlist_videos_info(self,url,name,playlist_id=0):
4174 '''Returns the videos of the playlist'''
# Two parallel scans of the playlist page: one for ids/slugs, one for the
# per-talk URLs/titles; zipped together below.
4176 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4177 ([.\s]*?)data-playlist_item_id="(\d+)"
4178 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4180 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4181 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4182 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4183 m_names=re.finditer(video_name_RE,webpage)
4185 for m_video, m_name in zip(m_videos,m_names):
4186 video_id=m_video.group('video_id')
4187 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
# Each talk page is fetched individually for its full info dict.
4188 info.append(self._talk_info(talk_url,video_id))
4191 def _talk_info(self, url, video_id=0):
4192 """Return the video for the talk in the url"""
4193 m=re.match(self._VALID_URL, url,re.VERBOSE)
4194 videoName=m.group('name')
4195 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4196 # If the url includes the language we get the title translated
4197 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4198 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and the mediaSlug used to build
# the direct download URL via _talk_video_link().
4199 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4200 "id":(?P<videoID>[\d]+).*?
4201 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4202 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4203 thumb_match=re.search(thumb_RE,webpage)
4204 info_match=re.search(info_RE,webpage,re.VERBOSE)
4205 video_id=info_match.group('videoID')
4206 mediaSlug=info_match.group('mediaSlug')
4207 video_url=self._talk_video_link(mediaSlug)
4213 'thumbnail': thumb_match.group('thumbnail')
# Information extractor for myspass.de: all metadata comes from one XML
# endpoint keyed by the video id.
4217 class MySpassIE(InfoExtractor):
4218 _VALID_URL = r'http://www.myspass.de/.*'
4220 def _real_extract(self, url):
4221 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4223 # video id is the last path element of the URL
4224 # usually there is a trailing slash, so also try the second but last
4225 url_path = compat_urllib_parse_urlparse(url).path
4226 url_parent_path, video_id = os.path.split(url_path)
# Fallback split (elided guard around line 4227 presumably checks for an
# empty/trailing-slash component first).
4228 _, video_id = os.path.split(url_parent_path)
4231 metadata_url = META_DATA_URL_TEMPLATE % video_id
4232 metadata_text = self._download_webpage(metadata_url, video_id)
# Re-encode to bytes: ElementTree parses the XML declaration's own encoding.
4233 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4235 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are optional
# (their None branches are on elided lines).
4236 url_flv_el = metadata.find('url_flv')
4237 if url_flv_el is None:
4238 self._downloader.report_error(u'unable to extract download url')
4240 video_url = url_flv_el.text
# File extension taken from the flv URL path, minus the leading dot.
4241 extension = os.path.splitext(video_url)[1][1:]
4242 title_el = metadata.find('title')
4243 if title_el is None:
4244 self._downloader.report_error(u'unable to extract title')
4246 title = title_el.text
4247 format_id_el = metadata.find('format_id')
4248 if format_id_el is None:
4251 format = format_id_el.text
4252 description_el = metadata.find('description')
4253 if description_el is not None:
4254 description = description_el.text
4257 imagePreview_el = metadata.find('imagePreview')
4258 if imagePreview_el is not None:
4259 thumbnail = imagePreview_el.text
4268 'thumbnail': thumbnail,
4269 'description': description
# Information extractor for spiegel.de videos: title from the HTML page,
# stream details from a per-video XML manifest.
4273 class SpiegelIE(InfoExtractor):
4274 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4276 def _real_extract(self, url):
4277 m = re.match(self._VALID_URL, url)
4278 video_id = m.group('videoID')
4280 webpage = self._download_webpage(url, video_id)
4281 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4283 raise ExtractorError(u'Cannot find title')
4284 video_title = unescapeHTML(m.group(1))
# The XML manifest lists one element per available format.
4286 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4287 xml_code = self._download_webpage(xml_url, video_id,
4288 note=u'Downloading XML', errnote=u'Failed to download XML')
4290 idoc = xml.etree.ElementTree.fromstring(xml_code)
# Last child element is taken as the best/highest-quality format —
# presumably the manifest lists formats in ascending quality; confirm.
4291 last_type = idoc[-1]
4292 filename = last_type.findall('./filename')[0].text
4293 duration = float(last_type.findall('./duration')[0].text)
4295 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension is whatever follows the final dot in the manifest filename.
4296 video_ext = filename.rpartition('.')[2]
4301 'title': video_title,
4302 'duration': duration,
# Information extractor for liveleak.com.
# NOTE(review): error reporting is inconsistent — invalid URL and missing
# title use self._downloader.trouble(), missing video url uses
# report_error(); should be unified on report_error upstream.
4306 class LiveLeakIE(InfoExtractor):
4308 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4309 IE_NAME = u'liveleak'
4311 def _real_extract(self, url):
4312 mobj = re.match(self._VALID_URL, url)
4314 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4317 video_id = mobj.group('video_id')
4319 webpage = self._download_webpage(url, video_id)
# Direct file URL from the embedded player config.
4321 m = re.search(r'file: "(.*?)",', webpage)
4323 self._downloader.report_error(u'unable to find video url')
4325 video_url = m.group(1)
# Title from OpenGraph metadata, with the site-name prefix stripped.
4327 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4329 self._downloader.trouble(u'Cannot find video title')
4330 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
# Description and uploader are optional (the None-default branches sit on
# elided lines 4333/4335-4337/4339).
4332 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4334 desc = unescapeHTML(m.group('desc'))
4338 m = re.search(r'By:.*?(\w+)</a>', webpage)
4340 uploader = clean_html(m.group(1))
4349 'description': desc,
4350 'uploader': uploader
4356 def gen_extractors():
4357 """ Return a list of an instance of every supported extractor.
4358 The order does matter; the first extractor matched is the one handling the URL.
4361 YoutubePlaylistIE(),
4386 StanfordOpenClassroomIE(),
4396 WorldStarHipHopIE(),