2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False      # whether _real_initialize() has already run
    _downloader = None  # the FileDownloader instance driving this IE
    _WORKING = True     # set to False in subclasses for broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the (possibly expensive) real initialization once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name without the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        return webpage_bytes.decode(encoding, 'replace')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this class was reconstructed from an extraction that lost
    # indentation and several control-flow lines; verify against upstream.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (entries other than '38' were lost in the
    # extraction and restored from upstream -- TODO confirm)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimension hint shown by --list-formats (restored -- TODO confirm)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: lang_name} or an (error_message, None) tuple."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick the requested language, falling back to English, then any.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print the formats with their extension and dimensions (for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # The login form needs the GALX and dsh tokens embedded in the page.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video id from a YouTube URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:  # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): reconstructed from a mangled extraction (lost indentation
    # and guard lines); verify against upstream.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age: POST the family filter form to disable filtering.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars-embedded media data.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): reconstructed from a mangled extraction (lost indentation
    # and guard lines); verify against upstream.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality available, from highest to lowest.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; emit YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): reconstructed from a mangled extraction (lost indentation
    # and guard lines); verify against upstream.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
944 class YahooIE(InfoExtractor):
945 """Information extractor for video.yahoo.com."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); guard lines such as "if mobj is None:", "try:" and
# "return" appear to be omitted here — confirm against the full source before
# relying on the visible control flow.
948 # _VALID_URL matches all Yahoo! Video URLs
949 # _VPAGE_URL matches only the extractable '/watch/' URLs
950 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
951 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
952 IE_NAME = u'video.yahoo'
954 def __init__(self, downloader=None):
955 InfoExtractor.__init__(self, downloader)
957 def report_download_webpage(self, video_id):
958 """Report webpage download."""
959 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
961 def report_extraction(self, video_id):
962 """Report information extraction."""
963 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
965 def _real_extract(self, url, new_video=True):
# Extract the video id, rewrite non-/watch/ URLs to the canonical
# /watch/ form (recursing once with new_video=False), then scrape the
# watch page and the playlist XML for the final media URL.
966 # Extract ID from URL
967 mobj = re.match(self._VALID_URL, url)
969 self._downloader.report_error(u'Invalid URL: %s' % url)
972 video_id = mobj.group(2)
973 video_extension = 'flv'
975 # Rewrite valid but non-extractable URLs as
976 # extractable English language /watch/ URLs
977 if re.match(self._VPAGE_URL, url) is None:
978 request = compat_urllib_request.Request(url)
980 webpage = compat_urllib_request.urlopen(request).read()
981 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
982 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
985 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
987 self._downloader.report_error(u'Unable to extract id field')
989 yahoo_id = mobj.group(1)
991 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
993 self._downloader.report_error(u'Unable to extract vid field')
995 yahoo_vid = mobj.group(1)
# Re-enter extraction with the canonical /watch/ URL; new_video=False
# prevents a second rewrite pass.
997 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
998 return self._real_extract(url, new_video=False)
1000 # Retrieve video webpage to extract further information
1001 request = compat_urllib_request.Request(url)
1003 self.report_download_webpage(video_id)
1004 webpage = compat_urllib_request.urlopen(request).read()
1005 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1006 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1009 # Extract uploader and title from webpage
1010 self.report_extraction(video_id)
1011 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1013 self._downloader.report_error(u'unable to extract video title')
1015 video_title = mobj.group(1).decode('utf-8')
1017 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1019 self._downloader.report_error(u'unable to extract video uploader')
# NOTE(review): the uploader name is the second capture group of the regex
# above, yet group(1) is read here (group(1) is the people|profile
# alternation) — looks like a latent bug; verify against the full source.
1021 video_uploader = mobj.group(1).decode('utf-8')
1023 # Extract video thumbnail
1024 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1026 self._downloader.report_error(u'unable to extract video thumbnail')
1028 video_thumbnail = mobj.group(1).decode('utf-8')
1030 # Extract video description
1031 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1033 self._downloader.report_error(u'unable to extract video description')
1035 video_description = mobj.group(1).decode('utf-8')
1036 if not video_description:
1037 video_description = 'No description available.'
1039 # Extract video height and width
1040 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1042 self._downloader.report_error(u'unable to extract video height')
1044 yv_video_height = mobj.group(1)
1046 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1048 self._downloader.report_error(u'unable to extract video width')
1050 yv_video_width = mobj.group(1)
1052 # Retrieve video playlist to extract media URL
1053 # I'm not completely sure what all these options are, but we
1054 # seem to need most of them, otherwise the server sends a 401.
1055 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1056 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1057 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1058 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1059 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1061 self.report_download_webpage(video_id)
1062 webpage = compat_urllib_request.urlopen(request).read()
1063 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1064 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1067 # Extract media URL from playlist XML
1068 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1070 self._downloader.report_error(u'Unable to extract media URL')
1072 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1073 video_url = unescapeHTML(video_url)
# Assemble the info dictionary; the surrounding "return [{" / "'url': ..."
# lines appear to be on elided listing lines.
1076 'id': video_id.decode('utf-8'),
1078 'uploader': video_uploader,
1079 'upload_date': None,
1080 'title': video_title,
1081 'ext': video_extension.decode('utf-8'),
1082 'thumbnail': video_thumbnail.decode('utf-8'),
1083 'description': video_description,
1087 class VimeoIE(InfoExtractor):
1088 """Information extractor for vimeo.com."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); "if mobj is None:", "try:", "else:" and "return" lines
# appear to be omitted — confirm against the full source.
1090 # _VALID_URL matches Vimeo URLs
1091 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1094 def __init__(self, downloader=None):
1095 InfoExtractor.__init__(self, downloader)
1097 def report_download_webpage(self, video_id):
1098 """Report webpage download."""
1099 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1101 def report_extraction(self, video_id):
1102 """Report information extraction."""
1103 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1105 def _real_extract(self, url, new_video=True):
# Normalize the URL, download the watch page, pull the embedded player
# config JSON, pick the best codec/quality pair, and build the
# play_redirect media URL from the request signature + timestamp.
1106 # Extract ID from URL
1107 mobj = re.match(self._VALID_URL, url)
1109 self._downloader.report_error(u'Invalid URL: %s' % url)
1112 video_id = mobj.group('id')
1113 if not mobj.group('proto'):
1114 url = 'https://' + url
1115 if mobj.group('direct_link'):
1116 url = 'https://vimeo.com/' + video_id
1118 # Retrieve video webpage to extract further information
1119 request = compat_urllib_request.Request(url, None, std_headers)
1121 self.report_download_webpage(video_id)
1122 webpage_bytes = compat_urllib_request.urlopen(request).read()
1123 webpage = webpage_bytes.decode('utf-8')
1124 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1125 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1128 # Now we begin extracting as much information as we can from what we
1129 # retrieved. First we extract the information common to all extractors,
1130 # and latter we extract those that are Vimeo specific.
1131 self.report_extraction(video_id)
1133 # Extract the config JSON
# Fragile: the config is located by naive string splitting on the page
# source rather than by parsing HTML/JS; any markup change breaks this.
1135 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1136 config = json.loads(config)
1138 self._downloader.report_error(u'unable to extract info section')
1142 video_title = config["video"]["title"]
1144 # Extract uploader and uploader_id
1145 video_uploader = config["video"]["owner"]["name"]
1146 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1148 # Extract video thumbnail
1149 video_thumbnail = config["video"]["thumbnail"]
1151 # Extract video description
1152 video_description = get_element_by_attribute("itemprop", "description", webpage)
1153 if video_description: video_description = clean_html(video_description)
1154 else: video_description = ''
1156 # Extract upload date
1157 video_upload_date = None
1158 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1159 if mobj is not None:
# Concatenate YYYY + MM + DD into the YYYYMMDD form expected upstream.
1160 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1162 # Vimeo specific: extract request signature and timestamp
1163 sig = config['request']['signature']
1164 timestamp = config['request']['timestamp']
1166 # Vimeo specific: extract video codec and quality information
1167 # First consider quality, then codecs, then take everything
1168 # TODO bind to format param
1169 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1170 files = { 'hd': [], 'sd': [], 'other': []}
1171 for codec_name, codec_extension in codecs:
1172 if codec_name in config["video"]["files"]:
1173 if 'hd' in config["video"]["files"][codec_name]:
1174 files['hd'].append((codec_name, codec_extension, 'hd'))
1175 elif 'sd' in config["video"]["files"][codec_name]:
1176 files['sd'].append((codec_name, codec_extension, 'sd'))
1178 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first available entry in quality preference order.
1180 for quality in ('hd', 'sd', 'other'):
1181 if len(files[quality]) > 0:
1182 video_quality = files[quality][0][2]
1183 video_codec = files[quality][0][0]
1184 video_extension = files[quality][0][1]
1185 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1188 self._downloader.report_error(u'no known codec found')
1191 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1192 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Assemble the info dictionary; "return [{" / "'url': ..." lines appear
# to be on elided listing lines.
1197 'uploader': video_uploader,
1198 'uploader_id': video_uploader_id,
1199 'upload_date': video_upload_date,
1200 'title': video_title,
1201 'ext': video_extension,
1202 'thumbnail': video_thumbnail,
1203 'description': video_description,
1207 class ArteTvIE(InfoExtractor):
1208 """arte.tv information extractor."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); "try:", "if mobj is None:" and "return" lines appear to
# be omitted — confirm against the full source.
1210 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1211 _LIVE_URL = r'index-[0-9]+\.html$'
1213 IE_NAME = u'arte.tv'
1215 def __init__(self, downloader=None):
1216 InfoExtractor.__init__(self, downloader)
1218 def report_download_webpage(self, video_id):
1219 """Report webpage download."""
1220 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1222 def report_extraction(self, video_id):
1223 """Report information extraction."""
1224 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1226 def fetch_webpage(self, url):
# Download and return the page body; errors are funneled through the
# downloader's reporting helpers.
1227 request = compat_urllib_request.Request(url)
1229 self.report_download_webpage(url)
1230 webpage = compat_urllib_request.urlopen(request).read()
1231 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1232 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1234 except ValueError as err:
1235 self._downloader.report_error(u'Invalid URL: %s' % url)
1239 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex` once, and build a dict mapping each
# (group index, key, error message) tuple in matchTuples to the
# corresponding captured group; missing groups are reported via
# self._downloader.trouble(err).
1240 page = self.fetch_webpage(url)
1241 mobj = re.search(regex, page, regexFlags)
1245 self._downloader.report_error(u'Invalid URL: %s' % url)
1248 for (i, key, err) in matchTuples:
1249 if mobj.group(i) is None:
1250 self._downloader.trouble(err)
1253 info[key] = mobj.group(i)
1257 def extractLiveStream(self, url):
# Two-step scrape for live streams: locate the videothek JS, then pull
# the rtmp-style path/player/url triple out of it.
1258 video_lang = url.split('/')[-4]
1259 info = self.grep_webpage(
1261 r'src="(.*?/videothek_js.*?\.js)',
1264 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1267 http_host = url.split('/')[2]
1268 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1269 info = self.grep_webpage(
1271 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1272 '(http://.*?\.swf).*?' +
1276 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1277 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1278 (3, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): video_url is assembled but no return is visible here —
# presumably on an elided line (or the value is unused); verify.
1281 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1283 def extractPlus7Stream(self, url):
# Three-step scrape for the +7 catch-up service: movie param ->
# language-specific <video> ref -> final metadata (id/name/date/hd url).
1284 video_lang = url.split('/')[-3]
1285 info = self.grep_webpage(
1287 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1290 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1293 next_url = compat_urllib_parse.unquote(info.get('url'))
1294 info = self.grep_webpage(
1296 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1299 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1302 next_url = compat_urllib_parse.unquote(info.get('url'))
1304 info = self.grep_webpage(
1306 r'<video id="(.*?)".*?>.*?' +
1307 '<name>(.*?)</name>.*?' +
1308 '<dateVideo>(.*?)</dateVideo>.*?' +
1309 '<url quality="hd">(.*?)</url>',
1312 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1313 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1314 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1315 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1320 'id': info.get('id'),
1321 'url': compat_urllib_parse.unquote(info.get('url')),
1322 'uploader': u'arte.tv',
1323 'upload_date': info.get('date'),
1324 'title': info.get('title').decode('utf-8'),
1330 def _real_extract(self, url):
# Dispatch: live index pages go through extractLiveStream (whose result
# is apparently discarded), everything else through extractPlus7Stream.
1331 video_id = url.split('/')[-1]
1332 self.report_extraction(video_id)
1334 if re.search(self._LIVE_URL, video_id) is not None:
1335 self.extractLiveStream(url)
1338 info = self.extractPlus7Stream(url)
1343 class GenericIE(InfoExtractor):
1344 """Generic last-resort information extractor."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); "try:", "if mobj is None:" and "return" lines appear to
# be omitted — confirm against the full source.
1347 IE_NAME = u'generic'
1349 def __init__(self, downloader=None):
1350 InfoExtractor.__init__(self, downloader)
1352 def report_download_webpage(self, video_id):
1353 """Report webpage download."""
1354 if not self._downloader.params.get('test', False):
1355 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1356 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1358 def report_extraction(self, video_id):
1359 """Report information extraction."""
1360 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1362 def report_following_redirect(self, new_url):
1363 """Report information extraction."""
1364 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1366 def _test_redirect(self, url):
1367 """Check if it is a redirect, like url shorteners, in case return the new url."""
1368 class HeadRequest(compat_urllib_request.Request):
# Request subclass used to issue HEAD requests; get_method presumably
# returns "HEAD" on an elided line — verify against full source.
1369 def get_method(self):
1372 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1374 Subclass the HTTPRedirectHandler to make it use our
1375 HeadRequest also on the redirected URL
1377 def redirect_request(self, req, fp, code, msg, headers, newurl):
1378 if code in (301, 302, 303, 307):
# Spaces in Location headers are not legal; percent-encode them.
1379 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers, which don't apply to a HEAD request.
1380 newheaders = dict((k,v) for k,v in req.headers.items()
1381 if k.lower() not in ("content-length", "content-type"))
1382 return HeadRequest(newurl,
1384 origin_req_host=req.get_origin_req_host(),
1387 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1389 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1391 Fallback to GET if HEAD is not allowed (405 HTTP error)
1393 def http_error_405(self, req, fp, code, msg, headers):
1397 newheaders = dict((k,v) for k,v in req.headers.items()
1398 if k.lower() not in ("content-length", "content-type"))
1399 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1401 origin_req_host=req.get_origin_req_host(),
# Build a bare opener wired with the custom HEAD handlers above.
1405 opener = compat_urllib_request.OpenerDirector()
1406 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1407 HTTPMethodFallback, HEADRedirectHandler,
1408 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1409 opener.add_handler(handler())
1411 response = opener.open(HeadRequest(url))
1412 new_url = response.geturl()
1417 self.report_following_redirect(new_url)
1420 def _real_extract(self, url):
# Resolve redirects first, then try a cascade of progressively broader
# regexes to find an embedded media URL in the page.
1421 new_url = self._test_redirect(url)
1422 if new_url: return [self.url_result(new_url)]
1424 video_id = url.split('/')[-1]
1426 webpage = self._download_webpage(url, video_id)
1427 except ValueError as err:
1428 # since this is the last-resort InfoExtractor, if
1429 # this error is thrown, it'll be thrown here
1430 self._downloader.report_error(u'Invalid URL: %s' % url)
1433 self.report_extraction(video_id)
1434 # Start with something easy: JW Player in SWFObject
1435 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1437 # Broaden the search a little bit
1438 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1440 # Broaden the search a little bit: JWPlayer JS loader
1441 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1443 self._downloader.report_error(u'Invalid URL: %s' % url)
1446 # It's possible that one of the regexes
1447 # matched, but returned an empty group:
1448 if mobj.group(1) is None:
1449 self._downloader.report_error(u'Invalid URL: %s' % url)
1452 video_url = compat_urllib_parse.unquote(mobj.group(1))
1453 video_id = os.path.basename(video_url)
1455 # here's a fun little line of code for you:
1456 video_extension = os.path.splitext(video_id)[1][1:]
1457 video_id = os.path.splitext(video_id)[0]
1459 # it's tempting to parse this further, but you would
1460 # have to take into account all the variations like
1461 # Video Title - Site Name
1462 # Site Name | Video Title
1463 # Video Title - Tagline | Site Name
1464 # and so on and so forth; it's just not practical
1465 mobj = re.search(r'<title>(.*)</title>', webpage)
1467 self._downloader.report_error(u'unable to extract title')
1469 video_title = mobj.group(1)
1471 # video uploader is domain name
1472 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says "title" but this step extracts the
# uploader/domain — likely a copy-paste slip in the original.
1474 self._downloader.report_error(u'unable to extract title')
1476 video_uploader = mobj.group(1)
1481 'uploader': video_uploader,
1482 'upload_date': None,
1483 'title': video_title,
1484 'ext': video_extension,
1488 class YoutubeSearchIE(InfoExtractor):
1489 """Information Extractor for YouTube search queries."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); "if mobj is None:", "try:", "return" and loop-setup
# lines appear to be omitted — confirm against the full source.
1490 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1491 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1492 _max_youtube_results = 1000
1493 IE_NAME = u'youtube:search'
1495 def __init__(self, downloader=None):
1496 InfoExtractor.__init__(self, downloader)
1498 def report_download_page(self, query, pagenum):
1499 """Report attempt to download search page with given number."""
1500 query = query.decode(preferredencoding())
1501 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1503 def _real_extract(self, query):
# Parse the "ytsearch[N|all]:terms" prefix: empty prefix -> 1 result,
# "all" -> the hard cap, a number -> that many (clamped to the cap).
1504 mobj = re.match(self._VALID_URL, query)
1506 self._downloader.report_error(u'invalid search query "%s"' % query)
1509 prefix, query = query.split(':')
1511 query = query.encode('utf-8')
1513 self._download_n_results(query, 1)
1515 elif prefix == 'all':
1516 self._download_n_results(query, self._max_youtube_results)
1522 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1524 elif n > self._max_youtube_results:
1525 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1526 n = self._max_youtube_results
1527 self._download_n_results(query, n)
1529 except ValueError: # parsing prefix as integer fails
1530 self._download_n_results(query, 1)
1533 def _download_n_results(self, query, n):
1534 """Downloads a specified number of results for a query"""
# Page through the GData JSON-C API (50 results per page) until `limit`
# ids are collected, then hand each watch URL to the downloader.
1540 while (50 * pagenum) < limit:
1541 self.report_download_page(query, pagenum+1)
1542 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1543 request = compat_urllib_request.Request(result_url)
1545 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1546 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1547 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1549 api_response = json.loads(data)['data']
1551 if not 'items' in api_response:
1552 self._downloader.trouble(u'[youtube] No video results')
1555 new_ids = list(video['id'] for video in api_response['items'])
1556 video_ids += new_ids
# Never ask for more than the API reports as available.
1558 limit = min(n, api_response['totalItems'])
1561 if len(video_ids) > n:
1562 video_ids = video_ids[:n]
1563 for id in video_ids:
1564 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1568 class GoogleSearchIE(InfoExtractor):
1569 """Information Extractor for Google Video search queries."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); "if mobj is None:", "try:" and "return" lines appear to
# be omitted — confirm against the full source.
1570 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1571 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1572 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1573 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1574 _max_google_results = 1000
1575 IE_NAME = u'video.google:search'
1577 def __init__(self, downloader=None):
1578 InfoExtractor.__init__(self, downloader)
1580 def report_download_page(self, query, pagenum):
1581 """Report attempt to download playlist page with given number."""
1582 query = query.decode(preferredencoding())
1583 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1585 def _real_extract(self, query):
# Same "prefix:terms" parsing scheme as YoutubeSearchIE, with the
# Google Video result cap.
1586 mobj = re.match(self._VALID_URL, query)
1588 self._downloader.report_error(u'invalid search query "%s"' % query)
1591 prefix, query = query.split(':')
1593 query = query.encode('utf-8')
1595 self._download_n_results(query, 1)
1597 elif prefix == 'all':
1598 self._download_n_results(query, self._max_google_results)
1604 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1606 elif n > self._max_google_results:
1607 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1608 n = self._max_google_results
1609 self._download_n_results(query, n)
1611 except ValueError: # parsing prefix as integer fails
1612 self._download_n_results(query, 1)
1615 def _download_n_results(self, query, n):
1616 """Downloads a specified number of results for a query"""
# Scrape the HTML search pages (10 results per page), collecting docids
# until n are found or the "next page" marker disappears.
1622 self.report_download_page(query, pagenum)
1623 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1624 request = compat_urllib_request.Request(result_url)
1626 page = compat_urllib_request.urlopen(request).read()
1627 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1628 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1631 # Extract video identifiers
1632 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1633 video_id = mobj.group(1)
1634 if video_id not in video_ids:
1635 video_ids.append(video_id)
1636 if len(video_ids) == n:
1637 # Specified n videos reached
1638 for id in video_ids:
1639 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No next-page link: flush what we have and stop.
1642 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1643 for id in video_ids:
1644 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1647 pagenum = pagenum + 1
1650 class YahooSearchIE(InfoExtractor):
1651 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); "if mobj is None:", "try:" and "return" lines appear to
# be omitted — confirm against the full source.
1654 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1655 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1656 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1657 _MORE_PAGES_INDICATOR = r'\s*Next'
1658 _max_yahoo_results = 1000
1659 IE_NAME = u'video.yahoo:search'
1661 def __init__(self, downloader=None):
1662 InfoExtractor.__init__(self, downloader)
1664 def report_download_page(self, query, pagenum):
1665 """Report attempt to download playlist page with given number."""
1666 query = query.decode(preferredencoding())
1667 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1669 def _real_extract(self, query):
# Same "prefix:terms" parsing scheme as the other search IEs, with the
# Yahoo! Video result cap.
1670 mobj = re.match(self._VALID_URL, query)
1672 self._downloader.report_error(u'invalid search query "%s"' % query)
1675 prefix, query = query.split(':')
1677 query = query.encode('utf-8')
1679 self._download_n_results(query, 1)
1681 elif prefix == 'all':
1682 self._download_n_results(query, self._max_yahoo_results)
1688 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1690 elif n > self._max_yahoo_results:
1691 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1692 n = self._max_yahoo_results
1693 self._download_n_results(query, n)
1695 except ValueError: # parsing prefix as integer fails
1696 self._download_n_results(query, 1)
1699 def _download_n_results(self, query, n):
1700 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, duplicates are tracked in a separate set for
# O(1) membership tests while preserving discovery order in video_ids.
1703 already_seen = set()
1707 self.report_download_page(query, pagenum)
1708 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1709 request = compat_urllib_request.Request(result_url)
1711 page = compat_urllib_request.urlopen(request).read()
1712 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1713 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1716 # Extract video identifiers
1717 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1718 video_id = mobj.group(1)
1719 if video_id not in already_seen:
1720 video_ids.append(video_id)
1721 already_seen.add(video_id)
1722 if len(video_ids) == n:
1723 # Specified n videos reached
1724 for id in video_ids:
1725 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No next-page link: flush what we have and stop.
1728 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1729 for id in video_ids:
1730 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1733 pagenum = pagenum + 1
1736 class YoutubePlaylistIE(InfoExtractor):
1737 """Information Extractor for YouTube playlists."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); parts of the verbose regex, "if mobj is None:", "try:",
# "break"/"return" and loop-setup lines appear to be omitted — confirm against
# the full source.
1739 _VALID_URL = r"""(?:
1744 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1745 \? (?:.*?&)*? (?:p|a|list)=
1748 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1751 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1753 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1755 IE_NAME = u'youtube:playlist'
1757 def __init__(self, downloader=None):
1758 InfoExtractor.__init__(self, downloader)
1761 def suitable(cls, url):
1762 """Receives a URL and returns True if suitable for this IE."""
# Overridden (vs. the base class) to pass re.VERBOSE, since _VALID_URL
# is written as a verbose multi-line pattern.
1763 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1765 def report_download_page(self, playlist_id, pagenum):
1766 """Report attempt to download playlist page with given number."""
1767 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1769 def _real_extract(self, url):
1770 # Extract playlist id
1771 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1773 self._downloader.report_error(u'invalid url: %s' % url)
1776 # Download playlist videos from API
# The two alternatives of _VALID_URL capture into different groups;
# whichever matched carries the playlist id.
1777 playlist_id = mobj.group(1) or mobj.group(2)
1782 self.report_download_page(playlist_id, page_num)
1784 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1786 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1787 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1788 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1792 response = json.loads(page)
1793 except ValueError as err:
1794 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1797 if not 'feed' in response or not 'entry' in response['feed']:
1798 self._downloader.report_error(u'Got a malformed response from YouTube API')
# Collect (position, watch-url) pairs so entries can be re-ordered by
# their playlist position after paging completes.
1800 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1801 for entry in response['feed']['entry']
1802 if 'content' in entry ]
# A short page means this was the last one.
1804 if len(response['feed']['entry']) < self._MAX_RESULTS:
1808 videos = [v[1] for v in sorted(videos)]
1810 url_results = [self.url_result(url) for url in videos]
1811 return [self.playlist_result(url_results, playlist_id)]
1814 class YoutubeChannelIE(InfoExtractor):
1815 """Information Extractor for YouTube channels."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); "if mobj is None:", "try:", "break" and loop-setup lines
# appear to be omitted — confirm against the full source.
1817 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1818 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Unicode ">>" marker that appears on the page while more pages exist.
1819 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1820 IE_NAME = u'youtube:channel'
1822 def report_download_page(self, channel_id, pagenum):
1823 """Report attempt to download channel page with given number."""
1824 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1826 def _real_extract(self, url):
# Scrape the channel's paged /videos listing, gathering watch ids until
# the "Next >>" marker disappears, then emit a playlist result.
1827 # Extract channel id
1828 mobj = re.match(self._VALID_URL, url)
1830 self._downloader.report_error(u'invalid url: %s' % url)
1833 # Download channel pages
1834 channel_id = mobj.group(1)
1839 self.report_download_page(channel_id, pagenum)
1840 url = self._TEMPLATE_URL % (channel_id, pagenum)
1841 request = compat_urllib_request.Request(url)
1843 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1844 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1845 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1848 # Extract video identifiers
1850 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1851 if mobj.group(1) not in ids_in_page:
1852 ids_in_page.append(mobj.group(1))
1853 video_ids.extend(ids_in_page)
1855 if self._MORE_PAGES_INDICATOR not in page:
1857 pagenum = pagenum + 1
1859 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1861 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1862 url_entries = [self.url_result(url) for url in urls]
1863 return [self.playlist_result(url_entries, channel_id)]
1866 class YoutubeUserIE(InfoExtractor):
1867 """Information Extractor for YouTube users."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); "if mobj is None:", "try:", "break" and loop-setup lines
# appear to be omitted — confirm against the full source.
1869 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1870 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1871 _GDATA_PAGE_SIZE = 50
1872 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1873 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1874 IE_NAME = u'youtube:user'
1876 def __init__(self, downloader=None):
1877 InfoExtractor.__init__(self, downloader)
1879 def report_download_page(self, username, start_index):
1880 """Report attempt to download user page."""
1881 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1882 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1884 def _real_extract(self, url):
# Page through the user's uploads feed via the GData API
# (_GDATA_PAGE_SIZE entries per request) and emit a playlist result.
1886 mobj = re.match(self._VALID_URL, url)
1888 self._downloader.report_error(u'invalid url: %s' % url)
1891 username = mobj.group(1)
1893 # Download video ids using YouTube Data API. Result size per
1894 # query is limited (currently to 50 videos) so we need to query
1895 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the "+ 1".
1902 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1903 self.report_download_page(username, start_index)
1905 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1908 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1909 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1910 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1913 # Extract video identifiers
1916 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1917 if mobj.group(1) not in ids_in_page:
1918 ids_in_page.append(mobj.group(1))
1920 video_ids.extend(ids_in_page)
1922 # A little optimization - if current page is not
1923 # "full", ie. does not contain PAGE_SIZE video ids then
1924 # we can assume that this page is the last one - there
1925 # are no more ids on further pages - no need to query
1928 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1933 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1934 url_results = [self.url_result(url) for url in urls]
1935 return [self.playlist_result(url_results, playlist_title = username)]
1938 class BlipTVUserIE(InfoExtractor):
1939 """Information Extractor for blip.tv users."""
# NOTE(review): this numbered listing elides some original lines (the baked-in
# line numbers jump); the _PAGE_SIZE constant referenced below, plus
# "if mobj is None:", "try:", "break" and loop-setup lines, appear to be on
# omitted lines — confirm against the full source.
1941 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1943 IE_NAME = u'blip.tv:user'
1945 def __init__(self, downloader=None):
1946 InfoExtractor.__init__(self, downloader)
1948 def report_download_page(self, username, pagenum):
1949 """Report attempt to download user page."""
1950 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1951 (self.IE_NAME, username, pagenum))
1953 def _real_extract(self, url):
# Scrape the user's numeric id from their page, then page through the
# mobile AJAX episode-list endpoint collecting video paths.
1955 mobj = re.match(self._VALID_URL, url)
1957 self._downloader.report_error(u'invalid url: %s' % url)
1960 username = mobj.group(1)
1962 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1964 request = compat_urllib_request.Request(url)
1967 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# data-users-id holds the numeric user id needed by the AJAX endpoint.
1968 mobj = re.search(r'data-users-id="([^"]+)"', page)
1969 page_base = page_base % mobj.group(1)
1970 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1971 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1975 # Download video ids using BlipTV Ajax calls. Result size per
1976 # query is limited (currently to 12 videos) so we need to query
1977 # page by page until there are no video ids - it means we got
1984 self.report_download_page(username, pagenum)
1985 url = page_base + "&page=" + str(pagenum)
1986 request = compat_urllib_request.Request( url )
1988 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1989 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here while sibling extractors use
# compat_str(err) — inconsistent on Python 2; verify intent.
1990 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
1993 # Extract video identifiers
1996 for mobj in re.finditer(r'href="/([^"]+)"', page):
1997 if mobj.group(1) not in ids_in_page:
1998 ids_in_page.append(unescapeHTML(mobj.group(1)))
2000 video_ids.extend(ids_in_page)
2002 # A little optimization - if current page is not
2003 # "full", ie. does not contain PAGE_SIZE video ids then
2004 # we can assume that this page is the last one - there
2005 # are no more ids on further pages - no need to query
2008 if len(ids_in_page) < self._PAGE_SIZE:
2013 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2014 (self.IE_NAME, username, all_ids_count, len(video_ids)))
2016 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2017 url_entries = [self.url_result(url) for url in urls]
2018 return [self.playlist_result(url_entries, playlist_title = username)]
2021 class DepositFilesIE(InfoExtractor):
2022 """Information extractor for depositfiles.com"""
2024 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2026 def report_download_webpage(self, file_id):
2027 """Report webpage download."""
2028 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2030 def report_extraction(self, file_id):
2031 """Report information extraction."""
2032 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2034 def _real_extract(self, url):
2035 file_id = url.split('/')[-1]
2036 # Rebuild url in english locale
2037 url = 'http://depositfiles.com/en/files/' + file_id
2039 # Retrieve file webpage with 'Free download' button pressed
2040 free_download_indication = { 'gateway_result' : '1' }
2041 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2043 self.report_download_webpage(file_id)
2044 webpage = compat_urllib_request.urlopen(request).read()
2045 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2046 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2049 # Search for the real file URL
2050 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2051 if (mobj is None) or (mobj.group(1) is None):
2052 # Try to figure out reason of the error.
2053 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2054 if (mobj is not None) and (mobj.group(1) is not None):
2055 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2056 self._downloader.report_error(u'%s' % restriction_message)
2058 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2061 file_url = mobj.group(1)
2062 file_extension = os.path.splitext(file_url)[1][1:]
2064 # Search for file title
2065 mobj = re.search(r'<b title="(.*?)">', webpage)
2067 self._downloader.report_error(u'unable to extract title')
2069 file_title = mobj.group(1).decode('utf-8')
2072 'id': file_id.decode('utf-8'),
2073 'url': file_url.decode('utf-8'),
2075 'upload_date': None,
2076 'title': file_title,
2077 'ext': file_extension.decode('utf-8'),
# NOTE(review): this listing is elided — some original lines (try:, if-guards,
# returns, form construction) are missing between the numbered lines below.
2081 class FacebookIE(InfoExtractor):
2082 """Information Extractor for Facebook"""
# Matches facebook.com/video/video.php?v=<ID> and /photo.php?v=<ID>.
2084 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2085 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2086 _NETRC_MACHINE = 'facebook'
2087 IE_NAME = u'facebook'
2089 def report_login(self):
2090 """Report attempt to log in."""
2091 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
# Optional login: credentials come from --username/--password or ~/.netrc.
2093 def _real_initialize(self):
2094 if self._downloader is None:
2099 downloader_params = self._downloader.params
2101 # Attempt to use provided username and password or .netrc data
2102 if downloader_params.get('username', None) is not None:
2103 useremail = downloader_params['username']
2104 password = downloader_params['password']
2105 elif downloader_params.get('usenetrc', False):
2107 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2108 if info is not None:
2112 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2113 except (IOError, netrc.NetrcParseError) as err:
# netrc problems only warn — login is optional, extraction may still work.
2114 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2117 if useremail is None:
2126 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2129 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2130 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2131 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2133 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2134 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2137 def _real_extract(self, url):
2138 mobj = re.match(self._VALID_URL, url)
2140 self._downloader.report_error(u'invalid URL: %s' % url)
2142 video_id = mobj.group('ID')
2144 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2145 webpage = self._download_webpage(url, video_id)
# The player parameters are embedded as a JSON array between these two exact
# JavaScript fragments in the page source; the fragments are re.escape()d below.
2147 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2148 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2149 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2151 raise ExtractorError(u'Cannot parse data')
2152 data = dict(json.loads(m.group(1)))
# 'params' is URL-quoted JSON inside the outer JSON blob.
2153 params_raw = compat_urllib_parse.unquote(data['params'])
2154 params = json.loads(params_raw)
# Prefer the HD stream; 'sd_src' (line 2157) is the fallback.
2155 video_url = params['hd_src']
2157 video_url = params['sd_src']
2159 raise ExtractorError(u'Cannot find video URL')
2160 video_duration = int(params['video_duration'])
2162 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2164 raise ExtractorError(u'Cannot find title in webpage')
2165 video_title = unescapeHTML(m.group(1))
2169 'title': video_title,
2172 'duration': video_duration,
2173 'thumbnail': params['thumbnail_src'],
# NOTE(review): this listing is elided — some original lines (try:, if-guards,
# returns, the cchar/info setup) are missing between the numbered lines below.
2178 class BlipTVIE(InfoExtractor):
2179 """Information extractor for blip.tv"""
2181 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL (line 2255).
2182 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2183 IE_NAME = u'blip.tv'
2185 def report_extraction(self, file_id):
2186 """Report information extraction."""
2187 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2189 def report_direct_download(self, title):
2190 """Report information extraction."""
2191 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2193 def _real_extract(self, url):
2194 mobj = re.match(self._VALID_URL, url)
2196 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose URL fragment carries the real file id;
# rewrite to the canonical /a/a-<id> form and recurse once.
2199 urlp = compat_urllib_parse_urlparse(url)
2200 if urlp.path.startswith('/play/'):
2201 request = compat_urllib_request.Request(url)
2202 response = compat_urllib_request.urlopen(request)
2203 redirecturl = response.geturl()
2204 rurlp = compat_urllib_parse_urlparse(redirecturl)
2205 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2206 url = 'http://blip.tv/a/a-' + file_id
2207 return self._real_extract(url)
# Request JSON metadata; the iTunes User-Agent is set deliberately
# (presumably required by blip.tv to serve this format — confirm).
2214 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2215 request = compat_urllib_request.Request(json_url)
2216 request.add_header('User-Agent', 'iTunes/10.6.1')
2217 self.report_extraction(mobj.group(1))
2220 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself, take the direct-download path.
2221 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2222 basename = url.split('/')[-1]
2223 title,ext = os.path.splitext(basename)
2224 title = title.decode('UTF-8')
2225 ext = ext.replace('.', '')
2226 self.report_direct_download(title)
2231 'upload_date': None,
2236 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2237 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2238 if info is None: # Regular URL
2240 json_code_bytes = urlh.read()
2241 json_code = json_code_bytes.decode('utf-8')
2242 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2243 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2247 json_data = json.loads(json_code)
2248 if 'Post' in json_data:
2249 data = json_data['Post']
# Convert blip.tv's "mm-dd-yy HH:MM(am|pm)" stamp to YYYYMMDD.
2253 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2254 video_url = data['media']['url']
2255 umobj = re.match(self._URL_EXT, video_url)
2257 raise ValueError('Can not determine filename extension')
2258 ext = umobj.group(1)
2261 'id': data['item_id'],
2263 'uploader': data['display_name'],
2264 'upload_date': upload_date,
2265 'title': data['title'],
2267 'format': data['media']['mimeType'],
2268 'thumbnail': data['thumbnailUrl'],
2269 'description': data['description'],
2270 'player_url': data['embedUrl'],
2271 'user_agent': 'iTunes/10.6.1',
# Bad/missing JSON fields land here (ValueError from strptime, KeyError from dict access).
2273 except (ValueError,KeyError) as err:
2274 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2280 class MyVideoIE(InfoExtractor):
2281 """Information Extractor for myvideo.de."""
2283 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2284 IE_NAME = u'myvideo'
2286 def __init__(self, downloader=None):
2287 InfoExtractor.__init__(self, downloader)
2289 def report_extraction(self, video_id):
2290 """Report information extraction."""
2291 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2293 def _real_extract(self,url):
2294 mobj = re.match(self._VALID_URL, url)
2296 self._download.report_error(u'invalid URL: %s' % url)
2299 video_id = mobj.group(1)
2302 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2303 webpage = self._download_webpage(webpage_url, video_id)
2305 self.report_extraction(video_id)
2306 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2309 self._downloader.report_error(u'unable to extract media URL')
2311 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2313 mobj = re.search('<title>([^<]+)</title>', webpage)
2315 self._downloader.report_error(u'unable to extract title')
2318 video_title = mobj.group(1)
2324 'upload_date': None,
2325 'title': video_title,
# NOTE(review): this listing is elided — some original lines (try:, if-guards,
# returns, dict literals, loop headers) are missing between the numbered lines below.
2329 class ComedyCentralIE(InfoExtractor):
2330 """Information extractor for The Daily Show and Colbert Report """
2332 # urls can be abbreviations like :thedailyshow or :colbert
2333 # urls for episodes like:
2334 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2335 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2336 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: suitable() below overrides the base class to pass re.VERBOSE.
2337 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2338 |(https?://)?(www\.)?
2339 (?P<showname>thedailyshow|colbertnation)\.com/
2340 (full-episodes/(?P<episode>.*)|
2342 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2343 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2346 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2348 _video_extensions = {
2356 _video_dimensions = {
# Overridden because _VALID_URL needs re.VERBOSE (base class matches without it).
2366 def suitable(cls, url):
2367 """Receives a URL and returns True if suitable for this IE."""
2368 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2370 def report_extraction(self, episode_id):
2371 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2373 def report_config_download(self, episode_id, media_id):
2374 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2376 def report_index_download(self, episode_id):
2377 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2379 def _print_formats(self, formats):
2380 print('Available formats:')
2382 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2385 def _real_extract(self, url):
2386 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2388 self._downloader.report_error(u'invalid URL: %s' % url)
# Expand :tds / :colbert style shorthands to the full-episodes page, then re-match.
2391 if mobj.group('shortname'):
2392 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2393 url = u'http://www.thedailyshow.com/full-episodes/'
2395 url = u'http://www.colbertnation.com/full-episodes/'
2396 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2397 assert mobj is not None
2399 if mobj.group('clip'):
2400 if mobj.group('showname') == 'thedailyshow':
2401 epTitle = mobj.group('tdstitle')
2403 epTitle = mobj.group('cntitle')
2406 dlNewest = not mobj.group('episode')
2408 epTitle = mobj.group('showname')
2410 epTitle = mobj.group('episode')
2412 req = compat_urllib_request.Request(url)
2413 self.report_extraction(epTitle)
2415 htmlHandle = compat_urllib_request.urlopen(req)
2416 html = htmlHandle.read()
2417 webpage = html.decode('utf-8')
2418 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2419 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The "newest episode" page redirects; re-derive the episode from the final URL.
2422 url = htmlHandle.geturl()
2423 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2425 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2427 if mobj.group('episode') == '':
2428 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2430 epTitle = mobj.group('episode')
# Find the mtvnservices media URIs embedded in the page.
2432 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2434 if len(mMovieParams) == 0:
2435 # The Colbert Report embeds the information in a without
2436 # a URL prefix; so extract the alternate reference
2437 # and then add the URL prefix manually.
2439 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2440 if len(altMovieParams) == 0:
2441 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2444 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Fetch the MRSS index listing every part of the episode.
2446 uri = mMovieParams[0][1]
2447 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2448 self.report_index_download(epTitle)
2450 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2451 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2452 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2457 idoc = xml.etree.ElementTree.fromstring(indexXml)
2458 itemEls = idoc.findall('.//item')
# One MRSS <item> per part; guid looks like ...:<show>.com:...:<mediaId>.
2459 for partNum,itemEl in enumerate(itemEls):
2460 mediaId = itemEl.findall('./guid')[0].text
2461 shortMediaId = mediaId.split(':')[-1]
2462 showId = mediaId.split(':')[-2].replace('.com', '')
2463 officialTitle = itemEl.findall('./title')[0].text
2464 officialDate = itemEl.findall('./pubDate')[0].text
2466 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2467 compat_urllib_parse.urlencode({'uri': mediaId}))
2468 configReq = compat_urllib_request.Request(configUrl)
2469 self.report_config_download(epTitle, shortMediaId)
2471 configXml = compat_urllib_request.urlopen(configReq).read()
2472 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2473 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs for every available rendition.
2476 cdoc = xml.etree.ElementTree.fromstring(configXml)
2478 for rendition in cdoc.findall('.//rendition'):
2479 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2483 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2486 if self._downloader.params.get('listformats', None):
2487 self._print_formats([i[0] for i in turls])
2490 # For now, just pick the highest bitrate
2491 format,rtmp_video_url = turls[-1]
2493 # Get the format arg from the arg stream
2494 req_format = self._downloader.params.get('format', None)
2496 # Select format if we can find one
2499 format, rtmp_video_url = f, v
# Rewrite the rtmp:// URL into an equivalent plain-HTTP download URL.
2502 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2504 raise ExtractorError(u'Cannot transform RTMP url')
2505 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2506 video_url = base + m.group('finalid')
2508 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2513 'upload_date': officialDate,
2518 'description': officialTitle,
2520 results.append(info)
# NOTE(review): this listing is elided — some original lines (try:, if-guards,
# returns, the result dict header) are missing between the numbered lines below.
2525 class EscapistIE(InfoExtractor):
2526 """Information extractor for The Escapist """
2528 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2529 IE_NAME = u'escapist'
2531 def report_extraction(self, showName):
2532 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2534 def report_config_download(self, showName):
2535 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2537 def _real_extract(self, url):
2538 mobj = re.match(self._VALID_URL, url)
2540 self._downloader.report_error(u'invalid URL: %s' % url)
2542 showName = mobj.group('showname')
2543 videoId = mobj.group('episode')
2545 self.report_extraction(showName)
2547 webPage = compat_urllib_request.urlopen(url)
2548 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type, defaulting to UTF-8.
2549 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2550 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2551 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2552 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# Pull metadata out of the page's <meta> tags; the og:video URL embeds a
# "config=<quoted-url>" parameter pointing at the player configuration.
2555 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2556 description = unescapeHTML(descMatch.group(1))
2557 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2558 imgUrl = unescapeHTML(imgMatch.group(1))
2559 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2560 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2561 configUrlMatch = re.search('config=(.*)$', playerUrl)
2562 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2564 self.report_config_download(showName)
2566 configJSON = compat_urllib_request.urlopen(configUrl)
2567 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2568 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2569 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2570 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2573 # Technically, it's JavaScript, not JSON
# Single → double quote swap is enough to make the config parseable as JSON.
2574 configJSON = configJSON.replace("'", '"')
2577 config = json.loads(configJSON)
2578 except (ValueError,) as err:
2579 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
# The second playlist entry holds the actual media URL.
2582 playlist = config['playlist']
2583 videoUrl = playlist[1]['url']
2588 'uploader': showName,
2589 'upload_date': None,
2592 'thumbnail': imgUrl,
2593 'description': description,
2594 'player_url': playerUrl,
# NOTE(review): this listing is elided — some original lines (try:, if-guards,
# returns, the info dict header) are missing between the numbered lines below.
2599 class CollegeHumorIE(InfoExtractor):
2600 """Information extractor for collegehumor.com"""
2603 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2604 IE_NAME = u'collegehumor'
2606 def report_manifest(self, video_id):
2607 """Report information extraction."""
2608 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2610 def report_extraction(self, video_id):
2611 """Report information extraction."""
2612 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2614 def _real_extract(self, url):
2615 mobj = re.match(self._VALID_URL, url)
2617 self._downloader.report_error(u'invalid URL: %s' % url)
2619 video_id = mobj.group('videoid')
2624 'upload_date': None,
# Step 1: fetch the moogaloop player XML carrying metadata + manifest URL.
2627 self.report_extraction(video_id)
2628 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2630 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2631 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2632 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2635 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2637 videoNode = mdoc.findall('./video')[0]
2638 info['description'] = videoNode.findall('./description')[0].text
2639 info['title'] = videoNode.findall('./caption')[0].text
2640 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2641 manifest_url = videoNode.findall('./file')[0].text
2643 self._downloader.report_error(u'Invalid metadata XML file')
# Step 2: fetch the Adobe HDS (f4m) manifest to locate the media node.
2646 manifest_url += '?hdcore=2.10.3'
2647 self.report_manifest(video_id)
2649 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2650 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2651 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2654 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe f4m XML namespace.
2656 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2657 node_id = media_node.attrib['url']
2658 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2659 except IndexError as err:
2660 self._downloader.report_error(u'Invalid manifest file')
# Step 3: build the segment URL from the manifest host + media node id.
2663 url_pr = compat_urllib_parse_urlparse(manifest_url)
2664 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): this listing is elided — some original lines (if-guards,
# returns, the result dict header) are missing between the numbered lines below.
2671 class XVideosIE(InfoExtractor):
2672 """Information extractor for xvideos.com"""
2674 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2675 IE_NAME = u'xvideos'
2677 def report_extraction(self, video_id):
2678 """Report information extraction."""
2679 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2681 def _real_extract(self, url):
2682 mobj = re.match(self._VALID_URL, url)
2684 self._downloader.report_error(u'invalid URL: %s' % url)
2686 video_id = mobj.group(1)
2688 webpage = self._download_webpage(url, video_id)
2690 self.report_extraction(video_id)
# The media URL is URL-quoted in a flv_url= query parameter in the page.
2694 mobj = re.search(r'flv_url=(.+?)&', webpage)
2696 self._downloader.report_error(u'unable to extract video url')
2698 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text up to the " - XVID" suffix.
2702 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2704 self._downloader.report_error(u'unable to extract video title')
2706 video_title = mobj.group(1)
2709 # Extract video thumbnail
2710 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2712 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not just the filename group.
2714 video_thumbnail = mobj.group(0)
2720 'upload_date': None,
2721 'title': video_title,
2723 'thumbnail': video_thumbnail,
2724 'description': None,
# NOTE(review): this listing is elided — some original lines (try:, if-guards,
# returns, the result dict header) are missing between the numbered lines below.
2730 class SoundcloudIE(InfoExtractor):
2731 """Information extractor for soundcloud.com
2732 To access the media, the uid of the song and a stream token
2733 must be extracted from the page source and the script must make
2734 a request to media.soundcloud.com/crossdomain.xml. Then
2735 the media can be grabbed by requesting from an url composed
2736 of the stream token and uid
# Captures (uploader, slug) from soundcloud.com/<uploader>/<slug>.
2739 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2740 IE_NAME = u'soundcloud'
2742 def __init__(self, downloader=None):
2743 InfoExtractor.__init__(self, downloader)
2745 def report_resolve(self, video_id):
2746 """Report information extraction."""
2747 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2749 def report_extraction(self, video_id):
2750 """Report information extraction."""
2751 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2753 def _real_extract(self, url):
2754 mobj = re.match(self._VALID_URL, url)
2756 self._downloader.report_error(u'invalid URL: %s' % url)
2759 # extract uploader (which is in the url)
2760 uploader = mobj.group(1)
2761 # extract simple title (uploader + slug of song title)
2762 slug_title = mobj.group(2)
2763 simple_title = uploader + u'-' + slug_title
2765 self.report_resolve('%s/%s' % (uploader, slug_title))
# Resolve the page URL to track JSON via the public API (hard-coded client_id).
2767 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2768 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2769 request = compat_urllib_request.Request(resolv_url)
2771 info_json_bytes = compat_urllib_request.urlopen(request).read()
2772 info_json = info_json_bytes.decode('utf-8')
2773 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2774 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2777 info = json.loads(info_json)
2778 video_id = info['id']
2779 self.report_extraction('%s/%s' % (uploader, slug_title))
# Ask the streams endpoint for the actual media URLs of this track id.
2781 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2782 request = compat_urllib_request.Request(streams_url)
2784 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2785 stream_json = stream_json_bytes.decode('utf-8')
2786 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2787 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2790 streams = json.loads(stream_json)
# Use the 128 kbit/s MP3 HTTP stream.
2791 mediaURL = streams['http_mp3_128_url']
2796 'uploader': info['user']['username'],
2797 'upload_date': info['created_at'],
2798 'title': info['title'],
2800 'description': info['description'],
2803 class SoundcloudSetIE(InfoExtractor):
2804 """Information extractor for soundcloud.com sets
2805 To access the media, the uid of the song and a stream token
2806 must be extracted from the page source and the script must make
2807 a request to media.soundcloud.com/crossdomain.xml. Then
2808 the media can be grabbed by requesting from an url composed
2809 of the stream token and uid
2812 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2813 IE_NAME = u'soundcloud'
2815 def __init__(self, downloader=None):
2816 InfoExtractor.__init__(self, downloader)
2818 def report_resolve(self, video_id):
2819 """Report information extraction."""
2820 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2822 def report_extraction(self, video_id):
2823 """Report information extraction."""
2824 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2826 def _real_extract(self, url):
2827 mobj = re.match(self._VALID_URL, url)
2829 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2832 # extract uploader (which is in the url)
2833 uploader = mobj.group(1)
2834 # extract simple title (uploader + slug of song title)
2835 slug_title = mobj.group(2)
2836 simple_title = uploader + u'-' + slug_title
2838 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2840 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2841 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2842 request = compat_urllib_request.Request(resolv_url)
2844 info_json_bytes = compat_urllib_request.urlopen(request).read()
2845 info_json = info_json_bytes.decode('utf-8')
2846 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2847 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2851 info = json.loads(info_json)
2852 if 'errors' in info:
2853 for err in info['errors']:
2854 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2857 for track in info['tracks']:
2858 video_id = track['id']
2859 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2861 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2862 request = compat_urllib_request.Request(streams_url)
2864 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2865 stream_json = stream_json_bytes.decode('utf-8')
2866 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2867 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2870 streams = json.loads(stream_json)
2871 mediaURL = streams['http_mp3_128_url']
2876 'uploader': track['user']['username'],
2877 'upload_date': track['created_at'],
2878 'title': track['title'],
2880 'description': track['description'],
# NOTE(review): this listing is elided — some original lines (if-guards,
# returns, the result dict header) are missing between the numbered lines below.
2885 class InfoQIE(InfoExtractor):
2886 """Information extractor for infoq.com"""
2887 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2889 def report_extraction(self, video_id):
2890 """Report information extraction."""
2891 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2893 def _real_extract(self, url):
2894 mobj = re.match(self._VALID_URL, url)
2896 self._downloader.report_error(u'invalid URL: %s' % url)
# No separate id in the URL; the full URL doubles as the video_id here.
2899 webpage = self._download_webpage(url, video_id=url)
2900 self.report_extraction(url)
# The real media id is base64-encoded, then URL-quoted, in jsclassref.
2903 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2905 self._downloader.report_error(u'unable to extract video url')
2907 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2908 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2911 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2913 self._downloader.report_error(u'unable to extract video title')
2915 video_title = mobj.group(1)
2917 # Extract description
2918 video_description = u'No description available.'
2919 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2920 if mobj is not None:
2921 video_description = mobj.group(1)
# Derive id + extension from the media filename at the end of the URL.
2923 video_filename = video_url.split('/')[-1]
2924 video_id, extension = video_filename.split('.')
2930 'upload_date': None,
2931 'title': video_title,
2932 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2934 'description': video_description,
# Extractor for mixcloud.com: fetches the cloudcast JSON from the v1 API and
# probes the listed audio format URLs for the first live one.
# NOTE(review): sampled extraction — `try:`/`return` lines and several guards
# are elided between the visible lines; confirm against the full file.
2939 class MixcloudIE(InfoExtractor):
2940 """Information extractor for www.mixcloud.com"""
2942 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2943 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2944 IE_NAME = u'mixcloud'
2946 def __init__(self, downloader=None):
2947 InfoExtractor.__init__(self, downloader)
2949 def report_download_json(self, file_id):
2950 """Report JSON download."""
2951 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2953 def report_extraction(self, file_id):
2954 """Report information extraction."""
2955 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2957 def get_urls(self, jsonData, fmt, bitrate='best'):
2958 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a plain [urls] list; the
# TypeError branch below handles the bitrate-less shape.
2961 bitrate_list = jsonData[fmt]
2962 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2963 bitrate = max(bitrate_list) # select highest
2965 url_list = jsonData[fmt][bitrate]
2966 except TypeError: # we have no bitrate info.
2967 url_list = jsonData[fmt]
2970 def check_urls(self, url_list):
2971 """Returns 1st active url from list"""
2972 for url in url_list:
# Probe each candidate URL; network failures fall through to the next one.
2974 compat_urllib_request.urlopen(url)
2976 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2981 def _print_formats(self, formats):
2982 print('Available formats:')
2983 for fmt in formats.keys():
2984 for b in formats[fmt]:
2986 ext = formats[fmt][b][0]
2987 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2988 except TypeError: # we have no bitrate info
2989 ext = formats[fmt][0]
2990 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2993 def _real_extract(self, url):
2994 mobj = re.match(self._VALID_URL, url)
2996 self._downloader.report_error(u'invalid URL: %s' % url)
2998 # extract uploader & filename from url
2999 uploader = mobj.group(1).decode('utf-8')
3000 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3002 # construct API request
3003 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3004 # retrieve .json file with links to files
3005 request = compat_urllib_request.Request(file_url)
3007 self.report_download_json(file_url)
3008 jsonData = compat_urllib_request.urlopen(request).read()
3009 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3010 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3014 json_data = json.loads(jsonData)
3015 player_url = json_data['player_swf_url']
3016 formats = dict(json_data['audio_formats'])
3018 req_format = self._downloader.params.get('format', None)
3021 if self._downloader.params.get('listformats', None):
3022 self._print_formats(formats)
# With no explicit format, scan every format for the first reachable URL.
3025 if req_format is None or req_format == 'best':
3026 for format_param in formats.keys():
3027 url_list = self.get_urls(formats, format_param)
3029 file_url = self.check_urls(url_list)
3030 if file_url is not None:
3033 if req_format not in formats:
3034 self._downloader.report_error(u'format is not available')
3037 url_list = self.get_urls(formats, req_format)
3038 file_url = self.check_urls(url_list)
3039 format_param = req_format
# NOTE(review): the .decode() calls below assume Python 2 byte strings;
# under Python 3 these would fail — confirm the compat layer in use.
3042 'id': file_id.decode('utf-8'),
3043 'url': file_url.decode('utf-8'),
3044 'uploader': uploader.decode('utf-8'),
3045 'upload_date': None,
3046 'title': json_data['name'],
3047 'ext': file_url.split('.')[-1].decode('utf-8'),
3048 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3049 'thumbnail': json_data['thumbnail_url'],
3050 'description': json_data['description'],
3051 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom. Handles three URL shapes: a specific
# video (course+video groups), a course page (course group only), and the root
# page — the latter two recurse via self.extract() over discovered links.
# NOTE(review): sampled extraction — `if mobj is None:`/`try:`/`return` lines
# and dict openers are elided; confirm against the full file before editing.
3054 class StanfordOpenClassroomIE(InfoExtractor):
3055 """Information extractor for Stanford's Open ClassRoom"""
3057 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3058 IE_NAME = u'stanfordoc'
3060 def report_download_webpage(self, objid):
3061 """Report information extraction."""
3062 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3064 def report_extraction(self, video_id):
3065 """Report information extraction."""
3066 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3068 def _real_extract(self, url):
3069 mobj = re.match(self._VALID_URL, url)
3071 raise ExtractorError(u'Invalid URL: %s' % url)
3073 if mobj.group('course') and mobj.group('video'): # A specific video
3074 course = mobj.group('course')
3075 video = mobj.group('video')
3077 'id': course + '_' + video,
3079 'upload_date': None,
3082 self.report_extraction(info['id'])
3083 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3084 xmlUrl = baseUrl + video + '.xml'
3086 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3087 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3088 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Per-video metadata lives in a sidecar XML with <title> and <videoFile>.
3090 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3092 info['title'] = mdoc.findall('./title')[0].text
3093 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3095 self._downloader.report_error(u'Invalid metadata XML file')
3097 info['ext'] = info['url'].rpartition('.')[2]
3099 elif mobj.group('course'): # A course page
3100 course = mobj.group('course')
3105 'upload_date': None,
3108 coursepage = self._download_webpage(url, info['id'],
3109 note='Downloading course info page',
3110 errnote='Unable to download course info page')
3112 m = re.search('<h1>([^<]+)</h1>', coursepage)
3114 info['title'] = unescapeHTML(m.group(1))
3116 info['title'] = info['id']
3118 m = re.search('<description>([^<]+)</description>', coursepage)
3120 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link becomes a 'reference' entry, resolved recursively below.
3122 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3125 'type': 'reference',
3126 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3130 for entry in info['list']:
3131 assert entry['type'] == 'reference'
3132 results += self.extract(entry['url'])
3136 'id': 'Stanford OpenClassroom',
3139 'upload_date': None,
3142 self.report_download_webpage(info['id'])
3143 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3145 rootpage = compat_urllib_request.urlopen(rootURL).read()
3146 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3147 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3150 info['title'] = info['id']
3152 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3155 'type': 'reference',
3156 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3161 for entry in info['list']:
3162 assert entry['type'] == 'reference'
3163 results += self.extract(entry['url'])
# Extractor for MTV.com: reads song/performer meta tags from the page, then
# fetches a mediaGen XML playlist and picks the highest-quality rendition.
# NOTE(review): sampled extraction — guard/`try:`/`return` lines are elided
# between the visible lines; confirm against the full file before editing.
3166 class MTVIE(InfoExtractor):
3167 """Information extractor for MTV.com"""
3169 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3172 def report_extraction(self, video_id):
3173 """Report information extraction."""
3174 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3176 def _real_extract(self, url):
3177 mobj = re.match(self._VALID_URL, url)
3179 self._downloader.report_error(u'invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; normalize to http:// for the download.
3181 if not mobj.group('proto'):
3182 url = 'http://' + url
3183 video_id = mobj.group('videoid')
3185 webpage = self._download_webpage(url, video_id)
3187 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3189 self._downloader.report_error(u'unable to extract song name')
3191 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3192 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3194 self._downloader.report_error(u'unable to extract performer')
3196 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3197 video_title = performer + ' - ' + song_name
3199 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3201 self._downloader.report_error(u'unable to mtvn_uri')
3203 mtvn_uri = mobj.group(1)
3205 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3207 self._downloader.report_error(u'unable to extract content id')
3209 content_id = mobj.group(1)
3211 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3212 self.report_extraction(video_id)
3213 request = compat_urllib_request.Request(videogen_url)
3215 metadataXml = compat_urllib_request.urlopen(request).read()
3216 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3217 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3220 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3221 renditions = mdoc.findall('.//rendition')
3223 # For now, always pick the highest quality.
3224 rendition = renditions[-1]
# rendition attrs: type is a MIME like 'video/mp4'; partition keeps the subtype.
3227 _,_,ext = rendition.attrib['type'].partition('/')
3228 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3229 video_url = rendition.find('./src').text
3231 self._downloader.trouble('Invalid rendition field.')
3237 'uploader': performer,
3238 'upload_date': None,
3239 'title': video_title,
# Extractor for v.youku.com: fetches the getPlayList JSON, de-obfuscates the
# seed-scrambled file id, then emits one info dict per video segment.
# NOTE(review): sampled extraction — `def _gen_sid(self):`, `try:`, loop bodies
# and other lines are elided between the visible lines; confirm against the
# full file before editing logic.
3247 class YoukuIE(InfoExtractor):
3248 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3250 def report_download_webpage(self, file_id):
3251 """Report webpage download."""
3252 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3254 def report_extraction(self, file_id):
3255 """Report information extraction."""
3256 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp plus two bounded random numbers.
3259 nowTime = int(time.time() * 1000)
3260 random1 = random.randint(1000,1998)
3261 random2 = random.randint(1000,9999)
3263 return "%d%d%d" %(nowTime,random1,random2)
3265 def _get_file_ID_mix_string(self, seed):
# Deterministic Fisher-Yates-style shuffle of the alphabet driven by `seed`
# (a linear congruential step), reproducing Youku's client-side scramble.
3267 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3269 for i in range(len(source)):
3270 seed = (seed * 211 + 30031 ) % 65536
3271 index = math.floor(seed / 65536 * len(source) )
3272 mixed.append(source[int(index)])
3273 source.remove(source[int(index)])
3274 #return ''.join(mixed)
3277 def _get_file_id(self, fileId, seed):
# Map the '*'-separated index list back through the scrambled alphabet.
3278 mixed = self._get_file_ID_mix_string(seed)
3279 ids = fileId.split('*')
3283 realId.append(mixed[int(ch)])
3284 return ''.join(realId)
3286 def _real_extract(self, url):
3287 mobj = re.match(self._VALID_URL, url)
3289 self._downloader.report_error(u'invalid URL: %s' % url)
3291 video_id = mobj.group('ID')
3293 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3295 request = compat_urllib_request.Request(info_url, None, std_headers)
3297 self.report_download_webpage(video_id)
3298 jsondata = compat_urllib_request.urlopen(request).read()
3299 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3300 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3303 self.report_extraction(video_id)
3305 jsonstr = jsondata.decode('utf-8')
3306 config = json.loads(jsonstr)
3308 video_title = config['data'][0]['title']
3309 seed = config['data'][0]['seed']
3311 format = self._downloader.params.get('format', None)
3312 supported_format = list(config['data'][0]['streamfileids'].keys())
3314 if format is None or format == 'best':
3315 if 'hd2' in supported_format:
3320 elif format == 'worst':
3328 fileid = config['data'][0]['streamfileids'][format]
3329 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3330 except (UnicodeDecodeError, ValueError, KeyError):
3331 self._downloader.report_error(u'unable to extract info section')
3335 sid = self._gen_sid()
3336 fileid = self._get_file_id(fileid, seed)
3338 #column 8,9 of fileid represent the segment number
3339 #fileid[7:9] should be changed
3340 for index, key in enumerate(keys):
3342 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3343 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3346 'id': '%s_part%02d' % (video_id, index),
3347 'url': download_url,
3349 'upload_date': None,
3350 'title': video_title,
3353 files_info.append(info)
# Extractor for video.xnxx.com: pulls the flv URL, title and thumbnail out of
# the page with three class-level regexes.
# NOTE(review): sampled extraction — `if ... is None:`/`try:`/`return` lines
# are elided between the visible lines; confirm against the full file.
3358 class XNXXIE(InfoExtractor):
3359 """Information extractor for xnxx.com"""
3361 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3363 VIDEO_URL_RE = r'flv_url=(.*?)&'
3364 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3365 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3367 def report_webpage(self, video_id):
3368 """Report information extraction"""
3369 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3371 def report_extraction(self, video_id):
3372 """Report information extraction"""
3373 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3375 def _real_extract(self, url):
3376 mobj = re.match(self._VALID_URL, url)
3378 self._downloader.report_error(u'invalid URL: %s' % url)
3380 video_id = mobj.group(1)
3382 self.report_webpage(video_id)
3384 # Get webpage content
3386 webpage_bytes = compat_urllib_request.urlopen(url).read()
3387 webpage = webpage_bytes.decode('utf-8')
3388 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3389 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# The flv URL is percent-encoded inside the player query string.
3392 result = re.search(self.VIDEO_URL_RE, webpage)
3394 self._downloader.report_error(u'unable to extract video url')
3396 video_url = compat_urllib_parse.unquote(result.group(1))
3398 result = re.search(self.VIDEO_TITLE_RE, webpage)
3400 self._downloader.report_error(u'unable to extract video title')
3402 video_title = result.group(1)
3404 result = re.search(self.VIDEO_THUMB_RE, webpage)
3406 self._downloader.report_error(u'unable to extract video thumbnail')
3408 video_thumbnail = result.group(1)
3414 'upload_date': None,
3415 'title': video_title,
3417 'thumbnail': video_thumbnail,
3418 'description': None,
# Extractor for plus.google.com posts: scrapes the post page for metadata,
# follows the embedded photos page, then collects the redirector video links
# and keeps the highest-resolution one.
# NOTE(review): sampled extraction — guard/`try:`/`return` lines are elided
# between the visible lines; confirm against the full file before editing.
3422 class GooglePlusIE(InfoExtractor):
3423 """Information extractor for plus.google.com."""
3425 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3426 IE_NAME = u'plus.google'
3428 def __init__(self, downloader=None):
3429 InfoExtractor.__init__(self, downloader)
3431 def report_extract_entry(self, url):
3432 """Report downloading extry"""
3433 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3435 def report_date(self, upload_date):
3436 """Report downloading extry"""
3437 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3439 def report_uploader(self, uploader):
3440 """Report downloading extry"""
3441 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3443 def report_title(self, video_title):
3444 """Report downloading extry"""
3445 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3447 def report_extract_vid_page(self, video_page):
3448 """Report information extraction."""
3449 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3451 def _real_extract(self, url):
3452 # Extract id from URL
3453 mobj = re.match(self._VALID_URL, url)
3455 self._downloader.report_error(u'Invalid URL: %s' % url)
3458 post_url = mobj.group(0)
3459 video_id = mobj.group(1)
3461 video_extension = 'flv'
3463 # Step 1, Retrieve post webpage to extract further information
3464 self.report_extract_entry(post_url)
3465 request = compat_urllib_request.Request(post_url)
3467 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3468 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3469 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3472 # Extract update date
3474 pattern = 'title="Timestamp">(.*?)</a>'
3475 mobj = re.search(pattern, webpage)
3477 upload_date = mobj.group(1)
3478 # Convert timestring to a format suitable for filename
3479 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3480 upload_date = upload_date.strftime('%Y%m%d')
3481 self.report_date(upload_date)
3485 pattern = r'rel\="author".*?>(.*?)</a>'
3486 mobj = re.search(pattern, webpage)
3488 uploader = mobj.group(1)
3489 self.report_uploader(uploader)
3492 # Get the first line for title
3494 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3495 mobj = re.search(pattern, webpage)
3497 video_title = mobj.group(1)
3498 self.report_title(video_title)
3500 # Step 2, Stimulate clicking the image box to launch video
3501 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3502 mobj = re.search(pattern, webpage)
3504 self._downloader.report_error(u'unable to extract video page URL')
3506 video_page = mobj.group(1)
3507 request = compat_urllib_request.Request(video_page)
3509 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3510 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3511 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3513 self.report_extract_vid_page(video_page)
3516 # Extract video links on video page
3517 """Extract video links of all sizes"""
3518 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3519 mobj = re.findall(pattern, webpage)
3521 self._downloader.report_error(u'unable to extract video links')
3523 # Sort in resolution
3524 links = sorted(mobj)
3526 # Choose the lowest of the sort, i.e. highest resolution
3527 video_url = links[-1]
3528 # Only get the url. The resolution part in the tuple has no use anymore
3529 video_url = video_url[-1]
3530 # Treat escaped \u0026 style hex
3532 video_url = video_url.decode("unicode_escape")
3533 except AttributeError: # Python 3
3534 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3540 'uploader': uploader,
3541 'upload_date': upload_date,
3542 'title': video_title,
3543 'ext': video_extension,
# Extractor for nba.com: builds a fixed CDN mp4 URL from the path component
# and scrapes title/date/description via the local _findProp helper.
# NOTE(review): sampled extraction — guard/`return` lines and parts of
# _findProp are elided; confirm against the full file before editing.
3546 class NBAIE(InfoExtractor):
3547 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3550 def _real_extract(self, url):
3551 mobj = re.match(self._VALID_URL, url)
3553 self._downloader.report_error(u'invalid URL: %s' % url)
3556 video_id = mobj.group(1)
3557 if video_id.endswith('/index.html'):
3558 video_id = video_id[:-len('/index.html')]
3560 webpage = self._download_webpage(url, video_id)
# The video URL is deterministic from the path — no scraping needed for it.
3562 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3563 def _findProp(rexp, default=None):
3564 m = re.search(rexp, webpage)
3566 return unescapeHTML(m.group(1))
3570 shortened_video_id = video_id.rpartition('/')[2]
3571 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3573 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' below looks like a typo for 'upload_date'
# — verify against the consumer before changing, as it may be relied upon.
3577 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3578 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv: pages through the archive API in
# _JUSTIN_PAGE_LIMIT-sized chunks and builds one info dict per clip.
# NOTE(review): sampled extraction — `try:`/`return` lines, the `paged` setup
# and loop scaffolding are elided; confirm against the full file.
3582 class JustinTVIE(InfoExtractor):
3583 """Information extractor for justin.tv and twitch.tv"""
3584 # TODO: One broadcast may be split into multiple videos. The key
3585 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3586 # starts at 1 and increases. Can we treat all parts as one video?
3588 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3589 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3590 _JUSTIN_PAGE_LIMIT = 100
3591 IE_NAME = u'justin.tv'
3593 def report_extraction(self, file_id):
3594 """Report information extraction."""
3595 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3597 def report_download_page(self, channel, offset):
3598 """Report attempt to download a single page of videos."""
3599 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3600 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3602 # Return count of items, list of *valid* items
3603 def _parse_page(self, url):
3605 urlh = compat_urllib_request.urlopen(url)
3606 webpage_bytes = urlh.read()
3607 webpage = webpage_bytes.decode('utf-8', 'ignore')
3608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3609 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# A non-list response is the API's error envelope.
3612 response = json.loads(webpage)
3613 if type(response) != list:
3614 error_text = response.get('error', 'unknown error')
3615 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3618 for clip in response:
3619 video_url = clip['video_file_url']
3621 video_extension = os.path.splitext(video_url)[1][1:]
3622 video_date = re.sub('-', '', clip['start_time'][:10])
3623 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3624 video_id = clip['id']
3625 video_title = clip.get('title', video_id)
3629 'title': video_title,
3630 'uploader': clip.get('channel_name', video_uploader_id),
3631 'uploader_id': video_uploader_id,
3632 'upload_date': video_date,
3633 'ext': video_extension,
3635 return (len(response), info)
3637 def _real_extract(self, url):
3638 mobj = re.match(self._VALID_URL, url)
3640 self._downloader.report_error(u'invalid URL: %s' % url)
3643 api = 'http://api.justin.tv'
3644 video_id = mobj.group(mobj.lastindex)
# lastindex == 1 means a channel URL (group 2 absent) -> channel archives;
# otherwise a single /b/ broadcast id.
3646 if mobj.lastindex == 1:
3648 api += '/channel/archives/%s.json'
3650 api += '/broadcast/by_archive/%s.json'
3651 api = api % (video_id,)
3653 self.report_extraction(video_id)
3657 limit = self._JUSTIN_PAGE_LIMIT
3660 self.report_download_page(video_id, offset)
3661 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3662 page_count, page_info = self._parse_page(page_url)
3663 info.extend(page_info)
# A short page means we've reached the end of the archive.
3664 if not paged or page_count != limit:
# Extractor for funnyordie.com: reads the <source> URL, the player <h1> title
# and the og:description meta tag from the page.
# NOTE(review): sampled extraction — `if m is None:`/`return` lines and the
# result-dict opener are elided; confirm against the full file.
3669 class FunnyOrDieIE(InfoExtractor):
3670 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3672 def _real_extract(self, url):
3673 mobj = re.match(self._VALID_URL, url)
3675 self._downloader.report_error(u'invalid URL: %s' % url)
3678 video_id = mobj.group('id')
3679 webpage = self._download_webpage(url, video_id)
3681 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3683 self._downloader.report_error(u'unable to find video information')
3684 video_url = unescapeHTML(m.group('url'))
3686 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3688 self._downloader.trouble(u'Cannot find video title')
3689 title = clean_html(m.group('title'))
3691 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3693 desc = unescapeHTML(m.group('desc'))
3702 'description': desc,
# Extractor for store.steampowered.com: a game page can host several movies,
# so it zips the movie/title/thumbnail regex iterators into one video per trio.
# NOTE(review): sampled extraction — the gameID part of _VALID_URL and the
# per-video dict opener/append lines are elided; confirm against the full file.
3706 class SteamIE(InfoExtractor):
3707 _VALID_URL = r"""http://store.steampowered.com/
3708 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3710 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3714 def suitable(cls, url):
3715 """Receives a URL and returns True if suitable for this IE."""
# Override needed because _VALID_URL is a verbose (re.VERBOSE) pattern.
3716 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3718 def _real_extract(self, url):
3719 m = re.match(self._VALID_URL, url, re.VERBOSE)
3720 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3721 gameID = m.group('gameID')
3722 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3723 webpage = self._download_webpage(videourl, gameID)
3724 mweb = re.finditer(urlRE, webpage)
3725 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3726 titles = re.finditer(namesRE, webpage)
3727 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3728 thumbs = re.finditer(thumbsRE, webpage)
3730 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3731 video_id = vid.group('videoID')
3732 title = vtitle.group('videoName')
3733 video_url = vid.group('videoURL')
3734 video_thumb = thumb.group('thumbnail')
3736 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3741 'title': unescapeHTML(title),
3742 'thumbnail': video_thumb
# Extractor for ustream.tv recorded videos: the stream URL is deterministic
# from the numeric id; title/uploader are scraped from data-* attributes.
# NOTE(review): sampled extraction — the result-dict opener/return lines are
# elided after the visible lines; confirm against the full file.
3747 class UstreamIE(InfoExtractor):
3748 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3749 IE_NAME = u'ustream'
3751 def _real_extract(self, url):
3752 m = re.match(self._VALID_URL, url)
3753 video_id = m.group('videoID')
3754 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3755 webpage = self._download_webpage(url, video_id)
3756 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3757 title = m.group('title')
3758 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3759 uploader = m.group('uploader')
3765 'uploader': uploader
# Extractor for worldstarhiphop.com (and the "candy" mirror): greps the raw
# page for an mp4/flv CDN URL, then scrapes title and thumbnail; candy videos
# get their title re-extracted from a candytitles span.
# NOTE(review): sampled extraction — `try:`, ext assignment branches, the
# thumbnail-miss branch and the result dict are elided; confirm against the
# full file before editing.
3769 class WorldStarHipHopIE(InfoExtractor):
3770 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3771 IE_NAME = u'WorldStarHipHop'
3773 def _real_extract(self, url):
3774 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3776 webpage_src = compat_urllib_request.urlopen(url).read()
3777 webpage_src = webpage_src.decode('utf-8')
3779 mobj = re.search(_src_url, webpage_src)
3781 m = re.match(self._VALID_URL, url)
3782 video_id = m.group('id')
3784 if mobj is not None:
3785 video_url = mobj.group()
# Extension is inferred from the matched URL (mp4 vs flv branch elided here).
3786 if 'mp4' in video_url:
3791 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3794 _title = r"""<title>(.*)</title>"""
3796 mobj = re.search(_title, webpage_src)
3798 if mobj is not None:
3799 title = mobj.group(1)
3801 title = 'World Start Hip Hop - %s' % time.ctime()
3803 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3804 mobj = re.search(_thumbnail, webpage_src)
3806 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3807 if mobj is not None:
3808 thumbnail = mobj.group(1)
3810 _title = r"""candytitles.*>(.*)</span>"""
3811 mobj = re.search(_title, webpage_src)
3812 if mobj is not None:
3813 title = mobj.group(1)
3820 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: parses the gon.show JSON blob embedded
# in a <script> tag and appends a fixed 256kbps cbr to the akamai URL.
# NOTE(review): sampled extraction — `if m is None:`, `try:` and the
# result-dict opener are elided; confirm against the full file.
3825 class RBMARadioIE(InfoExtractor):
3826 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3828 def _real_extract(self, url):
3829 m = re.match(self._VALID_URL, url)
3830 video_id = m.group('videoID')
3832 webpage = self._download_webpage(url, video_id)
3833 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3835 raise ExtractorError(u'Cannot find metadata')
3836 json_data = m.group(1)
3839 data = json.loads(json_data)
3840 except ValueError as e:
3841 raise ExtractorError(u'Invalid JSON: ' + str(e))
3843 video_url = data['akamai_url'] + '&cbr=256'
3844 url_parts = compat_urllib_parse_urlparse(video_url)
3845 video_ext = url_parts.path.rpartition('.')[2]
3850 'title': data['title'],
3851 'description': data.get('teaser_text'),
3852 'location': data.get('country_of_origin'),
3853 'uploader': data.get('host', {}).get('name'),
3854 'uploader_id': data.get('host', {}).get('slug'),
3855 'thumbnail': data.get('image', {}).get('large_url_2x'),
3856 'duration': data.get('duration'),
# Extractor for youporn.com: sets an age_verified cookie, scrapes the download
# list, builds one format dict per link, and applies the user's requested
# format (best/worst/all/specific).
# NOTE(review): sampled extraction — guard/`return`/sort lines and parts of
# the per-link loop (size/bitrate parsing, format-dict opener) are elided;
# confirm against the full file before editing.
3861 class YouPornIE(InfoExtractor):
3862 """Information extractor for youporn.com."""
3863 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3865 def _print_formats(self, formats):
3866 """Print all available formats"""
3867 print(u'Available formats:')
3868 print(u'ext\t\tformat')
3869 print(u'---------------------------------')
3870 for format in formats:
3871 print(u'%s\t\t%s' % (format['ext'], format['format']))
3873 def _specific(self, req_format, formats):
# Linear scan for the entry whose 'format' matches the request.
3875 if(x["format"]==req_format):
3879 def _real_extract(self, url):
3880 mobj = re.match(self._VALID_URL, url)
3882 self._downloader.report_error(u'invalid URL: %s' % url)
3885 video_id = mobj.group('videoid')
# Age gate is bypassed with a pre-set cookie rather than a form post.
3887 req = compat_urllib_request.Request(url)
3888 req.add_header('Cookie', 'age_verified=1')
3889 webpage = self._download_webpage(req, video_id)
3891 # Get the video title
3892 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3894 raise ExtractorError(u'Unable to extract video title')
3895 video_title = result.group('title').strip()
3897 # Get the video date
3898 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3900 self._downloader.report_warning(u'unable to extract video date')
3903 upload_date = result.group('date').strip()
3905 # Get the video uploader
3906 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3908 self._downloader.report_warning(u'unable to extract uploader')
3909 video_uploader = None
3911 video_uploader = result.group('uploader').strip()
3912 video_uploader = clean_html( video_uploader )
3914 # Get all of the formats available
3915 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3916 result = re.search(DOWNLOAD_LIST_RE, webpage)
3918 raise ExtractorError(u'Unable to extract download list')
3919 download_list_html = result.group('download_list').strip()
3921 # Get all of the links from the page
3922 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3923 links = re.findall(LINK_RE, download_list_html)
3924 if(len(links) == 0):
3925 raise ExtractorError(u'ERROR: no known formats available for video')
3927 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3932 # A link looks like this:
3933 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3934 # A path looks like this:
3935 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3936 video_url = unescapeHTML( link )
3937 path = compat_urllib_parse_urlparse( video_url ).path
3938 extension = os.path.splitext( path )[1][1:]
# path segment 4 looks like '480p_370k_<id>': resolution + bitrate tokens.
3939 format = path.split('/')[4].split('_')[:2]
3942 format = "-".join( format )
3943 title = u'%s-%s-%s' % (video_title, size, bitrate)
3948 'uploader': video_uploader,
3949 'upload_date': upload_date,
3954 'description': None,
3958 if self._downloader.params.get('listformats', None):
3959 self._print_formats(formats)
3962 req_format = self._downloader.params.get('format', None)
3963 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# formats is assumed ordered best-first here (sorting line elided in chunk).
3965 if req_format is None or req_format == 'best':
3967 elif req_format == 'worst':
3968 return [formats[-1]]
3969 elif req_format in ('-1', 'all'):
3972 format = self._specific( req_format, formats )
3974 self._downloader.report_error(u'requested format not available')
# Extractor for pornotube.com: video id and title come straight from the URL;
# the flv URL and upload date are scraped from the page.
# NOTE(review): sampled extraction — `if ... is None:`/`return` lines and
# parts of the info dict are elided; confirm against the full file.
3980 class PornotubeIE(InfoExtractor):
3981 """Information extractor for pornotube.com."""
3982 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3984 def _real_extract(self, url):
3985 mobj = re.match(self._VALID_URL, url)
3987 self._downloader.report_error(u'invalid URL: %s' % url)
3990 video_id = mobj.group('videoid')
3991 video_title = mobj.group('title')
3993 # Get webpage content
3994 webpage = self._download_webpage(url, video_id)
3997 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3998 result = re.search(VIDEO_URL_RE, webpage)
4000 self._downloader.report_error(u'unable to extract video url')
4002 video_url = compat_urllib_parse.unquote(result.group('url'))
4004 #Get the uploaded date
4005 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4006 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): the error message says 'video title' but this search is for
# the upload date — likely a copy/paste slip; verify before changing.
4008 self._downloader.report_error(u'unable to extract video title')
4010 upload_date = result.group('date')
4012 info = {'id': video_id,
4015 'upload_date': upload_date,
4016 'title': video_title,
# Extractor for youjizz.com: finds the embed page from the watch page, then
# pulls the real stream URL out of the embed page's flash variables.
# NOTE(review): sampled extraction — `if result is None:` guards and parts of
# the info dict are elided between the visible lines; confirm against the
# full file before editing.
4022 class YouJizzIE(InfoExtractor):
4023 """Information extractor for youjizz.com."""
4024 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4026 def _real_extract(self, url):
4027 mobj = re.match(self._VALID_URL, url)
4029 self._downloader.report_error(u'invalid URL: %s' % url)
4032 video_id = mobj.group('videoid')
4034 # Get webpage content
4035 webpage = self._download_webpage(url, video_id)
4037 # Get the video title
4038 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4040 raise ExtractorError(u'ERROR: unable to extract video title')
4041 video_title = result.group('title').strip()
4043 # Get the embed page
4044 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4046 raise ExtractorError(u'ERROR: unable to extract embed page')
# The embed URL carries the numeric id actually used downstream.
4048 embed_page_url = result.group(0).strip()
4049 video_id = result.group('videoid')
4051 webpage = self._download_webpage(embed_page_url, video_id)
4054 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4056 raise ExtractorError(u'ERROR: unable to extract video url')
4057 video_url = result.group('source')
4059 info = {'id': video_id,
4061 'title': video_title,
4064 'player_url': embed_page_url}
# Information extractor for 8tracks.com mixes (one playlist -> many tracks).
# NOTE(review): elided excerpt — `mix_id` is used below but its assignment
# (presumably `mix_id = data['id']`) is not visible; the per-track `info`
# dict opener and the `break` after the last track are also missing.
4068 class EightTracksIE(InfoExtractor):
4070 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4072 def _real_extract(self, url):
4073 mobj = re.match(self._VALID_URL, url)
# presumably under an elided `if mobj is None:` guard — verify
4075 raise ExtractorError(u'Invalid URL: %s' % url)
4076 playlist_id = mobj.group('id')
4078 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as a JS object literal assigned to PAGE.mix.
4080 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4082 raise ExtractorError(u'Cannot find trax information')
4083 json_like = m.group(1)
4084 data = json.loads(json_like)
# Random session token required by the 8tracks play API.
4086 session = str(random.randint(0, 1000000000))
4088 track_count = data['tracks_count']
4089 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4090 next_url = first_url
# The API only reveals one track at a time, so iterate until the
# server reports the last track (count bounded by track_count).
4092 for i in itertools.count():
4093 api_json = self._download_webpage(next_url, playlist_id,
4094 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4095 errnote=u'Failed to download song information')
4096 api_data = json.loads(api_json)
4097 track_data = api_data[u'set']['track']
# per-track info dict (opening line elided here)
4099 'id': track_data['id'],
4100 'url': track_data['track_file_stream_url'],
4101 'title': track_data['performer'] + u' - ' + track_data['name'],
4102 'raw_title': track_data['name'],
4103 'uploader_id': data['user']['login'],
# presumably followed by an elided `break` — verify
4107 if api_data['set']['at_last_track']:
# Each subsequent request must echo back the previous track id.
4109 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Information extractor for keek.com; media and thumbnail URLs are derived
# directly from the video id on keek's CDN, no scraping needed for them.
4112 class KeekIE(InfoExtractor):
4113 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4116 def _real_extract(self, url):
4117 m = re.match(self._VALID_URL, url)
4118 video_id = m.group('videoID')
4119 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4120 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
# Page is still fetched for title and uploader metadata.
4121 webpage = self._download_webpage(url, video_id)
# NOTE(review): both re.search results are used without a visible None
# check — a page layout change would raise AttributeError here.
4122 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
4123 title = unescapeHTML(m.group('title'))
4124 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4125 uploader = clean_html(m.group('uploader'))
# Result dict (opening lines with 'id'/'url'/'ext' elided here).
4131 'thumbnail': thumbnail,
4132 'uploader': uploader
# Information extractor for ted.com: handles both single talks and playlists.
4136 class TEDIE(InfoExtractor):
# Verbose multi-line regex: either /playlists/<id> or /talks/<name>.html.
4137 _VALID_URL=r'''http://www.ted.com/
4139 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4141 ((?P<type_talk>talks)) # We have a simple talk
4143 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because the base class matches without
# re.VERBOSE, which this multi-line pattern requires.
4147 def suitable(cls, url):
4148 """Receives a URL and returns True if suitable for this IE."""
4149 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4151 def _real_extract(self, url):
# Dispatch: single talk returns a one-element list, playlist fans out.
4152 m=re.match(self._VALID_URL, url, re.VERBOSE)
4153 if m.group('type_talk'):
4154 return [self._talk_info(url)]
4156 playlist_id=m.group('playlist_id')
4157 name=m.group('name')
4158 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4159 return self._playlist_videos_info(url,name,playlist_id)
4161 def _talk_video_link(self,mediaSlug):
4162 '''Returns the video link for that mediaSlug'''
4163 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4165 def _playlist_videos_info(self,url,name,playlist_id=0):
4166 '''Returns the videos of the playlist'''
# Verbose regex over playlist markup; pattern opener elided in this excerpt.
4168 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4169 ([.\s]*?)data-playlist_item_id="(\d+)"
4170 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4172 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4173 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
# Pair each talk entry with its title link; zip assumes both regexes
# match in the same order and count — verify against page markup.
4174 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4175 m_names=re.finditer(video_name_RE,webpage)
4177 for m_video, m_name in zip(m_videos,m_names):
4178 video_id=m_video.group('video_id')
4179 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
# Each talk page is fetched individually for its full info.
4180 info.append(self._talk_info(talk_url,video_id))
4183 def _talk_info(self, url, video_id=0):
4184 """Return the video for the talk in the url"""
4185 m=re.match(self._VALID_URL, url,re.VERBOSE)
4186 videoName=m.group('name')
4187 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4188 # If the url includes the language we get the title translated
4189 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
# NOTE(review): re.search(...).group() without a None check — raises
# AttributeError if the page layout changes.
4190 title=re.search(title_RE, webpage).group('title')
# talkDetails JS object carries the numeric id and the media slug.
4191 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4192 "id":(?P<videoID>[\d]+).*?
4193 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4194 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4195 thumb_match=re.search(thumb_RE,webpage)
4196 info_match=re.search(info_RE,webpage,re.VERBOSE)
4197 video_id=info_match.group('videoID')
4198 mediaSlug=info_match.group('mediaSlug')
# Download URL is derived from the slug, not scraped directly.
4199 video_url=self._talk_video_link(mediaSlug)
# Result dict (opening lines elided in this excerpt).
4205 'thumbnail': thumb_match.group('thumbnail')
# Information extractor for myspass.de, driven by the site's XML metadata API.
4209 class MySpassIE(InfoExtractor):
4210 _VALID_URL = r'http://www.myspass.de/.*'
4212 def _real_extract(self, url):
4213 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4215 # video id is the last path element of the URL
4216 # usually there is a trailing slash, so also try the second but last
4217 url_path = compat_urllib_parse_urlparse(url).path
4218 url_parent_path, video_id = os.path.split(url_path)
# presumably under an elided `if not video_id:` guard — verify
4220 _, video_id = os.path.split(url_parent_path)
4223 metadata_url = META_DATA_URL_TEMPLATE % video_id
4224 metadata_text = self._download_webpage(metadata_url, video_id)
# Re-encode to bytes because ElementTree's fromstring expects encoded
# input when the document declares its own encoding.
4225 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4227 # extract values from metadata
4228 url_flv_el = metadata.find('url_flv')
4229 if url_flv_el is None:
# NOTE(review): report_error without a visible `return` — execution
# would fall through to url_flv_el.text and crash; verify elision.
4230 self._downloader.report_error(u'unable to extract download url')
4232 video_url = url_flv_el.text
# File extension is taken from the media URL itself.
4233 extension = os.path.splitext(video_url)[1][1:]
4234 title_el = metadata.find('title')
4235 if title_el is None:
4236 self._downloader.report_error(u'unable to extract title')
4238 title = title_el.text
4239 format_id_el = metadata.find('format_id')
4240 if format_id_el is None:
# fallback branch elided; format defaults presumably set above — verify
4243 format = format_id_el.text
# Optional fields: only set when present in the XML. The `else` /
# default assignments are elided in this excerpt — without them,
# description/thumbnail could be unbound below.
4244 description_el = metadata.find('description')
4245 if description_el is not None:
4246 description = description_el.text
4249 imagePreview_el = metadata.find('imagePreview')
4250 if imagePreview_el is not None:
4251 thumbnail = imagePreview_el.text
# Result dict (opening lines elided in this excerpt).
4260 'thumbnail': thumbnail,
4261 'description': description
# Information extractor for spiegel.de videos: the page gives the title,
# a per-video XML manifest gives the actual stream filename and duration.
4265 class SpiegelIE(InfoExtractor):
4266 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4268 def _real_extract(self, url):
4269 m = re.match(self._VALID_URL, url)
4270 video_id = m.group('videoID')
4272 webpage = self._download_webpage(url, video_id)
4273 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
# presumably under an elided `if m is None:` guard — verify
4275 raise ExtractorError(u'Cannot find title')
4276 video_title = unescapeHTML(m.group(1))
4278 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4279 xml_code = self._download_webpage(xml_url, video_id,
4280 note=u'Downloading XML', errnote=u'Failed to download XML')
4282 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The manifest lists several quality variants; the last element is
# used — presumably the highest quality, verify against the XML.
4283 last_type = idoc[-1]
4284 filename = last_type.findall('./filename')[0].text
4285 duration = float(last_type.findall('./duration')[0].text)
4287 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension derived from the manifest filename.
4288 video_ext = filename.rpartition('.')[2]
# Result dict (opening lines elided in this excerpt).
4293 'title': video_title,
4294 'duration': duration,
# Information extractor for liveleak.com.
4298 class LiveLeakIE(InfoExtractor):
4300 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4301 IE_NAME = u'liveleak'
4303 def _real_extract(self, url):
4304 mobj = re.match(self._VALID_URL, url)
# NOTE(review): uses the older `trouble` API here and at the title
# check below, but `report_error` for the video url — inconsistent;
# should be unified on report_error.
4306 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4309 video_id = mobj.group('video_id')
4311 webpage = self._download_webpage(url, video_id)
# Direct media URL sits in the player config: file: "...",
4313 m = re.search(r'file: "(.*?)",', webpage)
# presumably under an elided `if m is None:` guard — verify
4315 self._downloader.report_error(u'unable to find video url')
4317 video_url = m.group(1)
4319 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4321 self._downloader.trouble(u'Cannot find video title')
# Strip the site-name prefix LiveLeak puts in og:title.
4322 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
# Description and uploader are optional; the None-default branches are
# elided in this excerpt.
4324 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4326 desc = unescapeHTML(m.group('desc'))
4330 m = re.search(r'By:.*?(\w+)</a>', webpage)
4332 uploader = clean_html(m.group(1))
# Result dict (opening lines elided in this excerpt).
4341 'description': desc,
4342 'uploader': uploader
4348 def gen_extractors():
4349 """ Return a list of an instance of every supported extractor.
4350 The order does matter; the first extractor matched is the one handling the URL.
4353 YoutubePlaylistIE(),
4378 StanfordOpenClassroomIE(),
4388 WorldStarHipHopIE(),