2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields
    (NOTE(review): 'id'/'url' entries appear to be elided from this copy
    of the docstring -- confirm against the full file):

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a FileDownloader)."""
        # Stores the downloader for use by extraction/reporting helpers.
        self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates straight to the subclass hook; presumably initialize()
        # is invoked before this elsewhere -- TODO confirm against full file.
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Kept on the instance for later use by _download_webpage and the
        # report_* helpers.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op (docstring-only body) in the base class.
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 note = u'Downloading video webpage'
118 if note is not False:
119 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self._downloader.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 return webpage_bytes.decode(encoding, 'replace')
147 #Methods for following #608
148 #They set the correct value of the '_type' key
149 def video_result(self, video_info):
150 """Returns a video"""
151 video_info['_type'] = 'video'
153 def url_result(self, url, ie=None):
154 """Returns a url that points to a page that should be processed"""
155 #TODO: ie should be the class used for getting the info
156 video_info = {'_type': 'url',
159 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
160 """Returns a playlist"""
161 video_info = {'_type': 'playlist',
164 video_info['id'] = playlist_id
166 video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
175 (?:https?://)? # http(s):// (optional)
176 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
177 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
178 (?:.*?\#/)? # handle anchor (#/) redirect urls
179 (?: # the various things that can precede the ID:
180 (?:(?:v|embed|e)/) # v/ or embed/ or e/
181 |(?: # or the v= param in all its forms
182 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
183 (?:\?|\#!?) # the params delimiter ? or # or #!
184 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
187 )? # optional -> youtube.com/xxxx is OK
188 )? # all until now is optional -> you can pass the naked ID
189 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
190 (?(1).+)? # if we found the ID, everything can follow
    # Forces an English/US interface so scraped strings are predictable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of a next_url= redirect parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, but free (WebM) formats promoted ahead of equivalents.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
200 _video_extensions = {
206 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
212 _video_dimensions = {
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match the video regexp, so defer those to the
        # playlist extractor. (Takes `cls` -- presumably a classmethod;
        # decorator not visible in this copy.)
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
236 def report_lang(self):
237 """Report attempt to set language."""
238 self._downloader.to_screen(u'[youtube] Setting language')
240 def report_login(self):
241 """Report attempt to log in."""
242 self._downloader.to_screen(u'[youtube] Logging in')
244 def report_age_confirmation(self):
245 """Report attempt to confirm age."""
246 self._downloader.to_screen(u'[youtube] Confirming age')
248 def report_video_webpage_download(self, video_id):
249 """Report attempt to download video webpage."""
250 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
252 def report_video_info_webpage_download(self, video_id):
253 """Report attempt to download video info webpage."""
254 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
256 def report_video_subtitles_download(self, video_id):
257 """Report attempt to download video info webpage."""
258 self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
260 def report_video_subtitles_request(self, video_id, sub_lang, format):
261 """Report attempt to download video info webpage."""
262 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
264 def report_video_subtitles_available(self, video_id, sub_lang_list):
265 """Report available subtitles."""
266 sub_lang = ",".join(list(sub_lang_list.keys()))
267 self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
269 def report_information_extraction(self, video_id):
270 """Report attempt to extract video information."""
271 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
273 def report_unavailable_format(self, video_id, format):
274 """Report extracted video URL."""
275 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
277 def report_rtmp_download(self):
278 """Indicate the download will use the RTMP protocol."""
279 self._downloader.to_screen(u'[youtube] RTMP download detected')
281 def _get_available_subtitles(self, video_id):
282 self.report_video_subtitles_download(video_id)
283 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
285 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
286 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
287 return (u'unable to download video subtitles: %s' % compat_str(err), None)
288 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
289 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
290 if not sub_lang_list:
291 return (u'video doesn\'t have subtitles', None)
294 def _list_available_subtitles(self, video_id):
295 sub_lang_list = self._get_available_subtitles(video_id)
296 self.report_video_subtitles_available(video_id, sub_lang_list)
298 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
301 (error_message, sub_lang, sub)
303 self.report_video_subtitles_request(video_id, sub_lang, format)
304 params = compat_urllib_parse.urlencode({
310 url = 'http://www.youtube.com/api/timedtext?' + params
312 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
313 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
314 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
316 return (u'Did not fetch video subtitles', None, None)
317 return (None, sub_lang, sub)
319 def _extract_subtitle(self, video_id):
321 Return a list with a tuple:
322 [(error_message, sub_lang, sub)]
324 sub_lang_list = self._get_available_subtitles(video_id)
325 sub_format = self._downloader.params.get('subtitlesformat')
326 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
327 return [(sub_lang_list[0], None, None)]
328 if self._downloader.params.get('subtitleslang', False):
329 sub_lang = self._downloader.params.get('subtitleslang')
330 elif 'en' in sub_lang_list:
333 sub_lang = list(sub_lang_list.keys())[0]
334 if not sub_lang in sub_lang_list:
335 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
337 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
340 def _extract_all_subtitles(self, video_id):
341 sub_lang_list = self._get_available_subtitles(video_id)
342 sub_format = self._downloader.params.get('subtitlesformat')
343 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
344 return [(sub_lang_list[0], None, None)]
346 for sub_lang in sub_lang_list:
347 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
348 subtitles.append(subtitle)
351 def _print_formats(self, formats):
352 print('Available formats:')
354 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
356 def _real_initialize(self):
357 if self._downloader is None:
362 downloader_params = self._downloader.params
364 # Attempt to use provided username and password or .netrc data
365 if downloader_params.get('username', None) is not None:
366 username = downloader_params['username']
367 password = downloader_params['password']
368 elif downloader_params.get('usenetrc', False):
370 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
375 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
376 except (IOError, netrc.NetrcParseError) as err:
377 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
381 request = compat_urllib_request.Request(self._LANG_URL)
384 compat_urllib_request.urlopen(request).read()
385 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
386 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
389 # No authentication to be performed
393 request = compat_urllib_request.Request(self._LOGIN_URL)
395 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
397 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
402 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
404 galx = match.group(1)
406 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
412 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
416 u'PersistentCookie': u'yes',
418 u'bgresponse': u'js_disabled',
419 u'checkConnection': u'',
420 u'checkedDomains': u'youtube',
426 u'signIn': u'Sign in',
428 u'service': u'youtube',
432 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
434 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
435 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
436 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
439 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
440 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
441 self._downloader.report_warning(u'unable to log in: bad username or password')
443 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
444 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
450 'action_confirm': 'Confirm',
452 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
454 self.report_age_confirmation()
455 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
457 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
460 def _extract_id(self, url):
461 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
463 self._downloader.report_error(u'invalid URL: %s' % url)
465 video_id = mobj.group(2)
468 def _real_extract(self, url):
469 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
470 mobj = re.search(self._NEXT_URL_RE, url)
472 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
473 video_id = self._extract_id(url)
476 self.report_video_webpage_download(video_id)
477 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
478 request = compat_urllib_request.Request(url)
480 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
481 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
482 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
485 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
487 # Attempt to extract SWF player URL
488 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
490 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
495 self.report_video_info_webpage_download(video_id)
496 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
497 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
498 % (video_id, el_type))
499 video_info_webpage = self._download_webpage(video_info_url, video_id,
501 errnote='unable to download video info webpage')
502 video_info = compat_parse_qs(video_info_webpage)
503 if 'token' in video_info:
505 if 'token' not in video_info:
506 if 'reason' in video_info:
507 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
509 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
512 # Check for "rental" videos
513 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
514 self._downloader.report_error(u'"rental" videos not supported')
517 # Start extracting information
518 self.report_information_extraction(video_id)
521 if 'author' not in video_info:
522 self._downloader.report_error(u'unable to extract uploader name')
524 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
527 video_uploader_id = None
528 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
530 video_uploader_id = mobj.group(1)
532 self._downloader.report_warning(u'unable to extract uploader nickname')
535 if 'title' not in video_info:
536 self._downloader.report_error(u'unable to extract video title')
538 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
541 if 'thumbnail_url' not in video_info:
542 self._downloader.report_warning(u'unable to extract video thumbnail')
544 else: # don't panic if we can't find it
545 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
549 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
551 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
552 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
553 for expression in format_expressions:
555 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
560 video_description = get_element_by_id("eow-description", video_webpage)
561 if video_description:
562 video_description = clean_html(video_description)
564 video_description = ''
567 video_subtitles = None
569 if self._downloader.params.get('writesubtitles', False):
570 video_subtitles = self._extract_subtitle(video_id)
572 (sub_error, sub_lang, sub) = video_subtitles[0]
574 self._downloader.report_error(sub_error)
576 if self._downloader.params.get('allsubtitles', False):
577 video_subtitles = self._extract_all_subtitles(video_id)
578 for video_subtitle in video_subtitles:
579 (sub_error, sub_lang, sub) = video_subtitle
581 self._downloader.report_error(sub_error)
583 if self._downloader.params.get('listsubtitles', False):
584 sub_lang_list = self._list_available_subtitles(video_id)
587 if 'length_seconds' not in video_info:
588 self._downloader.report_warning(u'unable to extract video duration')
591 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
594 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
596 # Decide which formats to download
597 req_format = self._downloader.params.get('format', None)
599 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
600 self.report_rtmp_download()
601 video_url_list = [(None, video_info['conn'][0])]
602 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
603 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
604 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
605 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
606 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
608 format_limit = self._downloader.params.get('format_limit', None)
609 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
610 if format_limit is not None and format_limit in available_formats:
611 format_list = available_formats[available_formats.index(format_limit):]
613 format_list = available_formats
614 existing_formats = [x for x in format_list if x in url_map]
615 if len(existing_formats) == 0:
616 self._downloader.report_error(u'no known formats available for video')
618 if self._downloader.params.get('listformats', None):
619 self._print_formats(existing_formats)
621 if req_format is None or req_format == 'best':
622 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
623 elif req_format == 'worst':
624 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
625 elif req_format in ('-1', 'all'):
626 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
628 # Specific formats. We pick the first in a slash-delimeted sequence.
629 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
630 req_formats = req_format.split('/')
631 video_url_list = None
632 for rf in req_formats:
634 video_url_list = [(rf, url_map[rf])]
636 if video_url_list is None:
637 self._downloader.report_error(u'requested format not available')
640 self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
644 for format_param, video_real_url in video_url_list:
646 video_extension = self._video_extensions.get(format_param, 'flv')
648 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
649 self._video_dimensions.get(format_param, '???'))
653 'url': video_real_url,
654 'uploader': video_uploader,
655 'uploader_id': video_uploader_id,
656 'upload_date': upload_date,
657 'title': video_title,
658 'ext': video_extension,
659 'format': video_format,
660 'thumbnail': video_thumbnail,
661 'description': video_description,
662 'player_url': player_url,
663 'subtitles': video_subtitles,
664 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Captures (video_id, display_title) from a /watch/ URL.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint POSTed to in order to disable the family filter.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
677 def __init__(self, downloader=None):
678 InfoExtractor.__init__(self, downloader)
680 def report_disclaimer(self):
681 """Report disclaimer retrieval."""
682 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
684 def report_age_confirmation(self):
685 """Report attempt to confirm age."""
686 self._downloader.to_screen(u'[metacafe] Confirming age')
688 def report_download_webpage(self, video_id):
689 """Report webpage download."""
690 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
692 def report_extraction(self, video_id):
693 """Report information extraction."""
694 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
696 def _real_initialize(self):
697 # Retrieve disclaimer
698 request = compat_urllib_request.Request(self._DISCLAIMER)
700 self.report_disclaimer()
701 disclaimer = compat_urllib_request.urlopen(request).read()
702 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
703 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
709 'submit': "Continue - I'm over 18",
711 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
713 self.report_age_confirmation()
714 disclaimer = compat_urllib_request.urlopen(request).read()
715 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
716 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
719 def _real_extract(self, url):
720 # Extract id and simplified title from URL
721 mobj = re.match(self._VALID_URL, url)
723 self._downloader.report_error(u'invalid URL: %s' % url)
726 video_id = mobj.group(1)
728 # Check if video comes from YouTube
729 mobj2 = re.match(r'^yt-(.*)$', video_id)
730 if mobj2 is not None:
731 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
733 # Retrieve video webpage to extract further information
734 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
736 self.report_download_webpage(video_id)
737 webpage = compat_urllib_request.urlopen(request).read()
738 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
739 self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
742 # Extract URL, uploader and title from webpage
743 self.report_extraction(video_id)
744 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
746 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
747 video_extension = mediaURL[-3:]
749 # Extract gdaKey if available
750 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
754 gdaKey = mobj.group(1)
755 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
757 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
759 self._downloader.report_error(u'unable to extract media URL')
761 vardict = compat_parse_qs(mobj.group(1))
762 if 'mediaData' not in vardict:
763 self._downloader.report_error(u'unable to extract media URL')
765 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
767 self._downloader.report_error(u'unable to extract media URL')
769 mediaURL = mobj.group(1).replace('\\/', '/')
770 video_extension = mediaURL[-3:]
771 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
773 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
775 self._downloader.report_error(u'unable to extract title')
777 video_title = mobj.group(1).decode('utf-8')
779 mobj = re.search(r'submitter=(.*?);', webpage)
781 self._downloader.report_error(u'unable to extract uploader nickname')
783 video_uploader = mobj.group(1)
786 'id': video_id.decode('utf-8'),
787 'url': video_url.decode('utf-8'),
788 'uploader': video_uploader.decode('utf-8'),
790 'title': video_title,
791 'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive; captures the id+slug segment after /video/.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
802 def __init__(self, downloader=None):
803 InfoExtractor.__init__(self, downloader)
805 def report_extraction(self, video_id):
806 """Report information extraction."""
807 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
809 def _real_extract(self, url):
810 # Extract id and simplified title from URL
811 mobj = re.match(self._VALID_URL, url)
813 self._downloader.report_error(u'invalid URL: %s' % url)
816 video_id = mobj.group(1).split('_')[0].split('?')[0]
818 video_extension = 'mp4'
820 # Retrieve video webpage to extract further information
821 request = compat_urllib_request.Request(url)
822 request.add_header('Cookie', 'family_filter=off')
823 webpage = self._download_webpage(request, video_id)
825 # Extract URL, uploader and title from webpage
826 self.report_extraction(video_id)
827 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
829 self._downloader.report_error(u'unable to extract media URL')
831 flashvars = compat_urllib_parse.unquote(mobj.group(1))
833 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
836 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
839 self._downloader.report_error(u'unable to extract video URL')
842 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
844 self._downloader.report_error(u'unable to extract video URL')
847 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
849 # TODO: support choosing qualities
851 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
853 self._downloader.report_error(u'unable to extract title')
855 video_title = unescapeHTML(mobj.group('title'))
857 video_uploader = None
858 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
860 # lookin for official user
861 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
862 if mobj_official is None:
863 self._downloader.report_warning(u'unable to extract uploader nickname')
865 video_uploader = mobj_official.group(1)
867 video_uploader = mobj.group(1)
869 video_upload_date = None
870 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
872 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
877 'uploader': video_uploader,
878 'upload_date': video_upload_date,
879 'title': video_title,
880 'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Matches only URLs whose 'current' query parameter names a .flv file.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
890 def __init__(self, downloader=None):
891 InfoExtractor.__init__(self, downloader)
893 def report_download_webpage(self, video_id):
894 """Report webpage download."""
895 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
897 def report_extraction(self, video_id):
898 """Report information extraction."""
899 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
901 def _real_extract(self, url):
902 # Extract id from URL
903 mobj = re.match(self._VALID_URL, url)
905 self._downloader.report_error(u'Invalid URL: %s' % url)
908 video_id = mobj.group(1)
910 video_extension = 'flv'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
915 self.report_download_webpage(video_id)
916 webpage = compat_urllib_request.urlopen(request).read()
917 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
918 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
921 # Extract URL, uploader, and title from webpage
922 self.report_extraction(video_id)
923 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
925 self._downloader.report_error(u'unable to extract media URL')
927 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
931 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
933 self._downloader.report_error(u'unable to extract title')
935 video_title = mobj.group(1).decode('utf-8')
937 video_uploader = mobj.group(2).decode('utf-8')
940 'id': video_id.decode('utf-8'),
941 'url': video_url.decode('utf-8'),
942 'uploader': video_uploader,
944 'title': video_title,
945 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
959 def __init__(self, downloader=None):
960 InfoExtractor.__init__(self, downloader)
962 def report_download_webpage(self, video_id):
963 """Report webpage download."""
964 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
966 def report_extraction(self, video_id):
967 """Report information extraction."""
968 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
970 def _real_extract(self, url, new_video=True):
971 # Extract ID from URL
972 mobj = re.match(self._VALID_URL, url)
974 self._downloader.report_error(u'Invalid URL: %s' % url)
977 video_id = mobj.group(2)
978 video_extension = 'flv'
980 # Rewrite valid but non-extractable URLs as
981 # extractable English language /watch/ URLs
982 if re.match(self._VPAGE_URL, url) is None:
983 request = compat_urllib_request.Request(url)
985 webpage = compat_urllib_request.urlopen(request).read()
986 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
987 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
990 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
992 self._downloader.report_error(u'Unable to extract id field')
994 yahoo_id = mobj.group(1)
996 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
998 self._downloader.report_error(u'Unable to extract vid field')
1000 yahoo_vid = mobj.group(1)
1002 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1003 return self._real_extract(url, new_video=False)
1005 # Retrieve video webpage to extract further information
1006 request = compat_urllib_request.Request(url)
1008 self.report_download_webpage(video_id)
1009 webpage = compat_urllib_request.urlopen(request).read()
1010 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1011 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1014 # Extract uploader and title from webpage
1015 self.report_extraction(video_id)
1016 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1018 self._downloader.report_error(u'unable to extract video title')
1020 video_title = mobj.group(1).decode('utf-8')
1022 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1024 self._downloader.report_error(u'unable to extract video uploader')
1026 video_uploader = mobj.group(1).decode('utf-8')
1028 # Extract video thumbnail
1029 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1031 self._downloader.report_error(u'unable to extract video thumbnail')
1033 video_thumbnail = mobj.group(1).decode('utf-8')
1035 # Extract video description
1036 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1038 self._downloader.report_error(u'unable to extract video description')
1040 video_description = mobj.group(1).decode('utf-8')
1041 if not video_description:
1042 video_description = 'No description available.'
1044 # Extract video height and width
1045 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1047 self._downloader.report_error(u'unable to extract video height')
1049 yv_video_height = mobj.group(1)
1051 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1053 self._downloader.report_error(u'unable to extract video width')
1055 yv_video_width = mobj.group(1)
1057 # Retrieve video playlist to extract media URL
1058 # I'm not completely sure what all these options are, but we
1059 # seem to need most of them, otherwise the server sends a 401.
1060 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1061 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1062 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1063 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1064 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1066 self.report_download_webpage(video_id)
1067 webpage = compat_urllib_request.urlopen(request).read()
1068 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1069 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1072 # Extract media URL from playlist XML
1073 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1075 self._downloader.report_error(u'Unable to extract media URL')
1077 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1078 video_url = unescapeHTML(video_url)
1081 'id': video_id.decode('utf-8'),
1083 'uploader': video_uploader,
1084 'upload_date': None,
1085 'title': video_title,
1086 'ext': video_extension.decode('utf-8'),
1087 'thumbnail': video_thumbnail.decode('utf-8'),
1088 'description': video_description,
1092 class VimeoIE(InfoExtractor):
# NOTE(review): numbered listing with interior lines elided (guard clauses,
# `try:` headers, `break`s and `return`s are not all visible) — confirm
# control flow against the full file.
1093 """Information extractor for vimeo.com."""
1095 # _VALID_URL matches Vimeo URLs
1096 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1099 def __init__(self, downloader=None):
1100 InfoExtractor.__init__(self, downloader)
1102 def report_download_webpage(self, video_id):
1103 """Report webpage download."""
1104 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1106 def report_extraction(self, video_id):
1107 """Report information extraction."""
1108 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1110 def _real_extract(self, url, new_video=True):
1111 # Extract ID from URL
1112 mobj = re.match(self._VALID_URL, url)
1114 self._downloader.report_error(u'Invalid URL: %s' % url)
1117 video_id = mobj.group('id')
# Normalize the URL: force https, and rewrite play_redirect_hls direct
# links to the canonical watch page.
1118 if not mobj.group('proto'):
1119 url = 'https://' + url
1120 if mobj.group('direct_link'):
1121 url = 'https://vimeo.com/' + video_id
1123 # Retrieve video webpage to extract further information
1124 request = compat_urllib_request.Request(url, None, std_headers)
1126 self.report_download_webpage(video_id)
1127 webpage_bytes = compat_urllib_request.urlopen(request).read()
1128 webpage = webpage_bytes.decode('utf-8')
1129 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1130 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1133 # Now we begin extracting as much information as we can from what we
1134 # retrieved. First we extract the information common to all extractors,
1135 # and latter we extract those that are Vimeo specific.
1136 self.report_extraction(video_id)
1138 # Extract the config JSON
# Fragile: relies on the literal ' = {config:' / ',assets:' markers in the
# page source rather than a real JSON parse of the whole script tag.
1140 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1141 config = json.loads(config)
1143 self._downloader.report_error(u'unable to extract info section')
1147 video_title = config["video"]["title"]
1149 # Extract uploader and uploader_id
# uploader_id is the last path segment of the owner's profile URL.
1150 video_uploader = config["video"]["owner"]["name"]
1151 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1153 # Extract video thumbnail
1154 video_thumbnail = config["video"]["thumbnail"]
1156 # Extract video description
1157 video_description = get_element_by_attribute("itemprop", "description", webpage)
1158 if video_description: video_description = clean_html(video_description)
1159 else: video_description = u''
1161 # Extract upload date
# Converted from ISO "YYYY-MM-DDT..." to the YYYYMMDD form the
# InfoExtractor contract expects for upload_date.
1162 video_upload_date = None
1163 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1164 if mobj is not None:
1165 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1167 # Vimeo specific: extract request signature and timestamp
1168 sig = config['request']['signature']
1169 timestamp = config['request']['timestamp']
1171 # Vimeo specific: extract video codec and quality information
1172 # First consider quality, then codecs, then take everything
1173 # TODO bind to format param
1174 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1175 files = { 'hd': [], 'sd': [], 'other': []}
1176 for codec_name, codec_extension in codecs:
1177 if codec_name in config["video"]["files"]:
1178 if 'hd' in config["video"]["files"][codec_name]:
1179 files['hd'].append((codec_name, codec_extension, 'hd'))
1180 elif 'sd' in config["video"]["files"][codec_name]:
1181 files['sd'].append((codec_name, codec_extension, 'sd'))
1183 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first candidate in hd > sd > other preference order; the loop
# exit after a match (break/else) is elided in this listing.
1185 for quality in ('hd', 'sd', 'other'):
1186 if len(files[quality]) > 0:
1187 video_quality = files[quality][0][2]
1188 video_codec = files[quality][0][0]
1189 video_extension = files[quality][0][1]
1190 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1193 self._downloader.report_error(u'no known codec found')
# Build the signed play_redirect URL from the request signature/timestamp.
1196 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1197 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dictionary (surrounding `return [{...}]` lines elided in listing).
1202 'uploader': video_uploader,
1203 'uploader_id': video_uploader_id,
1204 'upload_date': video_upload_date,
1205 'title': video_title,
1206 'ext': video_extension,
1207 'thumbnail': video_thumbnail,
1208 'description': video_description,
1212 class ArteTvIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines — `try:` headers, guard
# clauses and `return` statements around the calls below are not all visible.
1213 """arte.tv information extractor."""
1215 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1216 _LIVE_URL = r'index-[0-9]+\.html$'
1218 IE_NAME = u'arte.tv'
1220 def __init__(self, downloader=None):
1221 InfoExtractor.__init__(self, downloader)
1223 def report_download_webpage(self, video_id):
1224 """Report webpage download."""
1225 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1227 def report_extraction(self, video_id):
1228 """Report information extraction."""
1229 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1231 def fetch_webpage(self, url):
# Download `url` and return the page body; errors are routed through the
# downloader's error reporting (the `try:` and `return` lines are elided).
1232 request = compat_urllib_request.Request(url)
1234 self.report_download_webpage(url)
1235 webpage = compat_urllib_request.urlopen(request).read()
1236 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1237 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1239 except ValueError as err:
1240 self._downloader.report_error(u'Invalid URL: %s' % url)
1244 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex` with `regexFlags`, and collect the capture
# groups named by matchTuples [(group_index, key, error_message), ...]
# into an info dict (the dict init and `return info` are elided here).
1245 page = self.fetch_webpage(url)
1246 mobj = re.search(regex, page, regexFlags)
1250 self._downloader.report_error(u'Invalid URL: %s' % url)
1253 for (i, key, err) in matchTuples:
1254 if mobj.group(i) is None:
1255 self._downloader.trouble(err)
1258 info[key] = mobj.group(i)
1262 def extractLiveStream(self, url):
# Two-step scrape: find the videothek JS on the page, then pull the
# geo-gated stream path + SWF player out of that JS.
1263 video_lang = url.split('/')[-4]
1264 info = self.grep_webpage(
1266 r'src="(.*?/videothek_js.*?\.js)',
1269 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1272 http_host = url.split('/')[2]
1273 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1274 info = self.grep_webpage(
1276 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1277 '(http://.*?\.swf).*?' +
1281 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1282 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1283 (3, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): video_url is assembled but no `return` is visible in this
# method — the caller in _real_extract also discards the call's result.
# Confirm whether live-stream extraction actually produces output.
1286 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1288 def extractPlus7Stream(self, url):
# Three-step scrape for the +7 catch-up player: movie param -> language
# <video> ref -> final metadata (id, title, date, hd url).
1289 video_lang = url.split('/')[-3]
1290 info = self.grep_webpage(
1292 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1295 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1298 next_url = compat_urllib_parse.unquote(info.get('url'))
1299 info = self.grep_webpage(
1301 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1304 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1307 next_url = compat_urllib_parse.unquote(info.get('url'))
1309 info = self.grep_webpage(
1311 r'<video id="(.*?)".*?>.*?' +
1312 '<name>(.*?)</name>.*?' +
1313 '<dateVideo>(.*?)</dateVideo>.*?' +
1314 '<url quality="hd">(.*?)</url>',
1317 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1318 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1319 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1320 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Result dictionary (surrounding return/literal lines elided in listing).
1325 'id': info.get('id'),
1326 'url': compat_urllib_parse.unquote(info.get('url')),
1327 'uploader': u'arte.tv',
1328 'upload_date': info.get('date'),
1329 'title': info.get('title').decode('utf-8'),
1335 def _real_extract(self, url):
# Dispatch on URL shape: live index pages vs +7 catch-up pages.
1336 video_id = url.split('/')[-1]
1337 self.report_extraction(video_id)
1339 if re.search(self._LIVE_URL, video_id) is not None:
1340 self.extractLiveStream(url)
1343 info = self.extractPlus7Stream(url)
1348 class GenericIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines (guards, `try:`s,
# `return`s and some method bodies are not fully visible).
1349 """Generic last-resort information extractor."""
1352 IE_NAME = u'generic'
1354 def __init__(self, downloader=None):
1355 InfoExtractor.__init__(self, downloader)
1357 def report_download_webpage(self, video_id):
1358 """Report webpage download."""
# The fallback warning is suppressed in test mode to keep test output clean.
1359 if not self._downloader.params.get('test', False):
1360 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1361 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1363 def report_extraction(self, video_id):
1364 """Report information extraction."""
1365 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1367 def report_following_redirect(self, new_url):
1368 """Report information extraction."""
1369 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1371 def _test_redirect(self, url):
1372 """Check if it is a redirect, like url shorteners, in case return the new url."""
1373 class HeadRequest(compat_urllib_request.Request):
# Forces the HTTP method to HEAD (the return value of get_method is
# elided in this listing — presumably "HEAD").
1374 def get_method(self):
1377 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1379 Subclass the HTTPRedirectHandler to make it use our
1380 HeadRequest also on the redirected URL
1382 def redirect_request(self, req, fp, code, msg, headers, newurl):
1383 if code in (301, 302, 303, 307):
1384 newurl = newurl.replace(' ', '%20')
# Strip body-describing headers, which don't apply to the new HEAD.
1385 newheaders = dict((k,v) for k,v in req.headers.items()
1386 if k.lower() not in ("content-length", "content-type"))
1387 return HeadRequest(newurl,
1389 origin_req_host=req.get_origin_req_host(),
1392 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1394 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1396 Fallback to GET if HEAD is not allowed (405 HTTP error)
1398 def http_error_405(self, req, fp, code, msg, headers):
1402 newheaders = dict((k,v) for k,v in req.headers.items()
1403 if k.lower() not in ("content-length", "content-type"))
# Re-issue the same URL as a plain (GET) Request through the parent opener.
1404 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1406 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with just the handlers needed for the HEAD probe.
1410 opener = compat_urllib_request.OpenerDirector()
1411 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1412 HTTPMethodFallback, HEADRedirectHandler,
1413 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1414 opener.add_handler(handler())
1416 response = opener.open(HeadRequest(url))
1417 new_url = response.geturl()
# The comparison of new_url against the original url (and the non-redirect
# return path) is elided in this listing.
1422 self.report_following_redirect(new_url)
1425 def _real_extract(self, url):
1426 new_url = self._test_redirect(url)
1427 if new_url: return [self.url_result(new_url)]
1429 video_id = url.split('/')[-1]
1431 webpage = self._download_webpage(url, video_id)
1432 except ValueError as err:
1433 # since this is the last-resort InfoExtractor, if
1434 # this error is thrown, it'll be thrown here
1435 self._downloader.report_error(u'Invalid URL: %s' % url)
1438 self.report_extraction(video_id)
1439 # Start with something easy: JW Player in SWFObject
# Three progressively broader regexes for a direct media URL; each fallback
# presumably runs only when the previous search failed (guards elided).
1440 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1442 # Broaden the search a little bit
1443 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1445 # Broaden the search a little bit: JWPlayer JS loader
1446 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1448 self._downloader.report_error(u'Invalid URL: %s' % url)
1451 # It's possible that one of the regexes
1452 # matched, but returned an empty group:
1453 if mobj.group(1) is None:
1454 self._downloader.report_error(u'Invalid URL: %s' % url)
1457 video_url = compat_urllib_parse.unquote(mobj.group(1))
1458 video_id = os.path.basename(video_url)
1460 # here's a fun little line of code for you:
1461 video_extension = os.path.splitext(video_id)[1][1:]
1462 video_id = os.path.splitext(video_id)[0]
1464 # it's tempting to parse this further, but you would
1465 # have to take into account all the variations like
1466 # Video Title - Site Name
1467 # Site Name | Video Title
1468 # Video Title - Tagline | Site Name
1469 # and so on and so forth; it's just not practical
1470 mobj = re.search(r'<title>(.*)</title>', webpage)
1472 self._downloader.report_error(u'unable to extract title')
1474 video_title = mobj.group(1)
1476 # video uploader is domain name
1477 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1479 self._downloader.report_error(u'unable to extract title')
1481 video_uploader = mobj.group(1)
# Result dictionary (surrounding `return [{...}]` lines elided in listing).
1486 'uploader': video_uploader,
1487 'upload_date': None,
1488 'title': video_title,
1489 'ext': video_extension,
1493 class YoutubeSearchIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines — the `if mobj is None:`
# guards, `try:` headers and some `return`s are not visible.
1494 """Information Extractor for YouTube search queries."""
1495 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1496 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1497 _max_youtube_results = 1000
1498 IE_NAME = u'youtube:search'
1500 def __init__(self, downloader=None):
1501 InfoExtractor.__init__(self, downloader)
1503 def report_download_page(self, query, pagenum):
1504 """Report attempt to download search page with given number."""
# query.decode(...) implies a byte-string query (Python 2 era).
1505 query = query.decode(preferredencoding())
1506 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1508 def _real_extract(self, query):
# Parse the "ytsearch<N>:terms" form: no prefix -> 1 result,
# "all" -> _max_youtube_results, otherwise N (clamped to the max).
1509 mobj = re.match(self._VALID_URL, query)
1511 self._downloader.report_error(u'invalid search query "%s"' % query)
1514 prefix, query = query.split(':')
1516 query = query.encode('utf-8')
1518 self._download_n_results(query, 1)
1520 elif prefix == 'all':
1521 self._download_n_results(query, self._max_youtube_results)
1527 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1529 elif n > self._max_youtube_results:
1530 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1531 n = self._max_youtube_results
1532 self._download_n_results(query, n)
1534 except ValueError: # parsing prefix as integer fails
1535 self._download_n_results(query, 1)
1538 def _download_n_results(self, query, n):
1539 """Downloads a specified number of results for a query"""
# Pages through the GData API 50 ids at a time, then hands each watch URL
# straight to the downloader (rather than returning info dicts).
1545 while (50 * pagenum) < limit:
1546 self.report_download_page(query, pagenum+1)
1547 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1548 request = compat_urllib_request.Request(result_url)
1550 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1551 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1552 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1554 api_response = json.loads(data)['data']
1556 if not 'items' in api_response:
1557 self._downloader.trouble(u'[youtube] No video results')
1560 new_ids = list(video['id'] for video in api_response['items'])
1561 video_ids += new_ids
# Tighten the loop bound to what the API says actually exists.
1563 limit = min(n, api_response['totalItems'])
1566 if len(video_ids) > n:
1567 video_ids = video_ids[:n]
1568 for id in video_ids:
1569 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1573 class GoogleSearchIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines (guards, `try:`s,
# loop/return scaffolding not fully visible). Structure parallels
# YoutubeSearchIE but scrapes HTML result pages instead of an API.
1574 """Information Extractor for Google Video search queries."""
1575 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1576 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1577 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1578 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1579 _max_google_results = 1000
1580 IE_NAME = u'video.google:search'
1582 def __init__(self, downloader=None):
1583 InfoExtractor.__init__(self, downloader)
1585 def report_download_page(self, query, pagenum):
1586 """Report attempt to download playlist page with given number."""
1587 query = query.decode(preferredencoding())
1588 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1590 def _real_extract(self, query):
# Parse "gvsearch<N>:terms": no prefix -> 1, "all" -> max, else N clamped.
1591 mobj = re.match(self._VALID_URL, query)
1593 self._downloader.report_error(u'invalid search query "%s"' % query)
1596 prefix, query = query.split(':')
1598 query = query.encode('utf-8')
1600 self._download_n_results(query, 1)
1602 elif prefix == 'all':
1603 self._download_n_results(query, self._max_google_results)
1609 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1611 elif n > self._max_google_results:
1612 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1613 n = self._max_google_results
1614 self._download_n_results(query, n)
1616 except ValueError: # parsing prefix as integer fails
1617 self._download_n_results(query, 1)
1620 def _download_n_results(self, query, n):
1621 """Downloads a specified number of results for a query"""
# Scrape result pages 10 at a time until n ids are collected or the
# "next page" marker disappears; then queue each videoplay URL.
1627 self.report_download_page(query, pagenum)
1628 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1629 request = compat_urllib_request.Request(result_url)
1631 page = compat_urllib_request.urlopen(request).read()
1632 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1633 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1636 # Extract video identifiers
1637 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1638 video_id = mobj.group(1)
1639 if video_id not in video_ids:
1640 video_ids.append(video_id)
1641 if len(video_ids) == n:
1642 # Specified n videos reached
1643 for id in video_ids:
1644 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No more pages: download whatever was collected and stop.
1647 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1648 for id in video_ids:
1649 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1652 pagenum = pagenum + 1
1655 class YahooSearchIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines. Same shape as
# GoogleSearchIE, but with an explicit `already_seen` set for dedup.
1656 """Information Extractor for Yahoo! Video search queries."""
1659 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1660 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1661 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1662 _MORE_PAGES_INDICATOR = r'\s*Next'
1663 _max_yahoo_results = 1000
1664 IE_NAME = u'video.yahoo:search'
1666 def __init__(self, downloader=None):
1667 InfoExtractor.__init__(self, downloader)
1669 def report_download_page(self, query, pagenum):
1670 """Report attempt to download playlist page with given number."""
1671 query = query.decode(preferredencoding())
1672 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1674 def _real_extract(self, query):
# Parse "yvsearch<N>:terms": no prefix -> 1, "all" -> max, else N clamped.
1675 mobj = re.match(self._VALID_URL, query)
1677 self._downloader.report_error(u'invalid search query "%s"' % query)
1680 prefix, query = query.split(':')
1682 query = query.encode('utf-8')
1684 self._download_n_results(query, 1)
1686 elif prefix == 'all':
1687 self._download_n_results(query, self._max_yahoo_results)
1693 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1695 elif n > self._max_yahoo_results:
1696 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1697 n = self._max_yahoo_results
1698 self._download_n_results(query, n)
1700 except ValueError: # parsing prefix as integer fails
1701 self._download_n_results(query, 1)
1704 def _download_n_results(self, query, n):
1705 """Downloads a specified number of results for a query"""
1708 already_seen = set()
1712 self.report_download_page(query, pagenum)
1713 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1714 request = compat_urllib_request.Request(result_url)
1716 page = compat_urllib_request.urlopen(request).read()
1717 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1718 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1721 # Extract video identifiers
1722 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1723 video_id = mobj.group(1)
1724 if video_id not in already_seen:
1725 video_ids.append(video_id)
1726 already_seen.add(video_id)
1727 if len(video_ids) == n:
1728 # Specified n videos reached
1729 for id in video_ids:
1730 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No more pages: download whatever was collected and stop.
1733 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1734 for id in video_ids:
1735 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1738 pagenum = pagenum + 1
1741 class YoutubePlaylistIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines — parts of the verbose
# regex, the `if mobj is None:` guards, the page loop header and the
# definition of _MAX_RESULTS (referenced below) are not visible here.
1742 """Information Extractor for YouTube playlists."""
1744 _VALID_URL = r"""(?:
1749 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1750 \? (?:.*?&)*? (?:p|a|list)=
1753 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1756 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1758 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1760 IE_NAME = u'youtube:playlist'
1762 def __init__(self, downloader=None):
1763 InfoExtractor.__init__(self, downloader)
1766 def suitable(cls, url):
1767 """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required because _VALID_URL is written with whitespace
# and comments; the base-class suitable() would not pass that flag.
1768 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1770 def report_download_page(self, playlist_id, pagenum):
1771 """Report attempt to download playlist page with given number."""
1772 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1774 def _real_extract(self, url):
1775 # Extract playlist id
1776 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1778 self._downloader.report_error(u'invalid url: %s' % url)
1781 # Download playlist videos from API
# The id may come from either alternative of the pattern (full-page URL
# or bare playlist id).
1782 playlist_id = mobj.group(1) or mobj.group(2)
1787 self.report_download_page(playlist_id, page_num)
1789 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1791 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1792 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1793 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1797 response = json.loads(page)
1798 except ValueError as err:
1799 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1802 if 'feed' not in response:
1803 self._downloader.report_error(u'Got a malformed response from YouTube API')
1805 if 'entry' not in response['feed']:
1806 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch_url) pairs; entries without 'content'
# (e.g. deleted/private videos) are skipped.
1809 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1810 for entry in response['feed']['entry']
1811 if 'content' in entry ]
# A short page means this was the last page of the feed.
1813 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, keep only the URLs.
1817 videos = [v[1] for v in sorted(videos)]
1819 url_results = [self.url_result(url) for url in videos]
1820 return [self.playlist_result(url_results, playlist_id)]
1823 class YoutubeChannelIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines (loop headers, guards,
# `try:`s and `break`s are not all visible).
1824 """Information Extractor for YouTube channels."""
1826 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1827 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1828 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1829 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1830 IE_NAME = u'youtube:channel'
1832 def report_download_page(self, channel_id, pagenum):
1833 """Report attempt to download channel page with given number."""
1834 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1836 def extract_videos_from_page(self, page):
# Return the de-duplicated video ids found in watch links on `page`
# (the ids_in_page init and `return` lines are elided in this listing).
1838 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1839 if mobj.group(1) not in ids_in_page:
1840 ids_in_page.append(mobj.group(1))
1843 def _real_extract(self, url):
1844 # Extract channel id
1845 mobj = re.match(self._VALID_URL, url)
1847 self._downloader.report_error(u'invalid url: %s' % url)
1850 # Download channel page
1851 channel_id = mobj.group(1)
1855 self.report_download_page(channel_id, pagenum)
1856 url = self._TEMPLATE_URL % (channel_id, pagenum)
1857 request = compat_urllib_request.Request(url)
1859 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1861 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1864 # Extract video identifiers
1865 ids_in_page = self.extract_videos_from_page(page)
1866 video_ids.extend(ids_in_page)
1868 # Download any subsequent channel pages using the json-based channel_ajax query
# Pagination switches style here: first page is plain HTML, subsequent
# pages come back as JSON with the HTML fragment in 'content_html'.
1869 if self._MORE_PAGES_INDICATOR in page:
1871 pagenum = pagenum + 1
1873 self.report_download_page(channel_id, pagenum)
1874 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1875 request = compat_urllib_request.Request(url)
1877 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1878 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1879 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1882 page = json.loads(page)
1884 ids_in_page = self.extract_videos_from_page(page['content_html'])
1885 video_ids.extend(ids_in_page)
# Stop when the load-more widget no longer advertises further pages.
1887 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1890 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1892 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1893 url_entries = [self.url_result(url) for url in urls]
1894 return [self.playlist_result(url_entries, channel_id)]
1897 class YoutubeUserIE(InfoExtractor):
# NOTE(review): numbered listing with elided lines (guards, `try:`s, the
# pagination loop header and `break` are not all visible).
1898 """Information Extractor for YouTube users."""
1900 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1901 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1902 _GDATA_PAGE_SIZE = 50
1903 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1904 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1905 IE_NAME = u'youtube:user'
1907 def __init__(self, downloader=None):
1908 InfoExtractor.__init__(self, downloader)
1910 def report_download_page(self, username, start_index):
1911 """Report attempt to download user page."""
1912 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1913 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1915 def _real_extract(self, url):
1917 mobj = re.match(self._VALID_URL, url)
1919 self._downloader.report_error(u'invalid url: %s' % url)
1922 username = mobj.group(1)
1924 # Download video ids using YouTube Data API. Result size per
1925 # query is limited (currently to 50 videos) so we need to query
1926 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1933 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1934 self.report_download_page(username, start_index)
1936 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1939 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1940 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1941 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1944 # Extract video identifiers
# Scrape watch?v= ids out of the feed XML, de-duplicating within a page.
1947 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1948 if mobj.group(1) not in ids_in_page:
1949 ids_in_page.append(mobj.group(1))
1951 video_ids.extend(ids_in_page)
1953 # A little optimization - if current page is not
1954 # "full", ie. does not contain PAGE_SIZE video ids then
1955 # we can assume that this page is the last one - there
1956 # are no more ids on further pages - no need to query
1959 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1964 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1965 url_results = [self.url_result(url) for url in urls]
1966 return [self.playlist_result(url_results, playlist_title = username)]
# Extractor that enumerates every video of a blip.tv user by paging through
# the mobile "show_get_full_episode_list" AJAX endpoint.
1969 class BlipTVUserIE(InfoExtractor):
1970 """Information Extractor for blip.tv users."""
1972 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1974 IE_NAME = u'blip.tv:user'
1976 def __init__(self, downloader=None):
1977 InfoExtractor.__init__(self, downloader)
1979 def report_download_page(self, username, pagenum):
1980 """Report attempt to download user page."""
1981 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1982 (self.IE_NAME, username, pagenum))
# Resolve the username to a numeric users_id, then page through the episode
# list collecting hrefs, and return one playlist of url results.
1984 def _real_extract(self, url):
1986 mobj = re.match(self._VALID_URL, url)
1988 self._downloader.report_error(u'invalid url: %s' % url)
1991 username = mobj.group(1)
1993 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1995 request = compat_urllib_request.Request(url)
1998 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# The numeric users_id is scraped from the user's HTML page.
1999 mobj = re.search(r'data-users-id="([^"]+)"', page)
2000 page_base = page_base % mobj.group(1)
2001 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2002 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2006 # Download video ids using BlipTV Ajax calls. Result size per
2007 # query is limited (currently to 12 videos) so we need to query
2008 # page by page until there are no video ids - it means we got
2015 self.report_download_page(username, pagenum)
2016 url = page_base + "&page=" + str(pagenum)
2017 request = compat_urllib_request.Request( url )
2019 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
2020 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): this uses str(err) while every sibling extractor uses
# compat_str(err) — inconsistent; consider aligning with compat_str.
2021 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
2024 # Extract video identifiers
2027 for mobj in re.finditer(r'href="/([^"]+)"', page):
2028 if mobj.group(1) not in ids_in_page:
2029 ids_in_page.append(unescapeHTML(mobj.group(1)))
2031 video_ids.extend(ids_in_page)
2033 # A little optimization - if current page is not
2034 # "full", ie. does not contain PAGE_SIZE video ids then
2035 # we can assume that this page is the last one - there
2036 # are no more ids on further pages - no need to query
# NOTE(review): self._PAGE_SIZE is referenced here but its definition is not
# visible in this capture — confirm it is declared on this class upstream.
2039 if len(ids_in_page) < self._PAGE_SIZE:
2044 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2045 (self.IE_NAME, username, all_ids_count, len(video_ids)))
2047 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2048 url_entries = [self.url_result(url) for url in urls]
2049 return [self.playlist_result(url_entries, playlist_title = username)]
# Extractor for depositfiles.com: simulates pressing the "Free download"
# button and scrapes the real file URL and title out of the response page.
2052 class DepositFilesIE(InfoExtractor):
2053 """Information extractor for depositfiles.com"""
2055 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2057 def report_download_webpage(self, file_id):
2058 """Report webpage download."""
2059 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2061 def report_extraction(self, file_id):
2062 """Report information extraction."""
2063 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2065 def _real_extract(self, url):
2066 file_id = url.split('/')[-1]
2067 # Rebuild url in english locale
2068 url = 'http://depositfiles.com/en/files/' + file_id
2070 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the free-download form submission.
2071 free_download_indication = { 'gateway_result' : '1' }
2072 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2074 self.report_download_webpage(file_id)
# NOTE(review): .read() is not decoded here, so `webpage` is bytes; the
# str regex searches below would raise TypeError on Python 3 — verify.
2075 webpage = compat_urllib_request.urlopen(request).read()
2076 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2077 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2080 # Search for the real file URL
2081 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2082 if (mobj is None) or (mobj.group(1) is None):
2083 # Try to figure out reason of the error.
2084 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2085 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' should be a raw string r'\s+' to avoid an invalid
# escape-sequence warning on modern Python.
2086 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2087 self._downloader.report_error(u'%s' % restriction_message)
2089 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2092 file_url = mobj.group(1)
2093 file_extension = os.path.splitext(file_url)[1][1:]
2095 # Search for file title
2096 mobj = re.search(r'<b title="(.*?)">', webpage)
2098 self._downloader.report_error(u'unable to extract title')
# NOTE(review): .decode('utf-8') on these values is a Python 2 leftover;
# on Python 3 `str` has no .decode and this would raise AttributeError.
2100 file_title = mobj.group(1).decode('utf-8')
2103 'id': file_id.decode('utf-8'),
2104 'url': file_url.decode('utf-8'),
2106 'upload_date': None,
2107 'title': file_title,
2108 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Optionally logs in (credentials from
# command-line options or .netrc) before extraction, then parses the SWF
# parameter blob embedded in the video page.
2112 class FacebookIE(InfoExtractor):
2113 """Information Extractor for Facebook"""
2115 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2116 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2117 _NETRC_MACHINE = 'facebook'
2118 IE_NAME = u'facebook'
2120 def report_login(self):
2121 """Report attempt to log in."""
2122 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
# Perform an optional login: use --username/--password if given, otherwise
# fall back to the 'facebook' entry in ~/.netrc when --netrc was requested.
# Login failure only warns; extraction proceeds unauthenticated.
2124 def _real_initialize(self):
2125 if self._downloader is None:
2130 downloader_params = self._downloader.params
2132 # Attempt to use provided username and password or .netrc data
2133 if downloader_params.get('username', None) is not None:
2134 useremail = downloader_params['username']
2135 password = downloader_params['password']
2136 elif downloader_params.get('usenetrc', False):
2138 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2139 if info is not None:
2143 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2144 except (IOError, netrc.NetrcParseError) as err:
2145 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: skip the login step entirely.
2148 if useremail is None:
2157 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2160 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
# NOTE(review): "exceded" is a typo in this user-facing string ("exceeded");
# it is runtime text, so it is left untouched in this documentation pass.
2161 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2162 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2164 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2165 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Extract a single video: locate the JSON blob passed to the Flash player,
# prefer the HD source, fall back to SD, and scrape the page title.
2168 def _real_extract(self, url):
2169 mobj = re.match(self._VALID_URL, url)
2171 self._downloader.report_error(u'invalid URL: %s' % url)
2173 video_id = mobj.group('ID')
2175 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2176 webpage = self._download_webpage(url, video_id)
# The player parameters sit between these two literal script fragments.
2178 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2179 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2180 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2182 raise ExtractorError(u'Cannot parse data')
2183 data = dict(json.loads(m.group(1)))
2184 params_raw = compat_urllib_parse.unquote(data['params'])
2185 params = json.loads(params_raw)
2186 video_data = params['video_data'][0]
# Prefer the HD stream when present; otherwise use the SD stream.
2187 video_url = video_data.get('hd_src')
2189 video_url = video_data['sd_src']
2191 raise ExtractorError(u'Cannot find video URL')
2192 video_duration = int(video_data['video_duration'])
2193 thumbnail = video_data['thumbnail_src']
2195 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2197 raise ExtractorError(u'Cannot find title in webpage')
2198 video_title = unescapeHTML(m.group(1))
2202 'title': video_title,
2205 'duration': video_duration,
2206 'thumbnail': thumbnail,
# Extractor for single blip.tv videos. Resolves /play/ redirect URLs, then
# queries the JSON API (spoofing an iTunes User-Agent) unless the server
# answers with a direct video download.
2211 class BlipTVIE(InfoExtractor):
2212 """Information extractor for blip.tv"""
2214 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2215 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2216 IE_NAME = u'blip.tv'
2218 def report_extraction(self, file_id):
2219 """Report information extraction."""
2220 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2222 def report_direct_download(self, title):
2223 """Report information extraction."""
2224 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2226 def _real_extract(self, url):
2227 mobj = re.match(self._VALID_URL, url)
2229 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the real file id;
# follow the redirect, rebuild a canonical URL, and recurse once.
2232 urlp = compat_urllib_parse_urlparse(url)
2233 if urlp.path.startswith('/play/'):
2234 request = compat_urllib_request.Request(url)
2235 response = compat_urllib_request.urlopen(request)
2236 redirecturl = response.geturl()
2237 rurlp = compat_urllib_parse_urlparse(redirecturl)
2238 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2239 url = 'http://blip.tv/a/a-' + file_id
2240 return self._real_extract(url)
2247 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2248 request = compat_urllib_request.Request(json_url)
# blip.tv serves different (direct) responses to iTunes clients.
2249 request.add_header('User-Agent', 'iTunes/10.6.1')
2250 self.report_extraction(mobj.group(1))
2253 urlh = compat_urllib_request.urlopen(request)
2254 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2255 basename = url.split('/')[-1]
2256 title,ext = os.path.splitext(basename)
# NOTE(review): os.path.splitext returns str on Python 3, which has no
# .decode — this line is a Python 2 leftover; verify against upstream.
2257 title = title.decode('UTF-8')
2258 ext = ext.replace('.', '')
2259 self.report_direct_download(title)
2264 'upload_date': None,
2269 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2270 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2271 if info is None: # Regular URL
2273 json_code_bytes = urlh.read()
2274 json_code = json_code_bytes.decode('utf-8')
2275 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2276 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2280 json_data = json.loads(json_code)
2281 if 'Post' in json_data:
2282 data = json_data['Post']
# Convert blip.tv's "MM-DD-YY HH:MM(am|pm)" stamp to YYYYMMDD.
2286 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2287 video_url = data['media']['url']
2288 umobj = re.match(self._URL_EXT, video_url)
2290 raise ValueError('Can not determine filename extension')
2291 ext = umobj.group(1)
2294 'id': data['item_id'],
2296 'uploader': data['display_name'],
2297 'upload_date': upload_date,
2298 'title': data['title'],
2300 'format': data['media']['mimeType'],
2301 'thumbnail': data['thumbnailUrl'],
2302 'description': data['description'],
2303 'player_url': data['embedUrl'],
# The same iTunes UA must be used for the media download itself.
2304 'user_agent': 'iTunes/10.6.1',
2306 except (ValueError,KeyError) as err:
2307 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV url, title and id from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this previously called self._download.report_error —
            # '_download' does not exist (the attribute is '_downloader'),
            # so invalid URLs raised AttributeError instead of being reported.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server base URL; the playable
        # FLV lives at '<base>/<video_id>.flv'.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
# Extractor for The Daily Show / The Colbert Report. Resolves shorthand URLs,
# downloads an MRSS index of episode parts, then picks an RTMP rendition and
# rewrites it to a plain HTTP mp4 URL.
2362 class ComedyCentralIE(InfoExtractor):
2363 """Information extractor for The Daily Show and Colbert Report """
2365 # urls can be abbreviations like :thedailyshow or :colbert
2366 # urls for episodes like:
2367 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2368 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2369 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2370 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2371 |(https?://)?(www\.)?
2372 (?P<showname>thedailyshow|colbertnation)\.com/
2373 (full-episodes/(?P<episode>.*)|
2375 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2376 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, ordered worst-to-best; the last entry is the default pick.
2379 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2381 _video_extensions = {
2389 _video_dimensions = {
# suitable() is overridden because _VALID_URL is a re.VERBOSE pattern and the
# base-class match (see head of file) would not pass that flag.
2399 def suitable(cls, url):
2400 """Receives a URL and returns True if suitable for this IE."""
2401 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2403 def report_extraction(self, episode_id):
2404 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2406 def report_config_download(self, episode_id, media_id):
2407 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2409 def report_index_download(self, episode_id):
2410 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2412 def _print_formats(self, formats):
2413 print('Available formats:')
2415 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2418 def _real_extract(self, url):
2419 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2421 self._downloader.report_error(u'invalid URL: %s' % url)
# Shorthand ":tds" / ":colbert" forms are rewritten to full-episode URLs
# and re-matched so the named groups below are populated.
2424 if mobj.group('shortname'):
2425 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2426 url = u'http://www.thedailyshow.com/full-episodes/'
2428 url = u'http://www.colbertnation.com/full-episodes/'
2429 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2430 assert mobj is not None
2432 if mobj.group('clip'):
2433 if mobj.group('showname') == 'thedailyshow':
2434 epTitle = mobj.group('tdstitle')
2436 epTitle = mobj.group('cntitle')
2439 dlNewest = not mobj.group('episode')
2441 epTitle = mobj.group('showname')
2443 epTitle = mobj.group('episode')
2445 req = compat_urllib_request.Request(url)
2446 self.report_extraction(epTitle)
2448 htmlHandle = compat_urllib_request.urlopen(req)
2449 html = htmlHandle.read()
2450 webpage = html.decode('utf-8')
2451 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2452 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# "Newest episode" pages redirect; re-match the final URL to find the
# concrete episode slug.
2455 url = htmlHandle.geturl()
2456 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2458 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2460 if mobj.group('episode') == '':
2461 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2463 epTitle = mobj.group('episode')
2465 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2467 if len(mMovieParams) == 0:
2468 # The Colbert Report embeds the information in a without
2469 # a URL prefix; so extract the alternate reference
2470 # and then add the URL prefix manually.
2472 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2473 if len(altMovieParams) == 0:
2474 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2477 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2479 uri = mMovieParams[0][1]
2480 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2481 self.report_index_download(epTitle)
2483 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2484 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2485 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
# One MRSS <item> per episode part; each part is extracted independently.
2490 idoc = xml.etree.ElementTree.fromstring(indexXml)
2491 itemEls = idoc.findall('.//item')
2492 for partNum,itemEl in enumerate(itemEls):
2493 mediaId = itemEl.findall('./guid')[0].text
2494 shortMediaId = mediaId.split(':')[-1]
2495 showId = mediaId.split(':')[-2].replace('.com', '')
2496 officialTitle = itemEl.findall('./title')[0].text
2497 officialDate = itemEl.findall('./pubDate')[0].text
2499 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2500 compat_urllib_parse.urlencode({'uri': mediaId}))
2501 configReq = compat_urllib_request.Request(configUrl)
2502 self.report_config_download(epTitle, shortMediaId)
2504 configXml = compat_urllib_request.urlopen(configReq).read()
2505 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2506 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) tuples for each rendition of this part.
2509 cdoc = xml.etree.ElementTree.fromstring(configXml)
2511 for rendition in cdoc.findall('.//rendition'):
2512 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2516 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2519 if self._downloader.params.get('listformats', None):
2520 self._print_formats([i[0] for i in turls])
2523 # For now, just pick the highest bitrate
2524 format,rtmp_video_url = turls[-1]
2526 # Get the format arg from the arg stream
2527 req_format = self._downloader.params.get('format', None)
2529 # Select format if we can find one
2532 format, rtmp_video_url = f, v
# Rewrite the rtmp:// URL into the equivalent plain-HTTP mp4 location on
# the llnwd.net mirror, which avoids needing rtmpdump.
2535 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2537 raise ExtractorError(u'Cannot transform RTMP url')
2538 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2539 video_url = base + m.group('finalid')
2541 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2546 'upload_date': officialDate,
2551 'description': officialTitle,
2553 results.append(info)
# Extractor for The Escapist: scrapes og: meta tags for description/thumbnail
# and the player URL, then fetches the player's JS "config" to get the media.
2558 class EscapistIE(InfoExtractor):
2559 """Information extractor for The Escapist """
2561 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2562 IE_NAME = u'escapist'
2564 def report_extraction(self, showName):
2565 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2567 def report_config_download(self, showName):
2568 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2570 def _real_extract(self, url):
2571 mobj = re.match(self._VALID_URL, url)
2573 self._downloader.report_error(u'invalid URL: %s' % url)
2575 showName = mobj.group('showname')
2576 videoId = mobj.group('episode')
2578 self.report_extraction(showName)
2580 webPage = compat_urllib_request.urlopen(url)
2581 webPageBytes = webPage.read()
# Decode using the charset from the Content-Type header, defaulting to UTF-8.
2582 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2583 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2584 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2585 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# NOTE(review): these .group(1) calls assume every meta tag is present;
# a missing tag would raise AttributeError on None — verify upstream.
2588 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2589 description = unescapeHTML(descMatch.group(1))
2590 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2591 imgUrl = unescapeHTML(imgMatch.group(1))
2592 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2593 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a percent-encoded "config=" query with the real
# configuration location.
2594 configUrlMatch = re.search('config=(.*)$', playerUrl)
2595 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2597 self.report_config_download(showName)
2599 configJSON = compat_urllib_request.urlopen(configUrl)
2600 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2601 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2602 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2603 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2606 # Technically, it's JavaScript, not JSON
# Crude JS->JSON fixup: swap single quotes for double quotes before parsing.
2607 configJSON = configJSON.replace("'", '"')
2610 config = json.loads(configJSON)
2611 except (ValueError,) as err:
2612 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2615 playlist = config['playlist']
# Index 1 of the playlist holds the actual video entry.
2616 videoUrl = playlist[1]['url']
2621 'uploader': showName,
2622 'upload_date': None,
2625 'thumbnail': imgUrl,
2626 'description': description,
2627 'player_url': playerUrl,
# Extractor for collegehumor.com: reads the moogaloop metadata XML, then the
# Adobe HDS (f4m) manifest, and reconstructs a direct segment URL.
2632 class CollegeHumorIE(InfoExtractor):
2633 """Information extractor for collegehumor.com"""
2636 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2637 IE_NAME = u'collegehumor'
2639 def report_manifest(self, video_id):
2640 """Report information extraction."""
2641 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2643 def report_extraction(self, video_id):
2644 """Report information extraction."""
2645 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2647 def _real_extract(self, url):
2648 mobj = re.match(self._VALID_URL, url)
2650 self._downloader.report_error(u'invalid URL: %s' % url)
2652 video_id = mobj.group('videoid')
2657 'upload_date': None,
2660 self.report_extraction(video_id)
2661 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2663 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2664 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2665 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Pull title/description/thumbnail and the manifest location from the
# moogaloop metadata document.
2668 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2670 videoNode = mdoc.findall('./video')[0]
2671 info['description'] = videoNode.findall('./description')[0].text
2672 info['title'] = videoNode.findall('./caption')[0].text
2673 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2674 manifest_url = videoNode.findall('./file')[0].text
2676 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore parameter is required for the HDS manifest endpoint to respond.
2679 manifest_url += '?hdcore=2.10.3'
2680 self.report_manifest(video_id)
2682 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2683 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2684 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Note the f4m XML namespace on every element lookup below.
2687 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2689 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2690 node_id = media_node.attrib['url']
2691 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2692 except IndexError as err:
2693 self._downloader.report_error(u'Invalid manifest file')
# Rebuild the direct fragment URL from the manifest host plus ids.
2696 url_pr = compat_urllib_parse_urlparse(manifest_url)
2697 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: scrapes the flv url, title and thumbnail
# directly out of the watch-page HTML.
2704 class XVideosIE(InfoExtractor):
2705 """Information extractor for xvideos.com"""
2707 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2708 IE_NAME = u'xvideos'
2710 def report_extraction(self, video_id):
2711 """Report information extraction."""
2712 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2714 def _real_extract(self, url):
2715 mobj = re.match(self._VALID_URL, url)
2717 self._downloader.report_error(u'invalid URL: %s' % url)
2719 video_id = mobj.group(1)
2721 webpage = self._download_webpage(url, video_id)
2723 self.report_extraction(video_id)
# The page embeds a percent-encoded flv URL in its player query string.
2727 mobj = re.search(r'flv_url=(.+?)&', webpage)
2729 self._downloader.report_error(u'unable to extract video url')
2731 video_url = compat_urllib_parse.unquote(mobj.group(1))
2735 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2737 self._downloader.report_error(u'unable to extract video title')
2739 video_title = mobj.group(1)
2742 # Extract video thumbnail
2743 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2745 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0) is used deliberately: the whole matched URL is the thumbnail.
2747 video_thumbnail = mobj.group(0)
2753 'upload_date': None,
2754 'title': video_title,
2756 'thumbnail': video_thumbnail,
2757 'description': None,
# Extractor for individual SoundCloud tracks: resolves the page URL to a
# track id via the resolve API, then fetches the stream-URL map.
2763 class SoundcloudIE(InfoExtractor):
2764 """Information extractor for soundcloud.com
2765 To access the media, the uid of the song and a stream token
2766 must be extracted from the page source and the script must make
2767 a request to media.soundcloud.com/crossdomain.xml. Then
2768 the media can be grabbed by requesting from an url composed
2769 of the stream token and uid
2772 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2773 IE_NAME = u'soundcloud'
2775 def __init__(self, downloader=None):
2776 InfoExtractor.__init__(self, downloader)
2778 def report_resolve(self, video_id):
2779 """Report information extraction."""
2780 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2782 def report_extraction(self, video_id):
2783 """Report information extraction."""
2784 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2786 def _real_extract(self, url):
2787 mobj = re.match(self._VALID_URL, url)
2789 self._downloader.report_error(u'invalid URL: %s' % url)
2792 # extract uploader (which is in the url)
2793 uploader = mobj.group(1)
2794 # extract simple title (uploader + slug of song title)
2795 slug_title = mobj.group(2)
2796 simple_title = uploader + u'-' + slug_title
2798 self.report_resolve('%s/%s' % (uploader, slug_title))
# NOTE(review): the client_id below is a hard-coded API key shared with
# SoundcloudSetIE; consider hoisting it to a single class constant.
2800 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2801 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2802 request = compat_urllib_request.Request(resolv_url)
2804 info_json_bytes = compat_urllib_request.urlopen(request).read()
2805 info_json = info_json_bytes.decode('utf-8')
2806 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2807 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2810 info = json.loads(info_json)
2811 video_id = info['id']
2812 self.report_extraction('%s/%s' % (uploader, slug_title))
2814 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2815 request = compat_urllib_request.Request(streams_url)
2817 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2818 stream_json = stream_json_bytes.decode('utf-8')
2819 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2820 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
# Pick the standard 128kbps MP3 HTTP stream from the streams map.
2823 streams = json.loads(stream_json)
2824 mediaURL = streams['http_mp3_128_url']
2829 'uploader': info['user']['username'],
2830 'upload_date': info['created_at'],
2831 'title': info['title'],
2833 'description': info['description'],
# Extractor for SoundCloud sets (playlists): resolves the set, then fetches
# the stream map for every track in it.
# NOTE(review): this class calls self._downloader.trouble(u'ERROR: ...')
# throughout, while its sibling SoundcloudIE uses report_error — these should
# be made consistent (trouble is the older API).
2836 class SoundcloudSetIE(InfoExtractor):
2837 """Information extractor for soundcloud.com sets
2838 To access the media, the uid of the song and a stream token
2839 must be extracted from the page source and the script must make
2840 a request to media.soundcloud.com/crossdomain.xml. Then
2841 the media can be grabbed by requesting from an url composed
2842 of the stream token and uid
2845 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2846 IE_NAME = u'soundcloud'
2848 def __init__(self, downloader=None):
2849 InfoExtractor.__init__(self, downloader)
2851 def report_resolve(self, video_id):
2852 """Report information extraction."""
2853 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2855 def report_extraction(self, video_id):
2856 """Report information extraction."""
2857 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2859 def _real_extract(self, url):
2860 mobj = re.match(self._VALID_URL, url)
2862 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2865 # extract uploader (which is in the url)
2866 uploader = mobj.group(1)
2867 # extract simple title (uploader + slug of song title)
2868 slug_title = mobj.group(2)
2869 simple_title = uploader + u'-' + slug_title
2871 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2873 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2874 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2875 request = compat_urllib_request.Request(resolv_url)
2877 info_json_bytes = compat_urllib_request.urlopen(request).read()
2878 info_json = info_json_bytes.decode('utf-8')
2879 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2880 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The resolve API reports failures inside the JSON body ('errors' key)
# rather than via HTTP status, so check for them explicitly.
2884 info = json.loads(info_json)
2885 if 'errors' in info:
2886 for err in info['errors']:
2887 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
# One streams request per track in the set.
2890 for track in info['tracks']:
2891 video_id = track['id']
2892 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2894 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2895 request = compat_urllib_request.Request(streams_url)
2897 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2898 stream_json = stream_json_bytes.decode('utf-8')
2899 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2900 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2903 streams = json.loads(stream_json)
2904 mediaURL = streams['http_mp3_128_url']
2909 'uploader': track['user']['username'],
2910 'upload_date': track['created_at'],
2911 'title': track['title'],
2913 'description': track['description'],
# Extractor for infoq.com presentations: the real media id is base64-encoded
# in the page's "jsclassref" attribute and played back over RTMPE.
2918 class InfoQIE(InfoExtractor):
2919 """Information extractor for infoq.com"""
2920 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2922 def report_extraction(self, video_id):
2923 """Report information extraction."""
2924 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2926 def _real_extract(self, url):
2927 mobj = re.match(self._VALID_URL, url)
2929 self._downloader.report_error(u'invalid URL: %s' % url)
2932 webpage = self._download_webpage(url, video_id=url)
2933 self.report_extraction(url)
# Decode the base64 + percent-encoded media reference into the RTMPE path.
2936 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2938 self._downloader.report_error(u'unable to extract video url')
2940 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2941 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2944 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2946 self._downloader.report_error(u'unable to extract video title')
2948 video_title = mobj.group(1)
2950 # Extract description
2951 video_description = u'No description available.'
2952 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2953 if mobj is not None:
2954 video_description = mobj.group(1)
# Derive the id and extension from the final path component of the URL.
2956 video_filename = video_url.split('/')[-1]
2957 video_id, extension = video_filename.split('.')
2963 'upload_date': None,
2964 'title': video_title,
2965 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2967 'description': video_description,
# Extractor for Mixcloud cloudcasts via the legacy /api/1/cloudcast JSON API.
2972 class MixcloudIE(InfoExtractor):
2973 """Information extractor for www.mixcloud.com"""
# _WORKING = False marks the IE broken: users are warned and tests skipped.
2975 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
# Two capture groups: (1) uploader slug, (2) cloudcast slug.
2976 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2977 IE_NAME = u'mixcloud'
def __init__(self, downloader=None):
    """Create the extractor; *downloader* is forwarded to the base class."""
    # Delegates straight to InfoExtractor.__init__ (single inheritance).
    super(MixcloudIE, self).__init__(downloader)
def report_download_json(self, file_id):
    """Tell the user the metadata JSON is being fetched (*file_id* unused)."""
    notice = u'[%s] Downloading json' % self.IE_NAME
    self._downloader.to_screen(notice)
def report_extraction(self, file_id):
    """Announce that metadata extraction for *file_id* has begun."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(progress)
# Pick the URL list for *fmt* from the parsed audio_formats JSON, choosing
# the highest bitrate when none (or 'best') is requested.
# NOTE(review): the enclosing "try:" and the final return are missing from
# this dump.
2990 def get_urls(self, jsonData, fmt, bitrate='best'):
2991 """Get urls from 'audio_formats' section in json"""
2994 bitrate_list = jsonData[fmt]
2995 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2996 bitrate = max(bitrate_list) # select highest
2998 url_list = jsonData[fmt][bitrate]
# If jsonData[fmt] is not indexable by bitrate, treat it as a flat URL list.
2999 except TypeError: # we have no bitrate info.
3000 url_list = jsonData[fmt]
# Probe each candidate URL with a HEAD-less GET; the first one that opens
# without a network error is considered active.
# NOTE(review): the inner "try:" and both return statements are elided here.
3003 def check_urls(self, url_list):
3004 """Returns 1st active url from list"""
3005 for url in url_list:
3007 compat_urllib_request.urlopen(url)
3009 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Dump every format/bitrate pair to stdout for --list-formats.
# NOTE(review): the "try:" opening the TypeError handler is missing from
# this dump.
3014 def _print_formats(self, formats):
3015 print('Available formats:')
3016 for fmt in formats.keys():
3017 for b in formats[fmt]:
3019 ext = formats[fmt][b][0]
3020 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
# Flat URL list (no per-bitrate dict): show '??' for the bitrate column.
3021 except TypeError: # we have no bitrate info
3022 ext = formats[fmt][0]
3023 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
# Resolve a cloudcast page to a direct file URL via the /api/1/cloudcast
# JSON endpoint, honoring --format / --list-formats.
# NOTE(review): guard lines ("if mobj is None:", "try:", returns, the final
# info-dict opener) are elided in this dump; comments cover visible code only.
3026 def _real_extract(self, url):
3027 mobj = re.match(self._VALID_URL, url)
3029 self._downloader.report_error(u'invalid URL: %s' % url)
3031 # extract uploader & filename from url
3032 uploader = mobj.group(1).decode('utf-8')
3033 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3035 # construct API request
3036 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3037 # retrieve .json file with links to files
3038 request = compat_urllib_request.Request(file_url)
3040 self.report_download_json(file_url)
3041 jsonData = compat_urllib_request.urlopen(request).read()
3042 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3043 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3047 json_data = json.loads(jsonData)
3048 player_url = json_data['player_swf_url']
3049 formats = dict(json_data['audio_formats'])
3051 req_format = self._downloader.params.get('format', None)
3054 if self._downloader.params.get('listformats', None):
3055 self._print_formats(formats)
# 'best' (or no preference): first format whose URL list yields a live URL.
3058 if req_format is None or req_format == 'best':
3059 for format_param in formats.keys():
3060 url_list = self.get_urls(formats, format_param)
3062 file_url = self.check_urls(url_list)
3063 if file_url is not None:
3066 if req_format not in formats:
3067 self._downloader.report_error(u'format is not available')
3070 url_list = self.get_urls(formats, req_format)
3071 file_url = self.check_urls(url_list)
3072 format_param = req_format
3075 'id': file_id.decode('utf-8'),
3076 'url': file_url.decode('utf-8'),
3077 'uploader': uploader.decode('utf-8'),
3078 'upload_date': None,
3079 'title': json_data['name'],
3080 'ext': file_url.split('.')[-1].decode('utf-8'),
3081 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3082 'thumbnail': json_data['thumbnail_url'],
3083 'description': json_data['description'],
3084 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open Classroom: handles single videos, course
# pages, and the site root (each recursing into the next level down).
3087 class StanfordOpenClassroomIE(InfoExtractor):
3088 """Information extractor for Stanford's Open ClassRoom"""
# Named groups: 'course' and optional 'video' select the page type.
3090 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3091 IE_NAME = u'stanfordoc'
def report_download_webpage(self, objid):
    """Announce that the page for *objid* is being downloaded."""
    progress = u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)
    self._downloader.to_screen(progress)
def report_extraction(self, video_id):
    """Announce that extraction of *video_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(progress)
# Three-way dispatch on the URL: specific video -> fetch its XML metadata;
# course page -> recurse over every VideoPage link; site root -> recurse
# over every CoursePage link, accumulating results via self.extract().
# NOTE(review): dict openers, "try:" lines and several guards are elided in
# this dump; comments cover visible code only.
3101 def _real_extract(self, url):
3102 mobj = re.match(self._VALID_URL, url)
3104 raise ExtractorError(u'Invalid URL: %s' % url)
3106 if mobj.group('course') and mobj.group('video'): # A specific video
3107 course = mobj.group('course')
3108 video = mobj.group('video')
3110 'id': course + '_' + video,
3112 'upload_date': None,
3115 self.report_extraction(info['id'])
3116 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3117 xmlUrl = baseUrl + video + '.xml'
3119 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3120 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3121 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Title and file name come from the per-video XML descriptor.
3123 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3125 info['title'] = mdoc.findall('./title')[0].text
3126 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3128 self._downloader.report_error(u'Invalid metadata XML file')
3130 info['ext'] = info['url'].rpartition('.')[2]
3132 elif mobj.group('course'): # A course page
3133 course = mobj.group('course')
3138 'upload_date': None,
3141 coursepage = self._download_webpage(url, info['id'],
3142 note='Downloading course info page',
3143 errnote='Unable to download course info page')
3145 m = re.search('<h1>([^<]+)</h1>', coursepage)
3147 info['title'] = unescapeHTML(m.group(1))
3149 info['title'] = info['id']
3151 m = re.search('<description>([^<]+)</description>', coursepage)
3153 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps first occurrence of each VideoPage link, in page order.
3155 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3158 'type': 'reference',
3159 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3163 for entry in info['list']:
3164 assert entry['type'] == 'reference'
3165 results += self.extract(entry['url'])
3169 'id': 'Stanford OpenClassroom',
3172 'upload_date': None,
3175 self.report_download_webpage(info['id'])
3176 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3178 rootpage = compat_urllib_request.urlopen(rootURL).read()
3179 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3180 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3183 info['title'] = info['id']
3185 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3188 'type': 'reference',
3189 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3194 for entry in info['list']:
3195 assert entry['type'] == 'reference'
3196 results += self.extract(entry['url'])
# Extractor for MTV.com video pages (mediaGen XML playlist).
3199 class MTVIE(InfoExtractor):
3200 """Information extractor for MTV.com"""
# 'proto' group lets schemeless URLs through; 'videoid' is the numeric id.
3202 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
def report_extraction(self, video_id):
    """Announce that extraction of *video_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(progress)
# Scrape mtv_vt/mtv_an/mtvn_uri meta tags, then fetch the mediaGen XML and
# pick the last (highest-quality) rendition.
# NOTE(review): "if mobj is None:"/"try:"/return lines are elided in this
# dump. Also the message u'unable to mtvn_uri' looks like it should read
# "unable to extract mtvn_uri" — cannot change runtime strings here.
3209 def _real_extract(self, url):
3210 mobj = re.match(self._VALID_URL, url)
3212 self._downloader.report_error(u'invalid URL: %s' % url)
# Normalize schemeless URLs before downloading.
3214 if not mobj.group('proto'):
3215 url = 'http://' + url
3216 video_id = mobj.group('videoid')
3218 webpage = self._download_webpage(url, video_id)
3220 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3222 self._downloader.report_error(u'unable to extract song name')
3224 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3225 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3227 self._downloader.report_error(u'unable to extract performer')
3229 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3230 video_title = performer + ' - ' + song_name
3232 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3234 self._downloader.report_error(u'unable to mtvn_uri')
3236 mtvn_uri = mobj.group(1)
3238 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3240 self._downloader.report_error(u'unable to extract content id')
3242 content_id = mobj.group(1)
3244 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3245 self.report_extraction(video_id)
3246 request = compat_urllib_request.Request(videogen_url)
3248 metadataXml = compat_urllib_request.urlopen(request).read()
3249 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3250 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3253 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3254 renditions = mdoc.findall('.//rendition')
3256 # For now, always pick the highest quality.
3257 rendition = renditions[-1]
# Format label built from MIME subtype + WxH + bitrate attributes.
3260 _,_,ext = rendition.attrib['type'].partition('/')
3261 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3262 video_url = rendition.find('./src').text
3264 self._downloader.trouble('Invalid rendition field.')
3270 'uploader': performer,
3271 'upload_date': None,
3272 'title': video_title,
# Extractor for v.youku.com; downloads segmented FLV streams whose file ids
# are descrambled with a seed-based substitution (see _get_file_id).
3280 class YoukuIE(InfoExtractor):
3281 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
def report_download_webpage(self, file_id):
    """Announce that the page for *file_id* is being downloaded."""
    progress = u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id)
    self._downloader.to_screen(progress)
def report_extraction(self, file_id):
    """Announce that extraction of *file_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(progress)
# Session id: millisecond timestamp concatenated with two random numbers.
# NOTE(review): the "def _gen_sid(self):" line itself is elided in this dump.
3292 nowTime = int(time.time() * 1000)
3293 random1 = random.randint(1000,1998)
3294 random2 = random.randint(1000,9999)
3296 return "%d%d%d" %(nowTime,random1,random2)
# Produce the seed-shuffled alphabet used to decode scrambled file ids:
# repeatedly advance a 16-bit LCG and pull characters out of the source set.
# NOTE(review): the "mixed = []" initializer and the final return are elided
# in this dump.
3298 def _get_file_ID_mix_string(self, seed):
3300 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3302 for i in range(len(source)):
# LCG step: seed' = (seed*211 + 30031) mod 65536, then scaled into an index.
3303 seed = (seed * 211 + 30031 ) % 65536
3304 index = math.floor(seed / 65536 * len(source) )
3305 mixed.append(source[int(index)])
3306 source.remove(source[int(index)])
3307 #return ''.join(mixed)
# Decode a '*'-separated scrambled file id by indexing into the shuffled
# alphabet from _get_file_ID_mix_string.
# NOTE(review): the "realId = []" / "for ch in ids:" loop header lines are
# elided in this dump.
3310 def _get_file_id(self, fileId, seed):
3311 mixed = self._get_file_ID_mix_string(seed)
3312 ids = fileId.split('*')
3316 realId.append(mixed[int(ch)])
3317 return ''.join(realId)
# Fetch the getPlayList JSON, select a stream format, descramble the file
# id, and emit one info dict per FLV segment.
# NOTE(review): several guard/"try:"/assignment lines (e.g. format selection
# branches, the info-dict opener, the final return) are elided in this dump.
3319 def _real_extract(self, url):
3320 mobj = re.match(self._VALID_URL, url)
3322 self._downloader.report_error(u'invalid URL: %s' % url)
3324 video_id = mobj.group('ID')
3326 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3328 request = compat_urllib_request.Request(info_url, None, std_headers)
3330 self.report_download_webpage(video_id)
3331 jsondata = compat_urllib_request.urlopen(request).read()
3332 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3333 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3336 self.report_extraction(video_id)
3338 jsonstr = jsondata.decode('utf-8')
3339 config = json.loads(jsonstr)
3341 video_title = config['data'][0]['title']
3342 seed = config['data'][0]['seed']
3344 format = self._downloader.params.get('format', None)
3345 supported_format = list(config['data'][0]['streamfileids'].keys())
3347 if format is None or format == 'best':
3348 if 'hd2' in supported_format:
3353 elif format == 'worst':
3361 fileid = config['data'][0]['streamfileids'][format]
# One decryption key per segment.
3362 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3363 except (UnicodeDecodeError, ValueError, KeyError):
3364 self._downloader.report_error(u'unable to extract info section')
3368 sid = self._gen_sid()
3369 fileid = self._get_file_id(fileid, seed)
3371 #column 8,9 of fileid represent the segment number
3372 #fileid[7:9] should be changed
3373 for index, key in enumerate(keys):
3375 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3376 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3379 'id': '%s_part%02d' % (video_id, index),
3380 'url': download_url,
3382 'upload_date': None,
3383 'title': video_title,
3386 files_info.append(info)
# Extractor for video.xnxx.com; the three *_RE patterns below are applied
# to the raw page source in _real_extract.
3391 class XNXXIE(InfoExtractor):
3392 """Information extractor for xnxx.com"""
3394 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3396 VIDEO_URL_RE = r'flv_url=(.*?)&'
3397 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3398 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
def report_webpage(self, video_id):
    """Announce that the page for *video_id* is being downloaded."""
    progress = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(progress)
def report_extraction(self, video_id):
    """Announce that extraction of *video_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(progress)
# Pull flv_url, title and thumbnail out of the page with the class-level
# regexes and return a single info dict.
# NOTE(review): "if ... is None:"/"try:"/return lines and the info-dict
# opener are elided in this dump.
3408 def _real_extract(self, url):
3409 mobj = re.match(self._VALID_URL, url)
3411 self._downloader.report_error(u'invalid URL: %s' % url)
3413 video_id = mobj.group(1)
3415 self.report_webpage(video_id)
3417 # Get webpage content
3419 webpage_bytes = compat_urllib_request.urlopen(url).read()
3420 webpage = webpage_bytes.decode('utf-8')
3421 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3422 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# flv_url is percent-encoded in the page source.
3425 result = re.search(self.VIDEO_URL_RE, webpage)
3427 self._downloader.report_error(u'unable to extract video url')
3429 video_url = compat_urllib_parse.unquote(result.group(1))
3431 result = re.search(self.VIDEO_TITLE_RE, webpage)
3433 self._downloader.report_error(u'unable to extract video title')
3435 video_title = result.group(1)
3437 result = re.search(self.VIDEO_THUMB_RE, webpage)
3439 self._downloader.report_error(u'unable to extract video thumbnail')
3441 video_thumbnail = result.group(1)
3447 'upload_date': None,
3448 'title': video_title,
3450 'thumbnail': video_thumbnail,
3451 'description': None,
# Extractor for videos embedded in Google+ posts (two-step: post page, then
# the photos/video page it references).
3455 class GooglePlusIE(InfoExtractor):
3456 """Information extractor for plus.google.com."""
# Group 1 is the post id at the end of a .../posts/<id> URL.
3458 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3459 IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Create the extractor; *downloader* is forwarded to the base class."""
    # Delegates straight to InfoExtractor.__init__ (single inheritance).
    super(GooglePlusIE, self).__init__(downloader)
def report_extract_entry(self, url):
    """Announce that the post entry at *url* is being downloaded."""
    notice = u'[plus.google] Downloading entry: %s' % url
    self._downloader.to_screen(notice)
def report_date(self, upload_date):
    """Announce the upload date found for the entry."""
    notice = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(notice)
def report_uploader(self, uploader):
    """Announce the uploader name found for the entry."""
    notice = u'[plus.google] Uploader: %s' % uploader
    self._downloader.to_screen(notice)
def report_title(self, video_title):
    """Announce the title found for the entry."""
    notice = u'[plus.google] Title: %s' % video_title
    self._downloader.to_screen(notice)
def report_extract_vid_page(self, video_page):
    """Announce that the secondary video page is being processed."""
    notice = u'[plus.google] Extracting video page: %s' % video_page
    self._downloader.to_screen(notice)
# Two-step extraction: scrape the post page for date/uploader/title and the
# photos/video page URL, then scrape that page for redirector.googlevideo
# links and keep the highest-resolution one.
# NOTE(review): "if mobj is None:"/"try:" lines and the final info-dict
# opener are elided in this dump; comments cover visible code only.
3484 def _real_extract(self, url):
3485 # Extract id from URL
3486 mobj = re.match(self._VALID_URL, url)
3488 self._downloader.report_error(u'Invalid URL: %s' % url)
3491 post_url = mobj.group(0)
3492 video_id = mobj.group(1)
3494 video_extension = 'flv'
3496 # Step 1, Retrieve post webpage to extract further information
3497 self.report_extract_entry(post_url)
3498 request = compat_urllib_request.Request(post_url)
3500 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3501 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3502 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3505 # Extract update date
3507 pattern = 'title="Timestamp">(.*?)</a>'
3508 mobj = re.search(pattern, webpage)
3510 upload_date = mobj.group(1)
3511 # Convert timestring to a format suitable for filename
3512 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3513 upload_date = upload_date.strftime('%Y%m%d')
3514 self.report_date(upload_date)
3518 pattern = r'rel\="author".*?>(.*?)</a>'
3519 mobj = re.search(pattern, webpage)
3521 uploader = mobj.group(1)
3522 self.report_uploader(uploader)
3525 # Get the first line for title
3527 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3528 mobj = re.search(pattern, webpage)
3530 video_title = mobj.group(1)
3531 self.report_title(video_title)
3533 # Step 2, Stimulate clicking the image box to launch video
3534 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3535 mobj = re.search(pattern, webpage)
3537 self._downloader.report_error(u'unable to extract video page URL')
3539 video_page = mobj.group(1)
3540 request = compat_urllib_request.Request(video_page)
3542 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3543 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3544 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3546 self.report_extract_vid_page(video_page)
3549 # Extract video links on video page
3550 """Extract video links of all sizes"""
3551 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3552 mobj = re.findall(pattern, webpage)
3554 self._downloader.report_error(u'unable to extract video links')
3556 # Sort in resolution
3557 links = sorted(mobj)
3559 # Choose the lowest of the sort, i.e. highest resolution
3560 video_url = links[-1]
3561 # Only get the url. The resolution part in the tuple has no use anymore
3562 video_url = video_url[-1]
3563 # Treat escaped \u0026 style hex
3565 video_url = video_url.decode("unicode_escape")
3566 except AttributeError: # Python 3
3567 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3573 'uploader': uploader,
3574 'upload_date': upload_date,
3575 'title': video_title,
3576 'ext': video_extension,
# Extractor for nba.com video pages; the direct MP4 URL is derived from the
# URL path rather than scraped.
3579 class NBAIE(InfoExtractor):
3580 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
# Build the CDN URL from the path and scrape title/date/description via the
# local _findProp helper.
# NOTE(review): guard/return lines and the info-dict opener are elided in
# this dump. The visible key 'uploader_date' looks like a typo for
# 'upload_date' — cannot change runtime data here.
3583 def _real_extract(self, url):
3584 mobj = re.match(self._VALID_URL, url)
3586 self._downloader.report_error(u'invalid URL: %s' % url)
3589 video_id = mobj.group(1)
3590 if video_id.endswith('/index.html'):
3591 video_id = video_id[:-len('/index.html')]
3593 webpage = self._download_webpage(url, video_id)
3595 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, unescaped, else *default*.
3596 def _findProp(rexp, default=None):
3597 m = re.search(rexp, webpage)
3599 return unescapeHTML(m.group(1))
3603 shortened_video_id = video_id.rpartition('/')[2]
3604 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3606 'id': shortened_video_id,
3610 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3611 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv channels and archived broadcasts,
# paging through the archive API in _JUSTIN_PAGE_LIMIT-sized chunks.
3615 class JustinTVIE(InfoExtractor):
3616 """Information extractor for justin.tv and twitch.tv"""
3617 # TODO: One broadcast may be split into multiple videos. The key
3618 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3619 # starts at 1 and increases. Can we treat all parts as one video?
# Group 1: channel name; optional group 2: archive/broadcast id after /b/.
3621 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3622 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3623 _JUSTIN_PAGE_LIMIT = 100
3624 IE_NAME = u'justin.tv'
def report_extraction(self, file_id):
    """Announce that extraction of *file_id* has started."""
    progress = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(progress)
def report_download_page(self, channel, offset):
    """Announce the fetch of one page of *channel*'s video list."""
    upper_bound = offset + self._JUSTIN_PAGE_LIMIT
    fields = (self.IE_NAME, channel, offset, upper_bound)
    self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' % fields)
3635 # Return count of items, list of *valid* items
# Fetch one API page of clips and convert each into an info dict.
# NOTE(review): the "try:", the info-list initializer, the per-clip dict
# opener and some guards are elided in this dump.
3636 def _parse_page(self, url):
3638 urlh = compat_urllib_request.urlopen(url)
3639 webpage_bytes = urlh.read()
3640 webpage = webpage_bytes.decode('utf-8', 'ignore')
3641 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3642 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# A non-list response is an API error payload with an 'error' field.
3645 response = json.loads(webpage)
3646 if type(response) != list:
3647 error_text = response.get('error', 'unknown error')
3648 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3651 for clip in response:
3652 video_url = clip['video_file_url']
3654 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD...' -> 'YYYYMMDD'.
3655 video_date = re.sub('-', '', clip['start_time'][:10])
3656 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3657 video_id = clip['id']
3658 video_title = clip.get('title', video_id)
3662 'title': video_title,
3663 'uploader': clip.get('channel_name', video_uploader_id),
3664 'uploader_id': video_uploader_id,
3665 'upload_date': video_date,
3666 'ext': video_extension,
# Returned count is the raw page size, used by the caller to stop paging.
3668 return (len(response), info)
# Choose the channel-archives or single-broadcast endpoint, then page
# through results until a short page signals the end.
# NOTE(review): guards, the paged/offset initializers, the loop header and
# the final return are elided in this dump.
3670 def _real_extract(self, url):
3671 mobj = re.match(self._VALID_URL, url)
3673 self._downloader.report_error(u'invalid URL: %s' % url)
3676 api = 'http://api.justin.tv'
# lastindex == 1 means only the channel group matched (no /b/<id> part).
3677 video_id = mobj.group(mobj.lastindex)
3679 if mobj.lastindex == 1:
3681 api += '/channel/archives/%s.json'
3683 api += '/broadcast/by_archive/%s.json'
3684 api = api % (video_id,)
3686 self.report_extraction(video_id)
3690 limit = self._JUSTIN_PAGE_LIMIT
3693 self.report_download_page(video_id, offset)
3694 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3695 page_count, page_info = self._parse_page(page_url)
3696 info.extend(page_info)
# A partial page (or unpaged request) ends the crawl.
3697 if not paged or page_count != limit:
# Extractor for funnyordie.com video pages (hex id in the URL path).
3702 class FunnyOrDieIE(InfoExtractor):
3703 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
# Scrape the <source> URL, the title (player h1 with <title> fallback) and
# the og:description from the page.
# NOTE(review): "if ... is None:" guards, the desc default and the final
# info-dict opener/return are elided in this dump.
3705 def _real_extract(self, url):
3706 mobj = re.match(self._VALID_URL, url)
3708 self._downloader.report_error(u'invalid URL: %s' % url)
3711 video_id = mobj.group('id')
3712 webpage = self._download_webpage(url, video_id)
3714 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3716 self._downloader.report_error(u'unable to find video information')
3717 video_url = unescapeHTML(m.group('url'))
3719 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback: the document <title> when the player heading is absent.
3721 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3723 self._downloader.trouble(u'Cannot find video title')
3724 title = clean_html(m.group('title'))
3726 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3728 desc = unescapeHTML(m.group('desc'))
3737 'description': desc,
3741 class SteamIE(InfoExtractor):
# Verbose-mode pattern (compiled with re.VERBOSE in suitable/_real_extract);
# the 'gameID' group referenced by _real_extract sits in an elided part.
3742 _VALID_URL = r"""http://store.steampowered.com/
3743 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3745 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
# NOTE(review): the @classmethod decorator line is elided in this dump.
3749 def suitable(cls, url):
3750 """Receives a URL and returns True if suitable for this IE."""
3751 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Walk the game's /video/ page, zipping the movie-definition matches with
# the title and thumbnail matches to emit one entry per clip.
# NOTE(review): the videos-list initializer, per-clip dict opener and final
# return are elided in this dump; 'gameID' comes from an elided part of
# _VALID_URL.
3753 def _real_extract(self, url):
3754 m = re.match(self._VALID_URL, url, re.VERBOSE)
3755 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3756 gameID = m.group('gameID')
3757 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3758 webpage = self._download_webpage(videourl, gameID)
3759 mweb = re.finditer(urlRE, webpage)
3760 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3761 titles = re.finditer(namesRE, webpage)
3762 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3763 thumbs = re.finditer(thumbsRE, webpage)
# Assumes the three iterators stay aligned clip-for-clip — TODO confirm.
3765 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3766 video_id = vid.group('videoID')
3767 title = vtitle.group('videoName')
3768 video_url = vid.group('videoURL')
3769 video_thumb = thumb.group('thumbnail')
3771 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3776 'title': unescapeHTML(title),
3777 'thumbnail': video_thumb
# Extractor for recorded Ustream videos; the file URL is derived from the
# numeric id, metadata is scraped from the page.
3782 class UstreamIE(InfoExtractor):
3783 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3784 IE_NAME = u'ustream'
# Build the tcdn URL from the id and scrape title/uploader from the page.
# NOTE(review): the info-dict opener and return are elided in this dump;
# also no None-checks guard the re.search results in the visible code.
3786 def _real_extract(self, url):
3787 m = re.match(self._VALID_URL, url)
3788 video_id = m.group('videoID')
3789 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3790 webpage = self._download_webpage(url, video_id)
3791 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3792 title = m.group('title')
3793 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3794 uploader = m.group('uploader')
3800 'uploader': uploader
# Extractor for worldstarhiphop.com / worldstarcandy.com video pages.
3804 class WorldStarHipHopIE(InfoExtractor):
3805 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3806 IE_NAME = u'WorldStarHipHop'
# Find the hw-videos source URL, then the title (with a candy-page
# fallback) and thumbnail.
# NOTE(review): several branch/assignment lines and the final info-dict
# opener are elided in this dump. The fallback title literally says
# 'World Start Hip Hop' — likely a typo, but it is a runtime string and is
# left untouched here.
3808 def _real_extract(self, url):
3809 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3811 webpage_src = compat_urllib_request.urlopen(url).read()
3812 webpage_src = webpage_src.decode('utf-8')
3814 mobj = re.search(_src_url, webpage_src)
3816 m = re.match(self._VALID_URL, url)
3817 video_id = m.group('id')
3819 if mobj is not None:
3820 video_url = mobj.group()
# Extension branch: the elided lines presumably set ext per mp4/flv.
3821 if 'mp4' in video_url:
3826 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3829 _title = r"""<title>(.*)</title>"""
3831 mobj = re.search(_title, webpage_src)
3833 if mobj is not None:
3834 title = mobj.group(1)
3836 title = 'World Start Hip Hop - %s' % time.ctime()
3838 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3839 mobj = re.search(_thumbnail, webpage_src)
3841 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3842 if mobj is not None:
3843 thumbnail = mobj.group(1)
3845 _title = r"""candytitles.*>(.*)</span>"""
3846 mobj = re.search(_title, webpage_src)
3847 if mobj is not None:
3848 title = mobj.group(1)
3855 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows; metadata comes from a JSON blob
# embedded in an inline <script>.
3860 class RBMARadioIE(InfoExtractor):
3861 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
# Parse the gon.show JSON from the page and assemble the info dict; the
# stream URL is the akamai_url with a fixed 256kbps cbr parameter.
# NOTE(review): "if m is None:"/"try:" lines and the info-dict opener are
# elided in this dump.
3863 def _real_extract(self, url):
3864 m = re.match(self._VALID_URL, url)
3865 video_id = m.group('videoID')
3867 webpage = self._download_webpage(url, video_id)
3868 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3870 raise ExtractorError(u'Cannot find metadata')
3871 json_data = m.group(1)
3874 data = json.loads(json_data)
3875 except ValueError as e:
3876 raise ExtractorError(u'Invalid JSON: ' + str(e))
3878 video_url = data['akamai_url'] + '&cbr=256'
# Extension taken from the URL path, ignoring the query string.
3879 url_parts = compat_urllib_parse_urlparse(video_url)
3880 video_ext = url_parts.path.rpartition('.')[2]
3885 'title': data['title'],
3886 'description': data.get('teaser_text'),
3887 'location': data.get('country_of_origin'),
3888 'uploader': data.get('host', {}).get('name'),
3889 'uploader_id': data.get('host', {}).get('slug'),
3890 'thumbnail': data.get('image', {}).get('large_url_2x'),
3891 'duration': data.get('duration'),
# Extractor for youporn.com watch pages; supports multiple formats plus
# --list-formats via _print_formats.
3896 class YouPornIE(InfoExtractor):
3897 """Information extractor for youporn.com."""
3898 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3900 def _print_formats(self, formats):
3901 """Print all available formats"""
3902 print(u'Available formats:')
3903 print(u'ext\t\tformat')
3904 print(u'---------------------------------')
3905 for format in formats:
3906 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single format entry whose 'format' label equals *req_format*.
# NOTE(review): the "for x in formats:" loop header and the return lines
# are elided in this dump.
3908 def _specific(self, req_format, formats):
3910 if(x["format"]==req_format):
# Scrape title/date/uploader, collect every download link, derive
# size/bitrate from the link path, then return the subset of formats the
# user asked for.
# NOTE(review): many guard/"else:"/dict-opener/return lines are elided in
# this dump; comments cover the visible code only.
3914 def _real_extract(self, url):
3915 mobj = re.match(self._VALID_URL, url)
3917 self._downloader.report_error(u'invalid URL: %s' % url)
3920 video_id = mobj.group('videoid')
# Age gate is bypassed by presetting the verification cookie.
3922 req = compat_urllib_request.Request(url)
3923 req.add_header('Cookie', 'age_verified=1')
3924 webpage = self._download_webpage(req, video_id)
3926 # Get the video title
3927 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3929 raise ExtractorError(u'Unable to extract video title')
3930 video_title = result.group('title').strip()
3932 # Get the video date
3933 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3935 self._downloader.report_warning(u'unable to extract video date')
3938 upload_date = result.group('date').strip()
3940 # Get the video uploader
3941 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3943 self._downloader.report_warning(u'unable to extract uploader')
3944 video_uploader = None
3946 video_uploader = result.group('uploader').strip()
3947 video_uploader = clean_html( video_uploader )
3949 # Get all of the formats available
3950 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3951 result = re.search(DOWNLOAD_LIST_RE, webpage)
3953 raise ExtractorError(u'Unable to extract download list')
3954 download_list_html = result.group('download_list').strip()
3956 # Get all of the links from the page
3957 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3958 links = re.findall(LINK_RE, download_list_html)
3959 if(len(links) == 0):
3960 raise ExtractorError(u'ERROR: no known formats available for video')
3962 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3967 # A link looks like this:
3968 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3969 # A path looks like this:
3970 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3971 video_url = unescapeHTML( link )
3972 path = compat_urllib_parse_urlparse( video_url ).path
3973 extension = os.path.splitext( path )[1][1:]
# Path segment like '480p_370k_...' yields the size/bitrate pair.
3974 format = path.split('/')[4].split('_')[:2]
3977 format = "-".join( format )
3978 title = u'%s-%s-%s' % (video_title, size, bitrate)
3983 'uploader': video_uploader,
3984 'upload_date': upload_date,
3989 'description': None,
3993 if self._downloader.params.get('listformats', None):
3994 self._print_formats(formats)
3997 req_format = self._downloader.params.get('format', None)
3998 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
4000 if req_format is None or req_format == 'best':
4002 elif req_format == 'worst':
4003 return [formats[-1]]
4004 elif req_format in ('-1', 'all'):
4007 format = self._specific( req_format, formats )
4009 self._downloader.report_error(u'requested format not available')
# Extractor for pornotube.com; id and title both come from the URL itself.
4015 class PornotubeIE(InfoExtractor):
4016 """Information extractor for pornotube.com."""
4017 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
# Scrape the FLV url and upload date from the page; id/title come from the
# URL groups.
# NOTE(review): guard/return lines and part of the info dict are elided in
# this dump. The message at 4043 says 'video title' although the failing
# regex is the upload-date one — runtime string, left untouched.
4019 def _real_extract(self, url):
4020 mobj = re.match(self._VALID_URL, url)
4022 self._downloader.report_error(u'invalid URL: %s' % url)
4025 video_id = mobj.group('videoid')
4026 video_title = mobj.group('title')
4028 # Get webpage content
4029 webpage = self._download_webpage(url, video_id)
4032 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4033 result = re.search(VIDEO_URL_RE, webpage)
4035 self._downloader.report_error(u'unable to extract video url')
4037 video_url = compat_urllib_parse.unquote(result.group('url'))
4039 #Get the uploaded date
4040 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4041 result = re.search(VIDEO_UPLOADED_RE, webpage)
4043 self._downloader.report_error(u'unable to extract video title')
4045 upload_date = result.group('date')
4047 info = {'id': video_id,
4050 'upload_date': upload_date,
4051 'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Extract the flv stream behind the youjizz embed page for *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The landing page only carries the title and a link to the embed
        # page, which is where the actual media URL lives.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        # The embed page hands the file URL to the flash player via
        # so.addVariable("file", ...).
        webpage = self._download_webpage(embed_page_url, video_id)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per song)."""

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON literal
        # assigned to PAGE.mix.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # The play/next endpoints require a client-chosen session token.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        res = []
        # Walk the mix track by track until the API flags the last one.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the media URL and the thumbnail can be derived directly
        # from the video id on the CDN; no page scraping needed for them.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        # The page is still fetched for title and uploader metadata.
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so the base-class suitable()
        # (which matches without re.VERBOSE) cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # A single talk yields a one-element result list.
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)
        info = []
        # Pair each talk <li> entry with its title link, in document order.
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url, video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML service)."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # Fetch the per-video XML metadata document.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # url_flv and title are mandatory; abort with an error if missing.
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text

        # format_id, description and imagePreview are optional fields.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # fall back to the file extension when no explicit format is given
            format = extension
        else:
            format = format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The media variants live in a per-video XML document on the CDN.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Take the last entry of the document — presumably the highest
        # quality variant; TODO(review) confirm against the XML schema.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # CONSISTENCY FIX: was self._downloader.trouble(u'ERROR: ...');
            # every other IE in this file uses report_error.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The player config embeds the direct media URL as file: "...".
        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: the old trouble() call did not return, so execution
            # fell through and crashed with AttributeError on m.group().
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional fields; missing matches
        # simply yield None rather than an error.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1)) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # prefer the numeric documentId query parameter when present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # BUG FIX: was `assert '"fsk"' in html` before reporting — assert
            # is stripped under python -O and crashes with a bare
            # AssertionError on any unexpected page; report explicitly instead.
            if '"fsk"' in html:
                self._downloader.report_error(u'this video is only available after 8:00 pm')
            else:
                self._downloader.report_error(u'unable to extract media streams')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            # internal invariants of the page format, kept as asserts
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4423 def gen_extractors():
4424 """ Return a list of an instance of every supported extractor.
4425 The order does matter; the first extractor matched is the one handling the URL.
4428 YoutubePlaylistIE(),
4453 StanfordOpenClassroomIE(),
4463 WorldStarHipHopIE(),