2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
def suitable(cls, url):
    """Tell whether this extractor can handle *url*.

    The URL is tested against the class-level _VALID_URL regexp.
    """
    return bool(re.match(cls._VALID_URL, url))
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    The (possibly None) downloader is stored on self._downloader; the
    report_* helpers use it for screen output and the extraction methods
    read configuration from its ``params`` dict.
    """
    self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 note = u'Downloading video webpage'
118 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
120 return compat_urllib_request.urlopen(url_or_request)
121 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
123 errnote = u'Unable to download webpage'
124 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
126 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
127 """ Returns the data of the page as a string """
128 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
129 content_type = urlh.headers.get('Content-Type', '')
130 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
132 encoding = m.group(1)
135 webpage_bytes = urlh.read()
136 return webpage_bytes.decode(encoding, 'replace')
138 #Methods for following #608
139 #They set the correct value of the '_type' key
140 def video_result(self, video_info):
141 """Returns a video"""
142 video_info['_type'] = 'video'
144 def url_result(self, url, ie=None):
145 """Returns a url that points to a page that should be processed"""
146 #TODO: ie should be the class used for getting the info
147 video_info = {'_type': 'url',
150 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
151 """Returns a playlist"""
152 video_info = {'_type': 'playlist',
155 video_info['id'] = playlist_id
157 video_info['title'] = playlist_title
161 class YoutubeIE(InfoExtractor):
162 """Information extractor for youtube.com."""
166 (?:https?://)? # http(s):// (optional)
167 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
168 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
169 (?:.*?\#/)? # handle anchor (#/) redirect urls
170 (?: # the various things that can precede the ID:
171 (?:(?:v|embed|e)/) # v/ or embed/ or e/
172 |(?: # or the v= param in all its forms
173 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
174 (?:\?|\#!?) # the params delimiter ? or # or #!
175 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
178 )? # optional -> youtube.com/xxxx is OK
179 )? # all until now is optional -> you can pass the naked ID
180 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
181 (?(1).+)? # if we found the ID, everything can follow
183 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
184 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
185 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
186 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
187 _NETRC_MACHINE = 'youtube'
188 # Listed in order of quality
189 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
190 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
191 _video_extensions = {
197 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
203 _video_dimensions = {
def suitable(cls, url):
    """Return True when *url* belongs to this IE.

    Playlist URLs would also match _VALID_URL, so they are explicitly
    handed over to YoutubePlaylistIE first.
    """
    is_playlist = YoutubePlaylistIE.suitable(url)
    return (not is_playlist) and re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Tell the user that the request language is being set."""
    msg = u'[youtube] Setting language'
    self._downloader.to_screen(msg)
def report_login(self):
    """Tell the user that a login attempt is starting."""
    msg = u'[youtube] Logging in'
    self._downloader.to_screen(msg)
def report_age_confirmation(self):
    """Tell the user that the age gate is being confirmed."""
    msg = u'[youtube] Confirming age'
    self._downloader.to_screen(msg)
def report_video_webpage_download(self, video_id):
    """Tell the user that the watch page for *video_id* is being fetched."""
    msg = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(msg)
def report_video_info_webpage_download(self, video_id):
    """Tell the user that get_video_info is being fetched for *video_id*."""
    msg = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(msg)
def report_video_subtitles_download(self, video_id):
    """Report the check for available subtitle tracks.

    The original docstring ("download video info webpage") was copy-pasted
    from a neighboring method and did not match the message emitted here.
    """
    self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report the download of one subtitle track (language + format).

    The original docstring ("download video info webpage") was copy-pasted
    from a neighboring method and did not match the message emitted here.
    """
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """List the subtitle languages available for *video_id*.

    *sub_lang_list* is a dict keyed by language code.
    """
    langs = ",".join(sub_lang_list.keys())
    self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, langs))
def report_information_extraction(self, video_id):
    """Tell the user that metadata extraction for *video_id* has begun."""
    msg = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(msg)
def report_unavailable_format(self, video_id, format):
    """Report that the requested *format* is not available for *video_id*.

    The original docstring ("Report extracted video URL") described a
    different event than the message this method actually prints.
    """
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Tell the user that this download will go over RTMP."""
    msg = u'[youtube] RTMP download detected'
    self._downloader.to_screen(msg)
272 def _get_available_subtitles(self, video_id):
273 self.report_video_subtitles_download(video_id)
274 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
276 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
277 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
278 return (u'unable to download video subtitles: %s' % compat_str(err), None)
279 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
280 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
281 if not sub_lang_list:
282 return (u'video doesn\'t have subtitles', None)
285 def _list_available_subtitles(self, video_id):
286 sub_lang_list = self._get_available_subtitles(video_id)
287 self.report_video_subtitles_available(video_id, sub_lang_list)
289 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
292 (error_message, sub_lang, sub)
294 self.report_video_subtitles_request(video_id, sub_lang, format)
295 params = compat_urllib_parse.urlencode({
301 url = 'http://www.youtube.com/api/timedtext?' + params
303 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
305 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
307 return (u'Did not fetch video subtitles', None, None)
308 return (None, sub_lang, sub)
310 def _extract_subtitle(self, video_id):
312 Return a list with a tuple:
313 [(error_message, sub_lang, sub)]
315 sub_lang_list = self._get_available_subtitles(video_id)
316 sub_format = self._downloader.params.get('subtitlesformat')
317 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
318 return [(sub_lang_list[0], None, None)]
319 if self._downloader.params.get('subtitleslang', False):
320 sub_lang = self._downloader.params.get('subtitleslang')
321 elif 'en' in sub_lang_list:
324 sub_lang = list(sub_lang_list.keys())[0]
325 if not sub_lang in sub_lang_list:
326 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
328 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
331 def _extract_all_subtitles(self, video_id):
332 sub_lang_list = self._get_available_subtitles(video_id)
333 sub_format = self._downloader.params.get('subtitlesformat')
334 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
335 return [(sub_lang_list[0], None, None)]
337 for sub_lang in sub_lang_list:
338 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
339 subtitles.append(subtitle)
342 def _print_formats(self, formats):
343 print('Available formats:')
345 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
347 def _real_initialize(self):
348 if self._downloader is None:
353 downloader_params = self._downloader.params
355 # Attempt to use provided username and password or .netrc data
356 if downloader_params.get('username', None) is not None:
357 username = downloader_params['username']
358 password = downloader_params['password']
359 elif downloader_params.get('usenetrc', False):
361 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
366 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
367 except (IOError, netrc.NetrcParseError) as err:
368 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
372 request = compat_urllib_request.Request(self._LANG_URL)
375 compat_urllib_request.urlopen(request).read()
376 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
377 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
380 # No authentication to be performed
384 request = compat_urllib_request.Request(self._LOGIN_URL)
386 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
393 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
395 galx = match.group(1)
397 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
403 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
407 u'PersistentCookie': u'yes',
409 u'bgresponse': u'js_disabled',
410 u'checkConnection': u'',
411 u'checkedDomains': u'youtube',
417 u'signIn': u'Sign in',
419 u'service': u'youtube',
423 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
425 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
426 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
427 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
430 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
431 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
432 self._downloader.report_warning(u'unable to log in: bad username or password')
434 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
435 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
441 'action_confirm': 'Confirm',
443 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
445 self.report_age_confirmation()
446 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
447 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
448 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
451 def _extract_id(self, url):
452 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
454 self._downloader.report_error(u'invalid URL: %s' % url)
456 video_id = mobj.group(2)
459 def _real_extract(self, url):
460 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
461 mobj = re.search(self._NEXT_URL_RE, url)
463 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
464 video_id = self._extract_id(url)
467 self.report_video_webpage_download(video_id)
468 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
469 request = compat_urllib_request.Request(url)
471 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
472 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
473 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
476 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
478 # Attempt to extract SWF player URL
479 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
481 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
486 self.report_video_info_webpage_download(video_id)
487 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
488 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
489 % (video_id, el_type))
490 request = compat_urllib_request.Request(video_info_url)
492 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
493 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
494 video_info = compat_parse_qs(video_info_webpage)
495 if 'token' in video_info:
497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
500 if 'token' not in video_info:
501 if 'reason' in video_info:
502 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
504 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
507 # Check for "rental" videos
508 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
509 self._downloader.report_error(u'"rental" videos not supported')
512 # Start extracting information
513 self.report_information_extraction(video_id)
516 if 'author' not in video_info:
517 self._downloader.report_error(u'unable to extract uploader name')
519 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
522 video_uploader_id = None
523 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
525 video_uploader_id = mobj.group(1)
527 self._downloader.report_warning(u'unable to extract uploader nickname')
530 if 'title' not in video_info:
531 self._downloader.report_error(u'unable to extract video title')
533 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
536 if 'thumbnail_url' not in video_info:
537 self._downloader.report_warning(u'unable to extract video thumbnail')
539 else: # don't panic if we can't find it
540 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
544 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
546 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
547 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
548 for expression in format_expressions:
550 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
555 video_description = get_element_by_id("eow-description", video_webpage)
556 if video_description:
557 video_description = clean_html(video_description)
559 video_description = ''
562 video_subtitles = None
564 if self._downloader.params.get('writesubtitles', False):
565 video_subtitles = self._extract_subtitle(video_id)
567 (sub_error, sub_lang, sub) = video_subtitles[0]
569 self._downloader.report_error(sub_error)
571 if self._downloader.params.get('allsubtitles', False):
572 video_subtitles = self._extract_all_subtitles(video_id)
573 for video_subtitle in video_subtitles:
574 (sub_error, sub_lang, sub) = video_subtitle
576 self._downloader.report_error(sub_error)
578 if self._downloader.params.get('listsubtitles', False):
579 sub_lang_list = self._list_available_subtitles(video_id)
582 if 'length_seconds' not in video_info:
583 self._downloader.report_warning(u'unable to extract video duration')
586 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
589 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
591 # Decide which formats to download
592 req_format = self._downloader.params.get('format', None)
594 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
595 self.report_rtmp_download()
596 video_url_list = [(None, video_info['conn'][0])]
597 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
598 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
599 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
600 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
601 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
603 format_limit = self._downloader.params.get('format_limit', None)
604 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
605 if format_limit is not None and format_limit in available_formats:
606 format_list = available_formats[available_formats.index(format_limit):]
608 format_list = available_formats
609 existing_formats = [x for x in format_list if x in url_map]
610 if len(existing_formats) == 0:
611 self._downloader.report_error(u'no known formats available for video')
613 if self._downloader.params.get('listformats', None):
614 self._print_formats(existing_formats)
616 if req_format is None or req_format == 'best':
617 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
618 elif req_format == 'worst':
619 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
620 elif req_format in ('-1', 'all'):
621 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
623 # Specific formats. We pick the first in a slash-delimeted sequence.
624 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
625 req_formats = req_format.split('/')
626 video_url_list = None
627 for rf in req_formats:
629 video_url_list = [(rf, url_map[rf])]
631 if video_url_list is None:
632 self._downloader.report_error(u'requested format not available')
635 self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
639 for format_param, video_real_url in video_url_list:
641 video_extension = self._video_extensions.get(format_param, 'flv')
643 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
644 self._video_dimensions.get(format_param, '???'))
648 'url': video_real_url,
649 'uploader': video_uploader,
650 'uploader_id': video_uploader_id,
651 'upload_date': upload_date,
652 'title': video_title,
653 'ext': video_extension,
654 'format': video_format,
655 'thumbnail': video_thumbnail,
656 'description': video_description,
657 'player_url': player_url,
658 'subtitles': video_subtitles,
659 'duration': video_duration
664 class MetacafeIE(InfoExtractor):
665 """Information Extractor for metacafe.com."""
667 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
668 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
669 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
670 IE_NAME = u'metacafe'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_disclaimer(self):
    """Tell the user that the family-filter disclaimer page is being fetched."""
    msg = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(msg)
def report_age_confirmation(self):
    """Tell the user that the age gate is being confirmed."""
    msg = u'[metacafe] Confirming age'
    self._downloader.to_screen(msg)
def report_download_webpage(self, video_id):
    """Tell the user that the page for *video_id* is being downloaded."""
    msg = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Tell the user that metadata extraction for *video_id* has begun."""
    msg = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
691 def _real_initialize(self):
692 # Retrieve disclaimer
693 request = compat_urllib_request.Request(self._DISCLAIMER)
695 self.report_disclaimer()
696 disclaimer = compat_urllib_request.urlopen(request).read()
697 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
698 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
704 'submit': "Continue - I'm over 18",
706 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
708 self.report_age_confirmation()
709 disclaimer = compat_urllib_request.urlopen(request).read()
710 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
711 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
714 def _real_extract(self, url):
715 # Extract id and simplified title from URL
716 mobj = re.match(self._VALID_URL, url)
718 self._downloader.report_error(u'invalid URL: %s' % url)
721 video_id = mobj.group(1)
723 # Check if video comes from YouTube
724 mobj2 = re.match(r'^yt-(.*)$', video_id)
725 if mobj2 is not None:
726 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
729 # Retrieve video webpage to extract further information
730 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
732 self.report_download_webpage(video_id)
733 webpage = compat_urllib_request.urlopen(request).read()
734 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
735 self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
738 # Extract URL, uploader and title from webpage
739 self.report_extraction(video_id)
740 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
742 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
743 video_extension = mediaURL[-3:]
745 # Extract gdaKey if available
746 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
750 gdaKey = mobj.group(1)
751 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
753 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
755 self._downloader.report_error(u'unable to extract media URL')
757 vardict = compat_parse_qs(mobj.group(1))
758 if 'mediaData' not in vardict:
759 self._downloader.report_error(u'unable to extract media URL')
761 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
763 self._downloader.report_error(u'unable to extract media URL')
765 mediaURL = mobj.group(1).replace('\\/', '/')
766 video_extension = mediaURL[-3:]
767 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
769 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
771 self._downloader.report_error(u'unable to extract title')
773 video_title = mobj.group(1).decode('utf-8')
775 mobj = re.search(r'submitter=(.*?);', webpage)
777 self._downloader.report_error(u'unable to extract uploader nickname')
779 video_uploader = mobj.group(1)
782 'id': video_id.decode('utf-8'),
783 'url': video_url.decode('utf-8'),
784 'uploader': video_uploader.decode('utf-8'),
786 'title': video_title,
787 'ext': video_extension.decode('utf-8'),
791 class DailymotionIE(InfoExtractor):
792 """Information Extractor for Dailymotion"""
794 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
795 IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_extraction(self, video_id):
    """Tell the user that metadata extraction for *video_id* has begun."""
    msg = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
805 def _real_extract(self, url):
806 # Extract id and simplified title from URL
807 mobj = re.match(self._VALID_URL, url)
809 self._downloader.report_error(u'invalid URL: %s' % url)
812 video_id = mobj.group(1).split('_')[0].split('?')[0]
814 video_extension = 'mp4'
816 # Retrieve video webpage to extract further information
817 request = compat_urllib_request.Request(url)
818 request.add_header('Cookie', 'family_filter=off')
819 webpage = self._download_webpage(request, video_id)
821 # Extract URL, uploader and title from webpage
822 self.report_extraction(video_id)
823 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
825 self._downloader.report_error(u'unable to extract media URL')
827 flashvars = compat_urllib_parse.unquote(mobj.group(1))
829 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
832 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
835 self._downloader.report_error(u'unable to extract video URL')
838 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
840 self._downloader.report_error(u'unable to extract video URL')
843 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
845 # TODO: support choosing qualities
847 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
849 self._downloader.report_error(u'unable to extract title')
851 video_title = unescapeHTML(mobj.group('title'))
853 video_uploader = None
854 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
856 # lookin for official user
857 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
858 if mobj_official is None:
859 self._downloader.report_warning(u'unable to extract uploader nickname')
861 video_uploader = mobj_official.group(1)
863 video_uploader = mobj.group(1)
865 video_upload_date = None
866 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
868 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
873 'uploader': video_uploader,
874 'upload_date': video_upload_date,
875 'title': video_title,
876 'ext': video_extension,
880 class PhotobucketIE(InfoExtractor):
881 """Information extractor for photobucket.com."""
883 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
884 IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Tell the user that the page for *video_id* is being downloaded."""
    msg = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Tell the user that metadata extraction for *video_id* has begun."""
    msg = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
897 def _real_extract(self, url):
898 # Extract id from URL
899 mobj = re.match(self._VALID_URL, url)
901 self._downloader.report_error(u'Invalid URL: %s' % url)
904 video_id = mobj.group(1)
906 video_extension = 'flv'
908 # Retrieve video webpage to extract further information
909 request = compat_urllib_request.Request(url)
911 self.report_download_webpage(video_id)
912 webpage = compat_urllib_request.urlopen(request).read()
913 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
914 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
917 # Extract URL, uploader, and title from webpage
918 self.report_extraction(video_id)
919 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
921 self._downloader.report_error(u'unable to extract media URL')
923 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
927 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
929 self._downloader.report_error(u'unable to extract title')
931 video_title = mobj.group(1).decode('utf-8')
933 video_uploader = mobj.group(2).decode('utf-8')
936 'id': video_id.decode('utf-8'),
937 'url': video_url.decode('utf-8'),
938 'uploader': video_uploader,
940 'title': video_title,
941 'ext': video_extension.decode('utf-8'),
945 class YahooIE(InfoExtractor):
946 """Information extractor for video.yahoo.com."""
949 # _VALID_URL matches all Yahoo! Video URLs
950 # _VPAGE_URL matches only the extractable '/watch/' URLs
951 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
952 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
953 IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Tell the user that the page for *video_id* is being downloaded."""
    msg = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Tell the user that metadata extraction for *video_id* has begun."""
    msg = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
966 def _real_extract(self, url, new_video=True):
967 # Extract ID from URL
968 mobj = re.match(self._VALID_URL, url)
970 self._downloader.report_error(u'Invalid URL: %s' % url)
973 video_id = mobj.group(2)
974 video_extension = 'flv'
976 # Rewrite valid but non-extractable URLs as
977 # extractable English language /watch/ URLs
978 if re.match(self._VPAGE_URL, url) is None:
979 request = compat_urllib_request.Request(url)
981 webpage = compat_urllib_request.urlopen(request).read()
982 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
983 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
986 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
988 self._downloader.report_error(u'Unable to extract id field')
990 yahoo_id = mobj.group(1)
992 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
994 self._downloader.report_error(u'Unable to extract vid field')
996 yahoo_vid = mobj.group(1)
998 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
999 return self._real_extract(url, new_video=False)
1001 # Retrieve video webpage to extract further information
1002 request = compat_urllib_request.Request(url)
1004 self.report_download_webpage(video_id)
1005 webpage = compat_urllib_request.urlopen(request).read()
1006 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1007 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1010 # Extract uploader and title from webpage
1011 self.report_extraction(video_id)
1012 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1014 self._downloader.report_error(u'unable to extract video title')
1016 video_title = mobj.group(1).decode('utf-8')
1018 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1020 self._downloader.report_error(u'unable to extract video uploader')
1022 video_uploader = mobj.group(1).decode('utf-8')
1024 # Extract video thumbnail
1025 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1027 self._downloader.report_error(u'unable to extract video thumbnail')
1029 video_thumbnail = mobj.group(1).decode('utf-8')
1031 # Extract video description
1032 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1034 self._downloader.report_error(u'unable to extract video description')
1036 video_description = mobj.group(1).decode('utf-8')
1037 if not video_description:
1038 video_description = 'No description available.'
1040 # Extract video height and width
1041 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1043 self._downloader.report_error(u'unable to extract video height')
1045 yv_video_height = mobj.group(1)
1047 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1049 self._downloader.report_error(u'unable to extract video width')
1051 yv_video_width = mobj.group(1)
1053 # Retrieve video playlist to extract media URL
1054 # I'm not completely sure what all these options are, but we
1055 # seem to need most of them, otherwise the server sends a 401.
1056 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1057 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1058 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1059 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1060 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1062 self.report_download_webpage(video_id)
1063 webpage = compat_urllib_request.urlopen(request).read()
1064 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1065 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1068 # Extract media URL from playlist XML
1069 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1071 self._downloader.report_error(u'Unable to extract media URL')
1073 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1074 video_url = unescapeHTML(video_url)
1077 'id': video_id.decode('utf-8'),
1079 'uploader': video_uploader,
1080 'upload_date': None,
1081 'title': video_title,
1082 'ext': video_extension.decode('utf-8'),
1083 'thumbnail': video_thumbnail.decode('utf-8'),
1084 'description': video_description,
1088 class VimeoIE(InfoExtractor):
1089 """Information extractor for vimeo.com."""
1091 # _VALID_URL matches Vimeo URLs
# Named groups: 'proto' (scheme, may be absent), 'direct_link'
# (play_redirect_hls form), 'id' (numeric clip id).
1092 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1095 def __init__(self, downloader=None):
1096 InfoExtractor.__init__(self, downloader)
1098 def report_download_webpage(self, video_id):
1099 """Report webpage download."""
1100 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1102 def report_extraction(self, video_id):
1103 """Report information extraction."""
1104 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1106 def _real_extract(self, url, new_video=True):
1107 # Extract ID from URL
1108 mobj = re.match(self._VALID_URL, url)
1110 self._downloader.report_error(u'Invalid URL: %s' % url)
1113 video_id = mobj.group('id')
# Normalize the URL: force https and canonicalize direct-link forms.
1114 if not mobj.group('proto'):
1115 url = 'https://' + url
1116 if mobj.group('direct_link'):
1117 url = 'https://vimeo.com/' + video_id
1119 # Retrieve video webpage to extract further information
1120 request = compat_urllib_request.Request(url, None, std_headers)
1122 self.report_download_webpage(video_id)
1123 webpage_bytes = compat_urllib_request.urlopen(request).read()
1124 webpage = webpage_bytes.decode('utf-8')
1125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1126 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1129 # Now we begin extracting as much information as we can from what we
1130 # retrieved. First we extract the information common to all extractors,
1131 # and latter we extract those that are Vimeo specific.
1132 self.report_extraction(video_id)
1134 # Extract the config JSON
# The player config is embedded in the page as a JS object literal;
# slice out the JSON between ' = {config:' and ',assets:'.
1136 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1137 config = json.loads(config)
1139 self._downloader.report_error(u'unable to extract info section')
1143 video_title = config["video"]["title"]
1145 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1146 video_uploader = config["video"]["owner"]["name"]
1147 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1149 # Extract video thumbnail
1150 video_thumbnail = config["video"]["thumbnail"]
1152 # Extract video description
1153 video_description = get_element_by_attribute("itemprop", "description", webpage)
1154 if video_description: video_description = clean_html(video_description)
1155 else: video_description = ''
1157 # Extract upload date
# ISO date from the dateCreated meta tag, reformatted to YYYYMMDD.
1158 video_upload_date = None
1159 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1160 if mobj is not None:
1161 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1163 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters of the play_redirect URL.
1164 sig = config['request']['signature']
1165 timestamp = config['request']['timestamp']
1167 # Vimeo specific: extract video codec and quality information
1168 # First consider quality, then codecs, then take everything
1169 # TODO bind to format param
# Codec preference order: h264 (mp4), then vp8/vp6 (flv). Each codec's
# entry goes into the 'hd', 'sd' or 'other' bucket; buckets are then
# scanned in that order and the first non-empty one wins.
1170 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1171 files = { 'hd': [], 'sd': [], 'other': []}
1172 for codec_name, codec_extension in codecs:
1173 if codec_name in config["video"]["files"]:
1174 if 'hd' in config["video"]["files"][codec_name]:
1175 files['hd'].append((codec_name, codec_extension, 'hd'))
1176 elif 'sd' in config["video"]["files"][codec_name]:
1177 files['sd'].append((codec_name, codec_extension, 'sd'))
1179 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1181 for quality in ('hd', 'sd', 'other'):
1182 if len(files[quality]) > 0:
1183 video_quality = files[quality][0][2]
1184 video_codec = files[quality][0][0]
1185 video_extension = files[quality][0][1]
1186 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1189 self._downloader.report_error(u'no known codec found')
# Build the final media URL from the signature/timestamp pair plus the
# selected quality and codec.
1192 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1193 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1198 'uploader': video_uploader,
1199 'uploader_id': video_uploader_id,
1200 'upload_date': video_upload_date,
1201 'title': video_title,
1202 'ext': video_extension,
1203 'thumbnail': video_thumbnail,
1204 'description': video_description,
1208 class ArteTvIE(InfoExtractor):
1209 """arte.tv information extractor."""
1211 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live-stream pages are recognized by an 'index-NNN.html' final path
# component.
1212 _LIVE_URL = r'index-[0-9]+\.html$'
1214 IE_NAME = u'arte.tv'
1216 def __init__(self, downloader=None):
1217 InfoExtractor.__init__(self, downloader)
1219 def report_download_webpage(self, video_id):
1220 """Report webpage download."""
1221 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1223 def report_extraction(self, video_id):
1224 """Report information extraction."""
1225 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1227 def fetch_webpage(self, url):
# Download a page, reporting the URL; errors are reported via the
# downloader rather than raised.
1228 request = compat_urllib_request.Request(url)
1230 self.report_download_webpage(url)
1231 webpage = compat_urllib_request.urlopen(request).read()
1232 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1233 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1235 except ValueError as err:
1236 self._downloader.report_error(u'Invalid URL: %s' % url)
1240 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch url, apply regex, and collect the groups listed in matchTuples
# as (group_index, dict_key, error_message) into an info dict.
1241 page = self.fetch_webpage(url)
1242 mobj = re.search(regex, page, regexFlags)
1246 self._downloader.report_error(u'Invalid URL: %s' % url)
1249 for (i, key, err) in matchTuples:
1250 if mobj.group(i) is None:
1251 self._downloader.trouble(err)
1254 info[key] = mobj.group(i)
1258 def extractLiveStream(self, url):
# Language code is a fixed path component of the live URL.
1259 video_lang = url.split('/')[-4]
1260 info = self.grep_webpage(
1262 r'src="(.*?/videothek_js.*?\.js)',
1265 (1, 'url', u'ERROR: Invalid URL: %s' % url)
# Follow the videothek JS file to find the stream path and SWF player.
1268 http_host = url.split('/')[2]
1269 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1270 info = self.grep_webpage(
1272 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1273 '(http://.*?\.swf).*?' +
1277 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1278 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1279 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1282 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1284 def extractPlus7Stream(self, url):
# Arte+7 (catch-up) pages: chase videorefFileUrl -> per-language
# <video> ref -> final XML with id/name/date and the HD media URL.
1285 video_lang = url.split('/')[-3]
1286 info = self.grep_webpage(
1288 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1291 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1294 next_url = compat_urllib_parse.unquote(info.get('url'))
1295 info = self.grep_webpage(
1297 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1300 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1303 next_url = compat_urllib_parse.unquote(info.get('url'))
1305 info = self.grep_webpage(
1307 r'<video id="(.*?)".*?>.*?' +
1308 '<name>(.*?)</name>.*?' +
1309 '<dateVideo>(.*?)</dateVideo>.*?' +
1310 '<url quality="hd">(.*?)</url>',
1313 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1314 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1315 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1316 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1321 'id': info.get('id'),
1322 'url': compat_urllib_parse.unquote(info.get('url')),
1323 'uploader': u'arte.tv',
# NOTE(review): 'date' comes straight from <dateVideo>; format is not
# normalized to YYYYMMDD here — confirm downstream handling.
1324 'upload_date': info.get('date'),
1325 'title': info.get('title').decode('utf-8'),
1331 def _real_extract(self, url):
1332 video_id = url.split('/')[-1]
1333 self.report_extraction(video_id)
1335 if re.search(self._LIVE_URL, video_id) is not None:
# NOTE(review): extractLiveStream's result is discarded and nothing is
# returned on this branch — live URLs appear unsupported in practice.
1336 self.extractLiveStream(url)
1339 info = self.extractPlus7Stream(url)
1344 class GenericIE(InfoExtractor):
1345 """Generic last-resort information extractor."""
1348 IE_NAME = u'generic'
1350 def __init__(self, downloader=None):
1351 InfoExtractor.__init__(self, downloader)
1353 def report_download_webpage(self, video_id):
1354 """Report webpage download."""
# The fallback warning is suppressed in test mode.
1355 if not self._downloader.params.get('test', False):
1356 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1357 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1359 def report_extraction(self, video_id):
1360 """Report information extraction."""
1361 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1363 def report_following_redirect(self, new_url):
1364 """Report that a redirect is being followed."""
1365 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1367 def _test_redirect(self, url):
1368 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Issue a HEAD request (cheap — no body) to discover the final URL.
1369 class HeadRequest(compat_urllib_request.Request):
1370 def get_method(self):
1373 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1375 Subclass the HTTPRedirectHandler to make it use our
1376 HeadRequest also on the redirected URL
1378 def redirect_request(self, req, fp, code, msg, headers, newurl):
1379 if code in (301, 302, 303, 307):
1380 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers, meaningless for a HEAD request.
1381 newheaders = dict((k,v) for k,v in req.headers.items()
1382 if k.lower() not in ("content-length", "content-type"))
1383 return HeadRequest(newurl,
1385 origin_req_host=req.get_origin_req_host(),
1388 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1390 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1392 Fallback to GET if HEAD is not allowed (405 HTTP error)
1394 def http_error_405(self, req, fp, code, msg, headers):
1398 newheaders = dict((k,v) for k,v in req.headers.items()
1399 if k.lower() not in ("content-length", "content-type"))
1400 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1402 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers needed for the HEAD probe.
1406 opener = compat_urllib_request.OpenerDirector()
1407 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1408 HTTPMethodFallback, HEADRedirectHandler,
1409 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1410 opener.add_handler(handler())
1412 response = opener.open(HeadRequest(url))
1413 new_url = response.geturl()
1418 self.report_following_redirect(new_url)
1421 def _real_extract(self, url):
# First resolve URL shorteners; if the URL redirects, hand the target
# back as a url_result so the proper extractor can pick it up.
1422 new_url = self._test_redirect(url)
1423 if new_url: return [self.url_result(new_url)]
1425 video_id = url.split('/')[-1]
1427 webpage = self._download_webpage(url, video_id)
1428 except ValueError as err:
1429 # since this is the last-resort InfoExtractor, if
1430 # this error is thrown, it'll be thrown here
1431 self._downloader.report_error(u'Invalid URL: %s' % url)
1434 self.report_extraction(video_id)
1435 # Start with something easy: JW Player in SWFObject
1436 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1438 # Broaden the search a little bit
1439 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1441 # Broaden the search a little bit: JWPlayer JS loader
1442 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1444 self._downloader.report_error(u'Invalid URL: %s' % url)
1447 # It's possible that one of the regexes
1448 # matched, but returned an empty group:
1449 if mobj.group(1) is None:
1450 self._downloader.report_error(u'Invalid URL: %s' % url)
1453 video_url = compat_urllib_parse.unquote(mobj.group(1))
1454 video_id = os.path.basename(video_url)
1456 # here's a fun little line of code for you:
# Split 'name.ext' into extension (without the dot) and bare id.
1457 video_extension = os.path.splitext(video_id)[1][1:]
1458 video_id = os.path.splitext(video_id)[0]
1460 # it's tempting to parse this further, but you would
1461 # have to take into account all the variations like
1462 # Video Title - Site Name
1463 # Site Name | Video Title
1464 # Video Title - Tagline | Site Name
1465 # and so on and so forth; it's just not practical
1466 mobj = re.search(r'<title>(.*)</title>', webpage)
1468 self._downloader.report_error(u'unable to extract title')
1470 video_title = mobj.group(1)
1472 # video uploader is domain name
1473 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): message says 'title' but this branch extracts the
# uploader — looks like a copy-paste; should read 'unable to extract
# uploader'.
1475 self._downloader.report_error(u'unable to extract title')
1477 video_uploader = mobj.group(1)
1482 'uploader': video_uploader,
1483 'upload_date': None,
1484 'title': video_title,
1485 'ext': video_extension,
1489 class YoutubeSearchIE(InfoExtractor):
1490 """Information Extractor for YouTube search queries."""
# Accepted forms: 'ytsearch:Q' (1 result), 'ytsearchN:Q' (N results),
# 'ytsearchall:Q' (up to _max_youtube_results).
1491 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1492 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1493 _max_youtube_results = 1000
1494 IE_NAME = u'youtube:search'
1496 def __init__(self, downloader=None):
1497 InfoExtractor.__init__(self, downloader)
1499 def report_download_page(self, query, pagenum):
1500 """Report attempt to download search page with given number."""
# NOTE(review): query.decode(...) assumes a Python 2 byte string (the
# query is encoded to utf-8 in _real_extract) — fails on Python 3 str.
1501 query = query.decode(preferredencoding())
1502 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1504 def _real_extract(self, query):
# Parse the 'ytsearch[N|all]:terms' pseudo-URL and dispatch to
# _download_n_results with the requested result count.
1505 mobj = re.match(self._VALID_URL, query)
1507 self._downloader.report_error(u'invalid search query "%s"' % query)
1510 prefix, query = query.split(':')
1512 query = query.encode('utf-8')
1514 self._download_n_results(query, 1)
1516 elif prefix == 'all':
1517 self._download_n_results(query, self._max_youtube_results)
1523 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1525 elif n > self._max_youtube_results:
1526 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1527 n = self._max_youtube_results
1528 self._download_n_results(query, n)
1530 except ValueError: # parsing prefix as integer fails
1531 self._download_n_results(query, 1)
1534 def _download_n_results(self, query, n):
1535 """Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until n results (or
# the API's totalItems) are collected, then download each video.
1541 while (50 * pagenum) < limit:
1542 self.report_download_page(query, pagenum+1)
1543 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1544 request = compat_urllib_request.Request(result_url)
1546 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1547 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1548 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1550 api_response = json.loads(data)['data']
1552 if not 'items' in api_response:
# NOTE(review): uses the legacy trouble() instead of report_error()
# as the sibling branches do.
1553 self._downloader.trouble(u'[youtube] No video results')
1556 new_ids = list(video['id'] for video in api_response['items'])
1557 video_ids += new_ids
1559 limit = min(n, api_response['totalItems'])
1562 if len(video_ids) > n:
1563 video_ids = video_ids[:n]
1564 for id in video_ids:
1565 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1569 class GoogleSearchIE(InfoExtractor):
1570 """Information Extractor for Google Video search queries."""
# Accepted forms: 'gvsearch:Q', 'gvsearchN:Q', 'gvsearchall:Q' —
# mirrors YoutubeSearchIE but scrapes HTML instead of a JSON API.
1571 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1572 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1573 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the 'next page' button marks more result pages.
1574 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1575 _max_google_results = 1000
1576 IE_NAME = u'video.google:search'
1578 def __init__(self, downloader=None):
1579 InfoExtractor.__init__(self, downloader)
1581 def report_download_page(self, query, pagenum):
1582 """Report attempt to download playlist page with given number."""
# NOTE(review): Python-2-only decode of a byte-string query.
1583 query = query.decode(preferredencoding())
1584 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1586 def _real_extract(self, query):
# Parse the 'gvsearch[N|all]:terms' pseudo-URL and dispatch.
1587 mobj = re.match(self._VALID_URL, query)
1589 self._downloader.report_error(u'invalid search query "%s"' % query)
1592 prefix, query = query.split(':')
1594 query = query.encode('utf-8')
1596 self._download_n_results(query, 1)
1598 elif prefix == 'all':
1599 self._download_n_results(query, self._max_google_results)
1605 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1607 elif n > self._max_google_results:
1608 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1609 n = self._max_google_results
1610 self._download_n_results(query, n)
1612 except ValueError: # parsing prefix as integer fails
1613 self._download_n_results(query, 1)
1616 def _download_n_results(self, query, n):
1617 """Downloads a specified number of results for a query"""
# Scrape result pages (10 results each), collecting docids until n are
# found or no 'next page' indicator remains, then download them all.
1623 self.report_download_page(query, pagenum)
1624 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1625 request = compat_urllib_request.Request(result_url)
1627 page = compat_urllib_request.urlopen(request).read()
1628 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1629 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1632 # Extract video identifiers
1633 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1634 video_id = mobj.group(1)
1635 if video_id not in video_ids:
1636 video_ids.append(video_id)
1637 if len(video_ids) == n:
1638 # Specified n videos reached
1639 for id in video_ids:
1640 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1643 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# Last page: download whatever was collected.
1644 for id in video_ids:
1645 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1648 pagenum = pagenum + 1
1651 class YahooSearchIE(InfoExtractor):
1652 """Information Extractor for Yahoo! Video search queries."""
# Accepted forms: 'yvsearch:Q', 'yvsearchN:Q', 'yvsearchall:Q' —
# same scrape-and-page structure as GoogleSearchIE.
1655 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1656 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1657 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1658 _MORE_PAGES_INDICATOR = r'\s*Next'
1659 _max_yahoo_results = 1000
1660 IE_NAME = u'video.yahoo:search'
1662 def __init__(self, downloader=None):
1663 InfoExtractor.__init__(self, downloader)
1665 def report_download_page(self, query, pagenum):
1666 """Report attempt to download playlist page with given number."""
# NOTE(review): Python-2-only decode of a byte-string query.
1667 query = query.decode(preferredencoding())
1668 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1670 def _real_extract(self, query):
# Parse the 'yvsearch[N|all]:terms' pseudo-URL and dispatch.
1671 mobj = re.match(self._VALID_URL, query)
1673 self._downloader.report_error(u'invalid search query "%s"' % query)
1676 prefix, query = query.split(':')
1678 query = query.encode('utf-8')
1680 self._download_n_results(query, 1)
1682 elif prefix == 'all':
1683 self._download_n_results(query, self._max_yahoo_results)
1689 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1691 elif n > self._max_yahoo_results:
1692 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1693 n = self._max_yahoo_results
1694 self._download_n_results(query, n)
1696 except ValueError: # parsing prefix as integer fails
1697 self._download_n_results(query, 1)
1700 def _download_n_results(self, query, n):
1701 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedup is tracked in a separate set so the
# ordered video_ids list stays the download order.
1704 already_seen = set()
1708 self.report_download_page(query, pagenum)
1709 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1710 request = compat_urllib_request.Request(result_url)
1712 page = compat_urllib_request.urlopen(request).read()
1713 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1714 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1717 # Extract video identifiers
1718 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1719 video_id = mobj.group(1)
1720 if video_id not in already_seen:
1721 video_ids.append(video_id)
1722 already_seen.add(video_id)
1723 if len(video_ids) == n:
1724 # Specified n videos reached
1725 for id in video_ids:
1726 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1729 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# Last page: download whatever was collected.
1730 for id in video_ids:
1731 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1734 pagenum = pagenum + 1
1737 class YoutubePlaylistIE(InfoExtractor):
1738 """Information Extractor for YouTube playlists."""
# Verbose-mode regex (note re.VERBOSE in suitable/_real_extract):
# matches playlist/course/artist page URLs and bare PL/EC/UU ids.
1740 _VALID_URL = r"""(?:
1745 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1746 \? (?:.*?&)*? (?:p|a|list)=
1749 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1752 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1754 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1756 IE_NAME = u'youtube:playlist'
1758 def __init__(self, downloader=None):
1759 InfoExtractor.__init__(self, downloader)
1762 def suitable(cls, url):
1763 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base class to pass re.VERBOSE for the multi-line regex.
1764 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1766 def report_download_page(self, playlist_id, pagenum):
1767 """Report attempt to download playlist page with given number."""
1768 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1770 def _real_extract(self, url):
1771 # Extract playlist id
1772 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1774 self._downloader.report_error(u'invalid url: %s' % url)
1777 # Download playlist videos from API
# group(1): id from page-style URLs; group(2): bare playlist id.
1778 playlist_id = mobj.group(1) or mobj.group(2)
1783 self.report_download_page(playlist_id, page_num)
1785 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1787 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1788 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1789 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1793 response = json.loads(page)
1794 except ValueError as err:
1795 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1798 if not 'feed' in response or not 'entry' in response['feed']:
1799 self._downloader.report_error(u'Got a malformed response from YouTube API')
# Collect (position, watch-url) pairs; entries without 'content' are
# unavailable videos and are skipped.
1801 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1802 for entry in response['feed']['entry']
1803 if 'content' in entry ]
# A short page means this was the last page of the feed.
1805 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1809 videos = [v[1] for v in sorted(videos)]
# Apply the user's --playlist-start/--playlist-end window
# (1-based start, -1 meaning 'to the end').
1812 playliststart = self._downloader.params.get('playliststart', 1) - 1
1813 playlistend = self._downloader.params.get('playlistend', -1)
1814 if playlistend == -1:
1815 videos = videos[playliststart:]
1817 videos = videos[playliststart:playlistend]
1819 if len(videos) == total:
1820 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1822 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1824 url_results = [self.url_result(url) for url in videos]
1825 return [self.playlist_result(url_results, playlist_id)]
1828 class YoutubeChannelIE(InfoExtractor):
1829 """Information Extractor for YouTube channels."""
1831 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1832 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Literal text of the 'next page' link; its absence ends paging.
1833 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1834 IE_NAME = u'youtube:channel'
1836 def report_download_page(self, channel_id, pagenum):
1837 """Report attempt to download channel page with given number."""
1838 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1840 def _real_extract(self, url):
1841 # Extract channel id
1842 mobj = re.match(self._VALID_URL, url)
1844 self._downloader.report_error(u'invalid url: %s' % url)
1847 # Download channel pages
1848 channel_id = mobj.group(1)
1853 self.report_download_page(channel_id, pagenum)
1854 url = self._TEMPLATE_URL % (channel_id, pagenum)
1855 request = compat_urllib_request.Request(url)
1857 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1858 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1859 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1862 # Extract video identifiers
# Scrape watch links, deduplicating within the page.
1864 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1865 if mobj.group(1) not in ids_in_page:
1866 ids_in_page.append(mobj.group(1))
1867 video_ids.extend(ids_in_page)
1869 if self._MORE_PAGES_INDICATOR not in page:
1871 pagenum = pagenum + 1
1873 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
# Wrap each watch URL as a url_result and return a single playlist.
1875 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1876 url_entries = [self.url_result(url) for url in urls]
1877 return [self.playlist_result(url_entries, channel_id)]
1880 class YoutubeUserIE(InfoExtractor):
1881 """Information Extractor for YouTube users."""
# Accepts both user-page URLs and the 'ytuser:NAME' shorthand.
1883 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1884 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1885 _GDATA_PAGE_SIZE = 50
1886 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1887 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1888 IE_NAME = u'youtube:user'
1890 def __init__(self, downloader=None):
1891 InfoExtractor.__init__(self, downloader)
1893 def report_download_page(self, username, start_index):
1894 """Report attempt to download user page."""
1895 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1896 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1898 def _real_extract(self, url):
1900 mobj = re.match(self._VALID_URL, url)
1902 self._downloader.report_error(u'invalid url: %s' % url)
1905 username = mobj.group(1)
1907 # Download video ids using YouTube Data API. Result size per
1908 # query is limited (currently to 50 videos) so we need to query
1909 # page by page until there are no video ids - it means we got
1916 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1917 self.report_download_page(username, start_index)
1919 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1922 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1923 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1924 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1927 # Extract video identifiers
# Scrape watch links from the feed page, deduplicating per page.
1930 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1931 if mobj.group(1) not in ids_in_page:
1932 ids_in_page.append(mobj.group(1))
1934 video_ids.extend(ids_in_page)
1936 # A little optimization - if current page is not
1937 # "full", ie. does not contain PAGE_SIZE video ids then
1938 # we can assume that this page is the last one - there
1939 # are no more ids on further pages - no need to query
1942 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window
# (1-based start, -1 meaning 'to the end').
1947 all_ids_count = len(video_ids)
1948 playliststart = self._downloader.params.get('playliststart', 1) - 1
1949 playlistend = self._downloader.params.get('playlistend', -1)
1951 if playlistend == -1:
1952 video_ids = video_ids[playliststart:]
1954 video_ids = video_ids[playliststart:playlistend]
1956 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1957 (username, all_ids_count, len(video_ids)))
# Wrap each watch URL as a url_result and return a single playlist.
1959 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1960 url_results = [self.url_result(url) for url in urls]
1961 return [self.playlist_result(url_results, playlist_title = username)]
1964 class BlipTVUserIE(InfoExtractor):
# NOTE(review): the original line numbers are fused into each line of this
# chunk and the numbering skips, so guard/try/return lines are elided from
# this view -- confirm edits against the full source file.
1965 """Information Extractor for blip.tv users."""
1967 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1969 IE_NAME = u'blip.tv:user'
1971 def __init__(self, downloader=None):
1972 InfoExtractor.__init__(self, downloader)
1974 def report_download_page(self, username, pagenum):
1975 """Report attempt to download user page."""
1976 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1977 (self.IE_NAME, username, pagenum))
1979 def _real_extract(self, url):
# Extract the username from the URL, then resolve it to the numeric
# users_id embedded in the mobile page (data-users-id attribute).
1981 mobj = re.match(self._VALID_URL, url)
1983 self._downloader.report_error(u'invalid url: %s' % url)
1986 username = mobj.group(1)
1988 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1990 request = compat_urllib_request.Request(url)
1993 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1994 mobj = re.search(r'data-users-id="([^"]+)"', page)
1995 page_base = page_base % mobj.group(1)
1996 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1997 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2001 # Download video ids using BlipTV Ajax calls. Result size per
2002 # query is limited (currently to 12 videos) so we need to query
2003 # page by page until there are no video ids - it means we got
2010 self.report_download_page(username, pagenum)
2011 url = page_base + "&page=" + str(pagenum)
2012 request = compat_urllib_request.Request( url )
2014 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
2015 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2016 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
2019 # Extract video identifiers
# Dedupe within a page before accumulating into the overall list.
2022 for mobj in re.finditer(r'href="/([^"]+)"', page):
2023 if mobj.group(1) not in ids_in_page:
2024 ids_in_page.append(unescapeHTML(mobj.group(1)))
2026 video_ids.extend(ids_in_page)
2028 # A little optimization - if current page is not
2029 # "full", ie. does not contain PAGE_SIZE video ids then
2030 # we can assume that this page is the last one - there
2031 # are no more ids on further pages - no need to query
2034 if len(ids_in_page) < self._PAGE_SIZE:
# Apply the user-requested playlist window. playliststart is
# 1-based in params, hence the -1; playlistend == -1 means "to end".
2039 all_ids_count = len(video_ids)
2040 playliststart = self._downloader.params.get('playliststart', 1) - 1
2041 playlistend = self._downloader.params.get('playlistend', -1)
2043 if playlistend == -1:
2044 video_ids = video_ids[playliststart:]
2046 video_ids = video_ids[playliststart:playlistend]
2048 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2049 (self.IE_NAME, username, all_ids_count, len(video_ids)))
# Hand each id back as a url_result so BlipTVIE does the real extraction.
2051 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2052 url_entries = [self.url_result(url) for url in urls]
2053 return [self.playlist_result(url_entries, playlist_title = username)]
2056 class DepositFilesIE(InfoExtractor):
# NOTE(review): line numbering in this chunk skips (guards/returns elided);
# treat this view as a sample of the real block.
2057 """Information extractor for depositfiles.com"""
2059 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2061 def report_download_webpage(self, file_id):
2062 """Report webpage download."""
2063 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2065 def report_extraction(self, file_id):
2066 """Report information extraction."""
2067 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2069 def _real_extract(self, url):
2070 file_id = url.split('/')[-1]
2071 # Rebuild url in english locale
2072 url = 'http://depositfiles.com/en/files/' + file_id
2074 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates the site's free-download flow.
2075 free_download_indication = { 'gateway_result' : '1' }
2076 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2078 self.report_download_webpage(file_id)
2079 webpage = compat_urllib_request.urlopen(request).read()
2080 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2081 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2084 # Search for the real file URL
2085 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2086 if (mobj is None) or (mobj.group(1) is None):
2087 # Try to figure out reason of the error.
2088 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2089 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site's restriction banner for a clean message.
2090 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2091 self._downloader.report_error(u'%s' % restriction_message)
2093 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2096 file_url = mobj.group(1)
2097 file_extension = os.path.splitext(file_url)[1][1:]
2099 # Search for file title
2100 mobj = re.search(r'<b title="(.*?)">', webpage)
2102 self._downloader.report_error(u'unable to extract title')
# NOTE(review): webpage was read as bytes (no decode above), so these
# .decode('utf-8') calls suggest Python-2-era str handling -- verify.
2104 file_title = mobj.group(1).decode('utf-8')
2107 'id': file_id.decode('utf-8'),
2108 'url': file_url.decode('utf-8'),
2110 'upload_date': None,
2111 'title': file_title,
2112 'ext': file_extension.decode('utf-8'),
2116 class FacebookIE(InfoExtractor):
# NOTE(review): elided lines in this chunk include the login_form
# construction and several guards -- confirm against the full source.
2117 """Information Extractor for Facebook"""
2119 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2120 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2121 _NETRC_MACHINE = 'facebook'
2122 IE_NAME = u'facebook'
2124 def report_login(self):
2125 """Report attempt to log in."""
2126 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2128 def _real_initialize(self):
# Optional login step: credentials come from --username/--password or,
# failing that, from the user's .netrc entry for this machine name.
2129 if self._downloader is None:
2134 downloader_params = self._downloader.params
2136 # Attempt to use provided username and password or .netrc data
2137 if downloader_params.get('username', None) is not None:
2138 useremail = downloader_params['username']
2139 password = downloader_params['password']
2140 elif downloader_params.get('usenetrc', False):
2142 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2143 if info is not None:
2147 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2148 except (IOError, netrc.NetrcParseError) as err:
2149 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2152 if useremail is None:
2161 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2164 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2165 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
# NOTE(review): "exceded" below is a typo ("exceeded") in a user-facing
# warning string; fix it in a behavior-change commit, not here.
2166 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2169 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2172 def _real_extract(self, url):
2173 mobj = re.match(self._VALID_URL, url)
2175 self._downloader.report_error(u'invalid URL: %s' % url)
2177 video_id = mobj.group('ID')
2179 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2180 webpage = self._download_webpage(url, video_id)
# The video params are embedded in inline JS; anchor on the exact
# swf.addParam boilerplate before and the addVariable call after.
2182 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2183 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2184 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2186 raise ExtractorError(u'Cannot parse data')
2187 data = dict(json.loads(m.group(1)))
2188 params_raw = compat_urllib_parse.unquote(data['params'])
2189 params = json.loads(params_raw)
# Prefer the HD stream, falling back to SD (guard lines elided here).
2190 video_url = params['hd_src']
2192 video_url = params['sd_src']
2194 raise ExtractorError(u'Cannot find video URL')
2195 video_duration = int(params['video_duration'])
2197 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2199 raise ExtractorError(u'Cannot find title in webpage')
2200 video_title = unescapeHTML(m.group(1))
2204 'title': video_title,
2207 'duration': video_duration,
2208 'thumbnail': params['thumbnail_src'],
2213 class BlipTVIE(InfoExtractor):
# NOTE(review): several try:/if guards are elided from this view (the
# fused numbering skips); verify structure against the full source.
2214 """Information extractor for blip.tv"""
2216 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2217 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2218 IE_NAME = u'blip.tv'
2220 def report_extraction(self, file_id):
2221 """Report information extraction."""
2222 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2224 def report_direct_download(self, title):
2225 """Report information extraction."""
2226 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2228 def _real_extract(self, url):
2229 mobj = re.match(self._VALID_URL, url)
2231 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the real file id;
# rewrite to the canonical /a/a-<id> form and recurse once.
2234 urlp = compat_urllib_parse_urlparse(url)
2235 if urlp.path.startswith('/play/'):
2236 request = compat_urllib_request.Request(url)
2237 response = compat_urllib_request.urlopen(request)
2238 redirecturl = response.geturl()
2239 rurlp = compat_urllib_parse_urlparse(redirecturl)
2240 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2241 url = 'http://blip.tv/a/a-' + file_id
2242 return self._real_extract(url)
# Ask the site for JSON metadata; the iTunes User-Agent is deliberate
# (the server varies its response on it).
2249 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2250 request = compat_urllib_request.Request(json_url)
2251 request.add_header('User-Agent', 'iTunes/10.6.1')
2252 self.report_extraction(mobj.group(1))
2255 urlh = compat_urllib_request.urlopen(request)
2256 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2257 basename = url.split('/')[-1]
2258 title,ext = os.path.splitext(basename)
2259 title = title.decode('UTF-8')
2260 ext = ext.replace('.', '')
2261 self.report_direct_download(title)
2266 'upload_date': None,
2271 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2272 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2273 if info is None: # Regular URL
2275 json_code_bytes = urlh.read()
2276 json_code = json_code_bytes.decode('utf-8')
2277 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2278 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2282 json_data = json.loads(json_code)
# The JSON sometimes wraps the record in a 'Post' envelope.
2283 if 'Post' in json_data:
2284 data = json_data['Post']
2288 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2289 video_url = data['media']['url']
# Derive the extension from the media URL itself.
2290 umobj = re.match(self._URL_EXT, video_url)
2292 raise ValueError('Can not determine filename extension')
2293 ext = umobj.group(1)
2296 'id': data['item_id'],
2298 'uploader': data['display_name'],
2299 'upload_date': upload_date,
2300 'title': data['title'],
2302 'format': data['media']['mimeType'],
2303 'thumbnail': data['thumbnailUrl'],
2304 'description': data['description'],
2305 'player_url': data['embedUrl'],
2306 'user_agent': 'iTunes/10.6.1',
2308 except (ValueError,KeyError) as err:
2309 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Builds the FLV media URL from the thumbnail image-server path found in
    the watch page, and returns a single info dictionary.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.report_error (missing 'er'), which
            # raised AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image-server prefix in the thumbnail link doubles as the
        # media URL prefix; append /<id>.flv to obtain the video URL.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2364 class ComedyCentralIE(InfoExtractor):
# NOTE(review): many guard/loop-header lines are elided from this view
# (fused numbering skips); confirm structure against the full source.
2365 """Information extractor for The Daily Show and Colbert Report """
2367 # urls can be abbreviations like :thedailyshow or :colbert
2368 # urls for episodes like:
2369 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2370 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2371 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2372 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2373 |(https?://)?(www\.)?
2374 (?P<showname>thedailyshow|colbertnation)\.com/
2375 (full-episodes/(?P<episode>.*)|
2377 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2378 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2381 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2383 _video_extensions = {
2391 _video_dimensions = {
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE)
# pattern, unlike the base class's plain re.match.
2401 def suitable(cls, url):
2402 """Receives a URL and returns True if suitable for this IE."""
2403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2405 def report_extraction(self, episode_id):
2406 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2408 def report_config_download(self, episode_id, media_id):
2409 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2411 def report_index_download(self, episode_id):
2412 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2414 def _print_formats(self, formats):
2415 print('Available formats:')
2417 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2420 def _real_extract(self, url):
2421 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2423 self._downloader.report_error(u'invalid URL: %s' % url)
# Expand :tds / :colbert style abbreviations to the full-episodes page,
# then re-match so the named groups are populated.
2426 if mobj.group('shortname'):
2427 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2428 url = u'http://www.thedailyshow.com/full-episodes/'
2430 url = u'http://www.colbertnation.com/full-episodes/'
2431 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2432 assert mobj is not None
2434 if mobj.group('clip'):
2435 if mobj.group('showname') == 'thedailyshow':
2436 epTitle = mobj.group('tdstitle')
2438 epTitle = mobj.group('cntitle')
2441 dlNewest = not mobj.group('episode')
2443 epTitle = mobj.group('showname')
2445 epTitle = mobj.group('episode')
2447 req = compat_urllib_request.Request(url)
2448 self.report_extraction(epTitle)
2450 htmlHandle = compat_urllib_request.urlopen(req)
2451 html = htmlHandle.read()
2452 webpage = html.decode('utf-8')
2453 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2454 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# When we asked for "newest", follow the server redirect to the concrete
# episode URL and re-match to recover the episode title.
2457 url = htmlHandle.geturl()
2458 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2460 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2462 if mobj.group('episode') == '':
2463 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2465 epTitle = mobj.group('episode')
2467 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2469 if len(mMovieParams) == 0:
2470 # The Colbert Report embeds the information in a without
2471 # a URL prefix; so extract the alternate reference
2472 # and then add the URL prefix manually.
2474 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2475 if len(altMovieParams) == 0:
2476 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2479 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Fetch the MRSS show index for this media URI, then walk its <item>s.
2481 uri = mMovieParams[0][1]
2482 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2483 self.report_index_download(epTitle)
2485 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2486 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2487 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2492 idoc = xml.etree.ElementTree.fromstring(indexXml)
2493 itemEls = idoc.findall('.//item')
2494 for partNum,itemEl in enumerate(itemEls):
2495 mediaId = itemEl.findall('./guid')[0].text
2496 shortMediaId = mediaId.split(':')[-1]
2497 showId = mediaId.split(':')[-2].replace('.com', '')
2498 officialTitle = itemEl.findall('./title')[0].text
2499 officialDate = itemEl.findall('./pubDate')[0].text
# Per-item config XML lists the available renditions (bitrate -> src).
2501 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2502 compat_urllib_parse.urlencode({'uri': mediaId}))
2503 configReq = compat_urllib_request.Request(configUrl)
2504 self.report_config_download(epTitle, shortMediaId)
2506 configXml = compat_urllib_request.urlopen(configReq).read()
2507 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2508 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2511 cdoc = xml.etree.ElementTree.fromstring(configXml)
2513 for rendition in cdoc.findall('.//rendition'):
2514 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2518 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2521 if self._downloader.params.get('listformats', None):
2522 self._print_formats([i[0] for i in turls])
2525 # For now, just pick the highest bitrate
2526 format,rtmp_video_url = turls[-1]
2528 # Get the format arg from the arg stream
2529 req_format = self._downloader.params.get('format', None)
2531 # Select format if we can find one
2534 format, rtmp_video_url = f, v
# The RTMP URL is not downloadable directly; rewrite its gsp.comedystor
# path onto the known HTTP mirror base instead.
2537 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2539 raise ExtractorError(u'Cannot transform RTMP url')
2540 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2541 video_url = base + m.group('finalid')
2543 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2548 'upload_date': officialDate,
2553 'description': officialTitle,
2555 results.append(info)
2560 class EscapistIE(InfoExtractor):
# NOTE(review): guard lines are elided from this view (fused numbering
# skips); confirm against the full source.
2561 """Information extractor for The Escapist """
2563 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2564 IE_NAME = u'escapist'
2566 def report_extraction(self, showName):
2567 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2569 def report_config_download(self, showName):
2570 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2572 def _real_extract(self, url):
2573 mobj = re.match(self._VALID_URL, url)
2575 self._downloader.report_error(u'invalid URL: %s' % url)
2577 showName = mobj.group('showname')
2578 videoId = mobj.group('episode')
2580 self.report_extraction(showName)
2582 webPage = compat_urllib_request.urlopen(url)
2583 webPageBytes = webPage.read()
# Decode using the charset declared in Content-Type, defaulting to UTF-8.
2584 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2585 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2586 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2587 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# Pull metadata out of the page's <meta> tags, then find the player's
# config URL embedded in the og:video value.
2590 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2591 description = unescapeHTML(descMatch.group(1))
2592 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2593 imgUrl = unescapeHTML(imgMatch.group(1))
2594 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2595 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2596 configUrlMatch = re.search('config=(.*)$', playerUrl)
2597 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2599 self.report_config_download(showName)
2601 configJSON = compat_urllib_request.urlopen(configUrl)
2602 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2603 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2604 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2605 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2608 # Technically, it's JavaScript, not JSON
# Naive single->double quote swap to coerce the JS literal into JSON;
# would corrupt values containing quotes, but matches the site's output.
2609 configJSON = configJSON.replace("'", '"')
2612 config = json.loads(configJSON)
2613 except (ValueError,) as err:
2614 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2617 playlist = config['playlist']
2618 videoUrl = playlist[1]['url']
2623 'uploader': showName,
2624 'upload_date': None,
2627 'thumbnail': imgUrl,
2628 'description': description,
2629 'player_url': playerUrl,
2634 class CollegeHumorIE(InfoExtractor):
# NOTE(review): guard/try lines elided in this view; confirm against the
# full source.
2635 """Information extractor for collegehumor.com"""
2638 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2639 IE_NAME = u'collegehumor'
2641 def report_manifest(self, video_id):
2642 """Report information extraction."""
2643 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2645 def report_extraction(self, video_id):
2646 """Report information extraction."""
2647 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2649 def _real_extract(self, url):
2650 mobj = re.match(self._VALID_URL, url)
2652 self._downloader.report_error(u'invalid URL: %s' % url)
2654 video_id = mobj.group('videoid')
2659 'upload_date': None,
# First fetch: per-video metadata XML (title/description/thumbnail and
# the f4m manifest URL).
2662 self.report_extraction(video_id)
2663 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2665 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2666 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2667 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2670 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2672 videoNode = mdoc.findall('./video')[0]
2673 info['description'] = videoNode.findall('./description')[0].text
2674 info['title'] = videoNode.findall('./caption')[0].text
2675 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2676 manifest_url = videoNode.findall('./file')[0].text
2678 self._downloader.report_error(u'Invalid metadata XML file')
# Second fetch: the Adobe HDS (f4m) manifest; hdcore param is required
# by the CDN to serve it.
2681 manifest_url += '?hdcore=2.10.3'
2682 self.report_manifest(video_id)
2684 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2685 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2686 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2689 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2691 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2692 node_id = media_node.attrib['url']
2693 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2694 except IndexError as err:
2695 self._downloader.report_error(u'Invalid manifest file')
# Synthesize the first-segment URL from the manifest's host and ids.
2698 url_pr = compat_urllib_parse_urlparse(manifest_url)
2699 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2706 class XVideosIE(InfoExtractor):
# NOTE(review): guard lines elided in this view; confirm against the full
# source.
2707 """Information extractor for xvideos.com"""
2709 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2710 IE_NAME = u'xvideos'
2712 def report_extraction(self, video_id):
2713 """Report information extraction."""
2714 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2716 def _real_extract(self, url):
2717 mobj = re.match(self._VALID_URL, url)
2719 self._downloader.report_error(u'invalid URL: %s' % url)
2721 video_id = mobj.group(1)
2723 webpage = self._download_webpage(url, video_id)
2725 self.report_extraction(video_id)
# Media URL is passed URL-encoded in the player's flv_url parameter.
2729 mobj = re.search(r'flv_url=(.+?)&', webpage)
2731 self._downloader.report_error(u'unable to extract video url')
2733 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> tag minus the trailing " - XVID..." suffix.
2737 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2739 self._downloader.report_error(u'unable to extract video title')
2741 video_title = mobj.group(1)
2744 # Extract video thumbnail
2745 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2747 self._downloader.report_error(u'unable to extract video thumbnail')
2749 video_thumbnail = mobj.group(0)
2755 'upload_date': None,
2756 'title': video_title,
2758 'thumbnail': video_thumbnail,
2759 'description': None,
2765 class SoundcloudIE(InfoExtractor):
# NOTE(review): guard/try lines elided in this view; confirm against the
# full source.
2766 """Information extractor for soundcloud.com
2767 To access the media, the uid of the song and a stream token
2768 must be extracted from the page source and the script must make
2769 a request to media.soundcloud.com/crossdomain.xml. Then
2770 the media can be grabbed by requesting from an url composed
2771 of the stream token and uid
2774 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2775 IE_NAME = u'soundcloud'
2777 def __init__(self, downloader=None):
2778 InfoExtractor.__init__(self, downloader)
2780 def report_resolve(self, video_id):
2781 """Report information extraction."""
2782 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2784 def report_extraction(self, video_id):
2785 """Report information extraction."""
2786 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2788 def _real_extract(self, url):
2789 mobj = re.match(self._VALID_URL, url)
2791 self._downloader.report_error(u'invalid URL: %s' % url)
2794 # extract uploader (which is in the url)
2795 uploader = mobj.group(1)
2796 # extract simple title (uploader + slug of song title)
2797 slug_title = mobj.group(2)
2798 simple_title = uploader + u'-' + slug_title
2800 self.report_resolve('%s/%s' % (uploader, slug_title))
# Resolve the vanity URL to a track record via the public API.
# NOTE(review): the client_id is a hard-coded API key baked into the URL.
2802 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2803 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2804 request = compat_urllib_request.Request(resolv_url)
2806 info_json_bytes = compat_urllib_request.urlopen(request).read()
2807 info_json = info_json_bytes.decode('utf-8')
2808 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2809 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2812 info = json.loads(info_json)
2813 video_id = info['id']
2814 self.report_extraction('%s/%s' % (uploader, slug_title))
# Second API call: list the track's streams and pick the 128k MP3.
2816 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2817 request = compat_urllib_request.Request(streams_url)
2819 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2820 stream_json = stream_json_bytes.decode('utf-8')
2821 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2822 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2825 streams = json.loads(stream_json)
2826 mediaURL = streams['http_mp3_128_url']
2831 'uploader': info['user']['username'],
2832 'upload_date': info['created_at'],
2833 'title': info['title'],
2835 'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    Resolves a set (playlist) vanity URL via the public API, then retrieves
    the 128k MP3 stream URL for each track in the set and returns one info
    dictionary per track.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # CONSISTENCY: use report_error like the sibling SoundcloudIE
            # (this class still used the legacy trouble(u'ERROR: ...') calls).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the vanity URL to the set record via the public API.
        # NOTE(review): the client_id is a hard-coded API key baked into the URL.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports per-item errors in an 'errors' list.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Per-track API call: list the streams and pick the 128k MP3.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2920 class InfoQIE(InfoExtractor):
# NOTE(review): guard lines elided in this view; confirm against the full
# source.
2921 """Information extractor for infoq.com"""
2922 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2924 def report_extraction(self, video_id):
2925 """Report information extraction."""
2926 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2928 def _real_extract(self, url):
2929 mobj = re.match(self._VALID_URL, url)
2931 self._downloader.report_error(u'invalid URL: %s' % url)
2934 webpage = self._download_webpage(url, video_id=url)
2935 self.report_extraction(url)
# The media id is base64-encoded in the page's jsclassref attribute;
# decode it and build the RTMPE stream URL.
2938 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2940 self._downloader.report_error(u'unable to extract video url')
2942 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2943 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2946 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2948 self._downloader.report_error(u'unable to extract video title')
2950 video_title = mobj.group(1)
2952 # Extract description
2953 video_description = u'No description available.'
2954 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2955 if mobj is not None:
2956 video_description = mobj.group(1)
# Derive id and extension from the last path component of the stream URL.
2958 video_filename = video_url.split('/')[-1]
2959 video_id, extension = video_filename.split('.')
2965 'upload_date': None,
2966 'title': video_title,
2967 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2969 'description': video_description,
2974 class MixcloudIE(InfoExtractor):
2975 """Information extractor for www.mixcloud.com"""
2977 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2978 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2979 IE_NAME = u'mixcloud'
2981 def __init__(self, downloader=None):
2982 InfoExtractor.__init__(self, downloader)
2984 def report_download_json(self, file_id):
2985 """Report JSON download."""
2986 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2988 def report_extraction(self, file_id):
2989 """Report information extraction."""
2990 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2992 def get_urls(self, jsonData, fmt, bitrate='best'):
2993 """Get urls from 'audio_formats' section in json"""
# When the format entry is a bitrate->urls mapping, pick the requested (or
# highest) bitrate; a TypeError means the entry is a plain url list instead.
2996 bitrate_list = jsonData[fmt]
2997 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2998 bitrate = max(bitrate_list) # select highest
3000 url_list = jsonData[fmt][bitrate]
3001 except TypeError: # we have no bitrate info.
3002 url_list = jsonData[fmt]
3005 def check_urls(self, url_list):
3006 """Returns 1st active url from list"""
# Probe each candidate with an HTTP request; network failure means try next.
3007 for url in url_list:
3009 compat_urllib_request.urlopen(url)
3011 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3016 def _print_formats(self, formats):
# List every available format/bitrate pair for --list-formats output.
3017 print('Available formats:')
3018 for fmt in formats.keys():
3019 for b in formats[fmt]:
3021 ext = formats[fmt][b][0]
3022 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3023 except TypeError: # we have no bitrate info
3024 ext = formats[fmt][0]
3025 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3028 def _real_extract(self, url):
3029 mobj = re.match(self._VALID_URL, url)
3031 self._downloader.report_error(u'invalid URL: %s' % url)
3033 # extract uploader & filename from url
3034 uploader = mobj.group(1).decode('utf-8')
3035 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3037 # construct API request
3038 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3039 # retrieve .json file with links to files
3040 request = compat_urllib_request.Request(file_url)
3042 self.report_download_json(file_url)
3043 jsonData = compat_urllib_request.urlopen(request).read()
3044 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3045 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3049 json_data = json.loads(jsonData)
3050 player_url = json_data['player_swf_url']
3051 formats = dict(json_data['audio_formats'])
3053 req_format = self._downloader.params.get('format', None)
3056 if self._downloader.params.get('listformats', None):
3057 self._print_formats(formats)
# For 'best' (or unspecified), walk the formats and keep the first one with a
# reachable URL; otherwise require the exact requested format.
3060 if req_format is None or req_format == 'best':
3061 for format_param in formats.keys():
3062 url_list = self.get_urls(formats, format_param)
3064 file_url = self.check_urls(url_list)
3065 if file_url is not None:
3068 if req_format not in formats:
3069 self._downloader.report_error(u'format is not available')
3072 url_list = self.get_urls(formats, req_format)
3073 file_url = self.check_urls(url_list)
3074 format_param = req_format
3077 'id': file_id.decode('utf-8'),
3078 'url': file_url.decode('utf-8'),
3079 'uploader': uploader.decode('utf-8'),
3080 'upload_date': None,
3081 'title': json_data['name'],
3082 'ext': file_url.split('.')[-1].decode('utf-8'),
3083 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3084 'thumbnail': json_data['thumbnail_url'],
3085 'description': json_data['description'],
3086 'player_url': player_url.decode('utf-8'),
3089 class StanfordOpenClassroomIE(InfoExtractor):
3090 """Information extractor for Stanford's Open ClassRoom"""
3092 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3093 IE_NAME = u'stanfordoc'
3095 def report_download_webpage(self, objid):
"""Report information extraction."""
3097 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3099 def report_extraction(self, video_id):
3100 """Report information extraction."""
3101 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Handles three URL shapes: a specific video (course+video groups), a course
# page (course group only, yields references to its videos), and the site root
# (yields references to all course pages). Reference entries are re-extracted
# recursively via self.extract().
3103 def _real_extract(self, url):
3104 mobj = re.match(self._VALID_URL, url)
3106 raise ExtractorError(u'Invalid URL: %s' % url)
3108 if mobj.group('course') and mobj.group('video'): # A specific video
3109 course = mobj.group('course')
3110 video = mobj.group('video')
3112 'id': course + '_' + video,
3114 'upload_date': None,
3117 self.report_extraction(info['id'])
3118 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3119 xmlUrl = baseUrl + video + '.xml'
3121 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3123 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3125 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3127 info['title'] = mdoc.findall('./title')[0].text
3128 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3130 self._downloader.report_error(u'Invalid metadata XML file')
3132 info['ext'] = info['url'].rpartition('.')[2]
3134 elif mobj.group('course'): # A course page
3135 course = mobj.group('course')
3140 'upload_date': None,
3143 coursepage = self._download_webpage(url, info['id'],
3144 note='Downloading course info page',
3145 errnote='Unable to download course info page')
3147 m = re.search('<h1>([^<]+)</h1>', coursepage)
3149 info['title'] = unescapeHTML(m.group(1))
3151 info['title'] = info['id']
3153 m = re.search('<description>([^<]+)</description>', coursepage)
3155 info['description'] = unescapeHTML(m.group(1))
3157 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3160 'type': 'reference',
3161 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3165 for entry in info['list']:
3166 assert entry['type'] == 'reference'
3167 results += self.extract(entry['url'])
3171 'id': 'Stanford OpenClassroom',
3174 'upload_date': None,
3177 self.report_download_webpage(info['id'])
3178 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3180 rootpage = compat_urllib_request.urlopen(rootURL).read()
3181 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3182 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3185 info['title'] = info['id']
3187 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3190 'type': 'reference',
3191 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3196 for entry in info['list']:
3197 assert entry['type'] == 'reference'
3198 results += self.extract(entry['url'])
3201 class MTVIE(InfoExtractor):
3202 """Information extractor for MTV.com"""
3204 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3207 def report_extraction(self, video_id):
3208 """Report information extraction."""
3209 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Scrape song/performer/uri/playlist-id meta tags from the page, then fetch the
# mediaGen XML and take the last <rendition> as the highest quality stream.
3211 def _real_extract(self, url):
3212 mobj = re.match(self._VALID_URL, url)
3214 self._downloader.report_error(u'invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize before downloading.
3216 if not mobj.group('proto'):
3217 url = 'http://' + url
3218 video_id = mobj.group('videoid')
3220 webpage = self._download_webpage(url, video_id)
3222 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3224 self._downloader.report_error(u'unable to extract song name')
3226 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3227 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3229 self._downloader.report_error(u'unable to extract performer')
3231 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3232 video_title = performer + ' - ' + song_name
3234 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message below reads 'unable to mtvn_uri' — likely meant
# 'unable to extract mtvn_uri' (runtime string; not changed here).
3236 self._downloader.report_error(u'unable to mtvn_uri')
3238 mtvn_uri = mobj.group(1)
3240 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3242 self._downloader.report_error(u'unable to extract content id')
3244 content_id = mobj.group(1)
3246 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3247 self.report_extraction(video_id)
3248 request = compat_urllib_request.Request(videogen_url)
3250 metadataXml = compat_urllib_request.urlopen(request).read()
3251 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3252 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3255 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3256 renditions = mdoc.findall('.//rendition')
3258 # For now, always pick the highest quality.
3259 rendition = renditions[-1]
# Format string is e.g. 'mp4-640x360_1200' (ext-WxH_bitrate).
3262 _,_,ext = rendition.attrib['type'].partition('/')
3263 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3264 video_url = rendition.find('./src').text
3266 self._downloader.trouble('Invalid rendition field.')
3272 'uploader': performer,
3273 'upload_date': None,
3274 'title': video_title,
3282 class YoukuIE(InfoExtractor):
3283 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3285 def report_download_webpage(self, file_id):
3286 """Report webpage download."""
3287 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3289 def report_extraction(self, file_id):
3290 """Report information extraction."""
3291 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp followed by two random numbers.
3294 nowTime = int(time.time() * 1000)
3295 random1 = random.randint(1000,1998)
3296 random2 = random.randint(1000,9999)
3298 return "%d%d%d" %(nowTime,random1,random2)
3300 def _get_file_ID_mix_string(self, seed):
# Deterministically shuffle the alphabet with a linear-congruential-style
# generator driven by the server-supplied seed; each step removes the picked
# character so the result is a permutation.
3302 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3304 for i in range(len(source)):
3305 seed = (seed * 211 + 30031 ) % 65536
3306 index = math.floor(seed / 65536 * len(source) )
3307 mixed.append(source[int(index)])
3308 source.remove(source[int(index)])
3309 #return ''.join(mixed)
3312 def _get_file_id(self, fileId, seed):
# Map each '*'-separated index in fileId through the shuffled alphabet to
# recover the real file id.
3313 mixed = self._get_file_ID_mix_string(seed)
3314 ids = fileId.split('*')
3318 realId.append(mixed[int(ch)])
3319 return ''.join(realId)
3321 def _real_extract(self, url):
3322 mobj = re.match(self._VALID_URL, url)
3324 self._downloader.report_error(u'invalid URL: %s' % url)
3326 video_id = mobj.group('ID')
3328 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3330 request = compat_urllib_request.Request(info_url, None, std_headers)
3332 self.report_download_webpage(video_id)
3333 jsondata = compat_urllib_request.urlopen(request).read()
3334 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3335 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3338 self.report_extraction(video_id)
3340 jsonstr = jsondata.decode('utf-8')
3341 config = json.loads(jsonstr)
3343 video_title = config['data'][0]['title']
3344 seed = config['data'][0]['seed']
3346 format = self._downloader.params.get('format', None)
3347 supported_format = list(config['data'][0]['streamfileids'].keys())
# Map the user's 'best'/'worst' request onto Youku's format names.
3349 if format is None or format == 'best':
3350 if 'hd2' in supported_format:
3355 elif format == 'worst':
3363 fileid = config['data'][0]['streamfileids'][format]
3364 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3365 except (UnicodeDecodeError, ValueError, KeyError):
3366 self._downloader.report_error(u'unable to extract info section')
3370 sid = self._gen_sid()
3371 fileid = self._get_file_id(fileid, seed)
3373 #column 8,9 of fileid represent the segment number
3374 #fileid[7:9] should be changed
# One download URL (and info dict) per segment; the segment index is spliced
# into the fileid as two hex digits.
3375 for index, key in enumerate(keys):
3377 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3378 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3381 'id': '%s_part%02d' % (video_id, index),
3382 'url': download_url,
3384 'upload_date': None,
3385 'title': video_title,
3388 files_info.append(info)
3393 class XNXXIE(InfoExtractor):
3394 """Information extractor for xnxx.com"""
3396 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Scrape patterns: FLV URL, page title, and thumbnail URL respectively.
3398 VIDEO_URL_RE = r'flv_url=(.*?)&'
3399 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3400 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3402 def report_webpage(self, video_id):
3403 """Report information extraction"""
3404 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3406 def report_extraction(self, video_id):
3407 """Report information extraction"""
3408 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3410 def _real_extract(self, url):
3411 mobj = re.match(self._VALID_URL, url)
3413 self._downloader.report_error(u'invalid URL: %s' % url)
3415 video_id = mobj.group(1)
3417 self.report_webpage(video_id)
3419 # Get webpage content
3421 webpage_bytes = compat_urllib_request.urlopen(url).read()
3422 webpage = webpage_bytes.decode('utf-8')
3423 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3424 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# The flv_url parameter is percent-encoded in the page; unquote it.
3427 result = re.search(self.VIDEO_URL_RE, webpage)
3429 self._downloader.report_error(u'unable to extract video url')
3431 video_url = compat_urllib_parse.unquote(result.group(1))
3433 result = re.search(self.VIDEO_TITLE_RE, webpage)
3435 self._downloader.report_error(u'unable to extract video title')
3437 video_title = result.group(1)
3439 result = re.search(self.VIDEO_THUMB_RE, webpage)
3441 self._downloader.report_error(u'unable to extract video thumbnail')
3443 video_thumbnail = result.group(1)
3449 'upload_date': None,
3450 'title': video_title,
3452 'thumbnail': video_thumbnail,
3453 'description': None,
3457 class GooglePlusIE(InfoExtractor):
3458 """Information extractor for plus.google.com."""
3460 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3461 IE_NAME = u'plus.google'
3463 def __init__(self, downloader=None):
3464 InfoExtractor.__init__(self, downloader)
3466 def report_extract_entry(self, url):
3467 """Report downloading extry"""
3468 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3470 def report_date(self, upload_date):
3471 """Report downloading extry"""
3472 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3474 def report_uploader(self, uploader):
3475 """Report downloading extry"""
3476 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3478 def report_title(self, video_title):
3479 """Report downloading extry"""
3480 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3482 def report_extract_vid_page(self, video_page):
3483 """Report information extraction."""
3484 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
# Two-step extraction: scrape the post page for metadata and the photo/video
# page URL, then scrape that page for the actual googlevideo redirector links.
3486 def _real_extract(self, url):
3487 # Extract id from URL
3488 mobj = re.match(self._VALID_URL, url)
3490 self._downloader.report_error(u'Invalid URL: %s' % url)
3493 post_url = mobj.group(0)
3494 video_id = mobj.group(1)
3496 video_extension = 'flv'
3498 # Step 1, Retrieve post webpage to extract further information
3499 self.report_extract_entry(post_url)
3500 request = compat_urllib_request.Request(post_url)
3502 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3503 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3504 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3507 # Extract update date
3509 pattern = 'title="Timestamp">(.*?)</a>'
3510 mobj = re.search(pattern, webpage)
3512 upload_date = mobj.group(1)
3513 # Convert timestring to a format suitable for filename
3514 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3515 upload_date = upload_date.strftime('%Y%m%d')
3516 self.report_date(upload_date)
# Extract uploader from the rel="author" anchor.
3520 pattern = r'rel\="author".*?>(.*?)</a>'
3521 mobj = re.search(pattern, webpage)
3523 uploader = mobj.group(1)
3524 self.report_uploader(uploader)
3527 # Get the first line for title
3529 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3530 mobj = re.search(pattern, webpage)
3532 video_title = mobj.group(1)
3533 self.report_title(video_title)
3535 # Step 2, Stimulate clicking the image box to launch video
3536 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3537 mobj = re.search(pattern, webpage)
3539 self._downloader.report_error(u'unable to extract video page URL')
3541 video_page = mobj.group(1)
3542 request = compat_urllib_request.Request(video_page)
3544 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3545 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3546 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3548 self.report_extract_vid_page(video_page)
3551 # Extract video links on video page
3552 """Extract video links of all sizes"""
3553 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3554 mobj = re.findall(pattern, webpage)
3556 self._downloader.report_error(u'unable to extract video links')
3558 # Sort in resolution
3559 links = sorted(mobj)
3561 # Choose the lowest of the sort, i.e. highest resolution
3562 video_url = links[-1]
3563 # Only get the url. The resolution part in the tuple has no use anymore
3564 video_url = video_url[-1]
3565 # Treat escaped \u0026 style hex
3567 video_url = video_url.decode("unicode_escape")
3568 except AttributeError: # Python 3
3569 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3575 'uploader': uploader,
3576 'upload_date': upload_date,
3577 'title': video_title,
3578 'ext': video_extension,
3581 class NBAIE(InfoExtractor):
3582 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
# The CDN URL is derived directly from the page path; the webpage is only used
# for metadata scraping via the _findProp closure below.
3585 def _real_extract(self, url):
3586 mobj = re.match(self._VALID_URL, url)
3588 self._downloader.report_error(u'invalid URL: %s' % url)
3591 video_id = mobj.group(1)
3592 if video_id.endswith('/index.html'):
3593 video_id = video_id[:-len('/index.html')]
3595 webpage = self._download_webpage(url, video_id)
3597 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, unescaped, or the given default.
3598 def _findProp(rexp, default=None):
3599 m = re.search(rexp, webpage)
3601 return unescapeHTML(m.group(1))
3605 shortened_video_id = video_id.rpartition('/')[2]
3606 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3608 'id': shortened_video_id,
# NOTE(review): 'uploader_date' is not a documented info-dict field — this
# looks like a typo for 'upload_date'; confirm before changing.
3612 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3613 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3617 class JustinTVIE(InfoExtractor):
3618 """Information extractor for justin.tv and twitch.tv"""
3619 # TODO: One broadcast may be split into multiple videos. The key
3620 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3621 # starts at 1 and increases. Can we treat all parts as one video?
3623 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3624 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3625 _JUSTIN_PAGE_LIMIT = 100
3626 IE_NAME = u'justin.tv'
3628 def report_extraction(self, file_id):
3629 """Report information extraction."""
3630 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3632 def report_download_page(self, channel, offset):
3633 """Report attempt to download a single page of videos."""
3634 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3635 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3637 # Return count of items, list of *valid* items
3638 def _parse_page(self, url):
3640 urlh = compat_urllib_request.urlopen(url)
3641 webpage_bytes = urlh.read()
3642 webpage = webpage_bytes.decode('utf-8', 'ignore')
3643 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3644 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# A non-list response is an API error object; surface its 'error' message.
3647 response = json.loads(webpage)
3648 if type(response) != list:
3649 error_text = response.get('error', 'unknown error')
3650 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3653 for clip in response:
3654 video_url = clip['video_file_url']
3656 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is 'YYYY-MM-DD...'; strip dashes to get YYYYMMDD.
3657 video_date = re.sub('-', '', clip['start_time'][:10])
3658 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3659 video_id = clip['id']
3660 video_title = clip.get('title', video_id)
3664 'title': video_title,
3665 'uploader': clip.get('channel_name', video_uploader_id),
3666 'uploader_id': video_uploader_id,
3667 'upload_date': video_date,
3668 'ext': video_extension,
3670 return (len(response), info)
3672 def _real_extract(self, url):
3673 mobj = re.match(self._VALID_URL, url)
3675 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 1 is the channel name, group 2 (if present) a single broadcast id;
# channel archives are paged through the API, single broadcasts are not.
3678 api = 'http://api.justin.tv'
3679 video_id = mobj.group(mobj.lastindex)
3681 if mobj.lastindex == 1:
3683 api += '/channel/archives/%s.json'
3685 api += '/broadcast/by_archive/%s.json'
3686 api = api % (video_id,)
3688 self.report_extraction(video_id)
3692 limit = self._JUSTIN_PAGE_LIMIT
3695 self.report_download_page(video_id, offset)
3696 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3697 page_count, page_info = self._parse_page(page_url)
3698 info.extend(page_info)
# A short page means we have reached the end of the archive.
3699 if not paged or page_count != limit:
3704 class FunnyOrDieIE(InfoExtractor):
3705 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
# Scrape the <video>/<source> tags for the media URL and the player page h1
# for the title; og:description is optional.
3707 def _real_extract(self, url):
3708 mobj = re.match(self._VALID_URL, url)
3710 self._downloader.report_error(u'invalid URL: %s' % url)
3713 video_id = mobj.group('id')
3714 webpage = self._download_webpage(url, video_id)
3716 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3718 self._downloader.report_error(u'unable to find video information')
3719 video_url = unescapeHTML(m.group('url'))
3721 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3723 self._downloader.trouble(u'Cannot find video title')
3724 title = clean_html(m.group('title'))
3726 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3728 desc = unescapeHTML(m.group('desc'))
3737 'description': desc,
3741 class SteamIE(InfoExtractor):
3742 _VALID_URL = r"""http://store.steampowered.com/
3743 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3745 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# _VALID_URL is a verbose-mode pattern, so suitable() must pass re.VERBOSE.
3749 def suitable(cls, url):
3750 """Receives a URL and returns True if suitable for this IE."""
3751 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# A game page can host several movies; walk the movie/title/thumb matches in
# lockstep and emit one info dict per video.
3753 def _real_extract(self, url):
3754 m = re.match(self._VALID_URL, url, re.VERBOSE)
3755 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3756 gameID = m.group('gameID')
3757 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3758 webpage = self._download_webpage(videourl, gameID)
3759 mweb = re.finditer(urlRE, webpage)
3760 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3761 titles = re.finditer(namesRE, webpage)
3762 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3763 thumbs = re.finditer(thumbsRE, webpage)
3765 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3766 video_id = vid.group('videoID')
3767 title = vtitle.group('videoName')
3768 video_url = vid.group('videoURL')
3769 video_thumb = thumb.group('thumbnail')
3771 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3776 'title': unescapeHTML(title),
3777 'thumbnail': video_thumb
3782 class UstreamIE(InfoExtractor):
3783 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3784 IE_NAME = u'ustream'
# The media URL is built directly from the recorded-video id; the page is only
# scraped for title and uploader (channel id) attributes.
3786 def _real_extract(self, url):
3787 m = re.match(self._VALID_URL, url)
3788 video_id = m.group('videoID')
3789 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3790 webpage = self._download_webpage(url, video_id)
3791 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3792 title = m.group('title')
3793 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3794 uploader = m.group('uploader')
3800 'uploader': uploader
3804 class WorldStarHipHopIE(InfoExtractor):
3805 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3806 IE_NAME = u'WorldStarHipHop'
# Scrape the hw-videos CDN URL (mp4 or flv) straight out of the page source,
# then pick up title and thumbnail with their own regexes.
3808 def _real_extract(self, url):
3809 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3811 webpage_src = compat_urllib_request.urlopen(url).read()
3812 webpage_src = webpage_src.decode('utf-8')
3814 mobj = re.search(_src_url, webpage_src)
3816 m = re.match(self._VALID_URL, url)
3817 video_id = m.group('id')
3819 if mobj is not None:
3820 video_url = mobj.group()
3821 if 'mp4' in video_url:
3826 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3829 _title = r"""<title>(.*)</title>"""
3831 mobj = re.search(_title, webpage_src)
3833 if mobj is not None:
3834 title = mobj.group(1)
# NOTE(review): fallback title reads 'World Start Hip Hop' — likely a typo for
# 'World Star Hip Hop' (runtime string; not changed here).
3836 title = 'World Start Hip Hop - %s' % time.ctime()
3838 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3839 mobj = re.search(_thumbnail, webpage_src)
3841 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3842 if mobj is not None:
3843 thumbnail = mobj.group(1)
3845 _title = r"""candytitles.*>(.*)</span>"""
3846 mobj = re.search(_title, webpage_src)
3847 if mobj is not None:
3848 title = mobj.group(1)
3855 'thumbnail' : thumbnail,
3860 class RBMARadioIE(InfoExtractor):
3861 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
# All metadata comes from the inline `gon.show=...` JSON blob embedded in the
# page; the stream URL is the Akamai URL with a fixed 256kbps cbr parameter.
3863 def _real_extract(self, url):
3864 m = re.match(self._VALID_URL, url)
3865 video_id = m.group('videoID')
3867 webpage = self._download_webpage(url, video_id)
3868 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3870 raise ExtractorError(u'Cannot find metadata')
3871 json_data = m.group(1)
3874 data = json.loads(json_data)
3875 except ValueError as e:
3876 raise ExtractorError(u'Invalid JSON: ' + str(e))
3878 video_url = data['akamai_url'] + '&cbr=256'
3879 url_parts = compat_urllib_parse_urlparse(video_url)
3880 video_ext = url_parts.path.rpartition('.')[2]
3885 'title': data['title'],
3886 'description': data.get('teaser_text'),
3887 'location': data.get('country_of_origin'),
3888 'uploader': data.get('host', {}).get('name'),
3889 'uploader_id': data.get('host', {}).get('slug'),
3890 'thumbnail': data.get('image', {}).get('large_url_2x'),
3891 'duration': data.get('duration'),
3896 class YouPornIE(InfoExtractor):
3897 """Information extractor for youporn.com."""
3898 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3900 def _print_formats(self, formats):
3901 """Print all available formats"""
3902 print(u'Available formats:')
3903 print(u'ext\t\tformat')
3904 print(u'---------------------------------')
3905 for format in formats:
3906 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single format dict matching the requested format string.
3908 def _specific(self, req_format, formats):
3910 if(x["format"]==req_format):
3914 def _real_extract(self, url):
3915 mobj = re.match(self._VALID_URL, url)
3917 self._downloader.report_error(u'invalid URL: %s' % url)
3920 video_id = mobj.group('videoid')
# Age-gated site: the cookie pre-confirms age so the real page is served.
3922 req = compat_urllib_request.Request(url)
3923 req.add_header('Cookie', 'age_verified=1')
3924 webpage = self._download_webpage(req, video_id)
3926 # Get the video title
3927 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3929 raise ExtractorError(u'Unable to extract video title')
3930 video_title = result.group('title').strip()
3932 # Get the video date
3933 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3935 self._downloader.report_warning(u'unable to extract video date')
3938 upload_date = result.group('date').strip()
3940 # Get the video uploader
3941 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3943 self._downloader.report_warning(u'unable to extract uploader')
3944 video_uploader = None
3946 video_uploader = result.group('uploader').strip()
3947 video_uploader = clean_html( video_uploader )
3949 # Get all of the formats available
3950 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3951 result = re.search(DOWNLOAD_LIST_RE, webpage)
3953 raise ExtractorError(u'Unable to extract download list')
3954 download_list_html = result.group('download_list').strip()
3956 # Get all of the links from the page
3957 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3958 links = re.findall(LINK_RE, download_list_html)
3959 if(len(links) == 0):
3960 raise ExtractorError(u'ERROR: no known formats available for video')
3962 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3967 # A link looks like this:
3968 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3969 # A path looks like this:
3970 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3971 video_url = unescapeHTML( link )
3972 path = compat_urllib_parse_urlparse( video_url ).path
3973 extension = os.path.splitext( path )[1][1:]
# Path segment 4 starts with '<size>_<bitrate>', e.g. '480p_370k'.
3974 format = path.split('/')[4].split('_')[:2]
3977 format = "-".join( format )
3978 title = u'%s-%s-%s' % (video_title, size, bitrate)
3983 'uploader': video_uploader,
3984 'upload_date': upload_date,
3989 'description': None,
3993 if self._downloader.params.get('listformats', None):
3994 self._print_formats(formats)
3997 req_format = self._downloader.params.get('format', None)
3998 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Formats appear ordered best-first here: 'best' is the head, 'worst' the tail.
4000 if req_format is None or req_format == 'best':
4002 elif req_format == 'worst':
4003 return [formats[-1]]
4004 elif req_format in ('-1', 'all'):
4007 format = self._specific( req_format, formats )
4009 self._downloader.report_error(u'requested format not available')
4015 class PornotubeIE(InfoExtractor):
4016 """Information extractor for pornotube.com."""
4017 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
# Both id and title come from the URL itself; the page is only scraped for the
# FLV URL and the upload date.
4019 def _real_extract(self, url):
4020 mobj = re.match(self._VALID_URL, url)
4022 self._downloader.report_error(u'invalid URL: %s' % url)
4025 video_id = mobj.group('videoid')
4026 video_title = mobj.group('title')
4028 # Get webpage content
4029 webpage = self._download_webpage(url, video_id)
4032 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4033 result = re.search(VIDEO_URL_RE, webpage)
4035 self._downloader.report_error(u'unable to extract video url')
4037 video_url = compat_urllib_parse.unquote(result.group('url'))
4039 #Get the uploaded date
4040 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4041 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error message says 'video title' but this search is for the
# upload date (runtime string; not changed here).
4043 self._downloader.report_error(u'unable to extract video title')
4045 upload_date = result.group('date')
4047 info = {'id': video_id,
4050 'upload_date': upload_date,
4051 'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com.

    Resolves the watch page to its embed page, then pulls the flv URL
    out of the embed page's flashvars.
    """
    # BUGFIX: '.' before 'html' was unescaped and matched any character.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page.  Host dots escaped so only youjizz.com matches.
        result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the SWFObject flashvars.
        result = re.search(r'so\.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Drives the site's play/next JSON API one track at a time and returns
    one info dict per song in the mix.
    """
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON-like JS literal.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # Any random session id is accepted by the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number + 1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                # presumably AAC audio streams; extension kept as in original
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            track_number += 1
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The CDN video/thumbnail URLs are derived from the id alone.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        if m is None:
            # BUGFIX: previously dereferenced a failed search, raising a bare
            # AttributeError on page-layout changes instead of a clear error.
            raise ExtractorError(u'Unable to extract video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = clean_html(m.group('uploader'))

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
# TED talk / playlist extractor.  Dispatches on URL type: single talks go
# through _talk_info(), playlists are scraped for talk links and each link
# is resolved via _talk_info().
# NOTE(review): this listing is a sampled excerpt — several interior lines
# (regex alternation glue, dict literals, returns) are absent from view.
4171 class TEDIE(InfoExtractor):
# _VALID_URL is a re.VERBOSE pattern (see suitable() below), hence the
# inline "#" comments inside the raw string.
4172 _VALID_URL=r'''http://www.ted.com/
4174 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4176 ((?P<type_talk>talks)) # We have a simple talk
4178 /(?P<name>\w+) # Here goes the name and then ".html"
# Overrides the base suitable(): the pattern requires re.VERBOSE, which the
# default implementation does not pass.
4182 def suitable(cls, url):
4183 """Receives a URL and returns True if suitable for this IE."""
4184 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Entry point: single talk -> one-element list; playlist -> delegate.
4186 def _real_extract(self, url):
4187 m=re.match(self._VALID_URL, url, re.VERBOSE)
4188 if m.group('type_talk'):
4189 return [self._talk_info(url)]
4191 playlist_id=m.group('playlist_id')
4192 name=m.group('name')
4193 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4194 return self._playlist_videos_info(url,name,playlist_id)
# Maps a mediaSlug to its direct mp4 download URL.
4196 def _talk_video_link(self,mediaSlug):
4197 '''Returns the video link for that mediaSlug'''
4198 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
# Scrapes the playlist page: pairs each <li id="talk_..."> block with the
# matching talk-title anchor, then resolves every talk URL individually.
4200 def _playlist_videos_info(self,url,name,playlist_id=0):
4201 '''Returns the videos of the playlist'''
4203 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4204 ([.\s]*?)data-playlist_item_id="(\d+)"
4205 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4207 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4208 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4209 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4210 m_names=re.finditer(video_name_RE,webpage)
# zip() silently truncates to the shorter stream if the two regexes find a
# different number of matches — mismatched talks are dropped, not reported.
4212 for m_video, m_name in zip(m_videos,m_names):
4213 video_id=m_video.group('video_id')
4214 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4215 info.append(self._talk_info(talk_url,video_id))
# Extracts one talk: title from the page header, id/mediaSlug from the
# embedded talkDetails JS object, URL derived via _talk_video_link().
4218 def _talk_info(self, url, video_id=0):
4219 """Return the video for the talk in the url"""
4220 m=re.match(self._VALID_URL, url,re.VERBOSE)
4221 videoName=m.group('name')
4222 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4223 # If the url includes the language we get the title translated
4224 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
# No None-check: a missing altHeadline span raises AttributeError here.
4225 title=re.search(title_RE, webpage).group('title')
4226 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4227 "id":(?P<videoID>[\d]+).*?
4228 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4229 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4230 thumb_match=re.search(thumb_RE,webpage)
4231 info_match=re.search(info_RE,webpage,re.VERBOSE)
4232 video_id=info_match.group('videoID')
4233 mediaSlug=info_match.group('mediaSlug')
4234 video_url=self._talk_video_link(mediaSlug)
4240 'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is the last (or, with a trailing slash, second-to-last)
    URL path component; everything else comes from the site's XML
    metadata endpoint.
    """
    # BUGFIX: host dots were unescaped and matched any character.
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # no format id in the metadata: fall back to the file extension
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos.

    Scrapes the title from the watch page, then reads the per-video XML
    manifest on video2.spiegel.de for the actual file name and duration.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('videoID')

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The XML manifest lives next to the flash assets, keyed by video id.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last format entry in the document — presumably the highest
        # quality variant; verify against the manifest ordering.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""
    # BUGFIX: the scheme was written '(?:http?://)?' which makes the final
    # 'p' optional (matching 'htp://') instead of the 's' — corrected to
    # the intended 'https?'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # consistency: use report_error like the sibling extractors
            # instead of the deprecated trouble() call.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # description and uploader are best-effort: missing tags yield None
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4383 def gen_extractors():
4384 """ Return a list of an instance of every supported extractor.
4385 The order does matter; the first extractor matched is the one handling the URL.
4388 YoutubePlaylistIE(),
4413 StanfordOpenClassroomIE(),
4423 WorldStarHipHopIE(),