2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # NOTE(review): takes ``cls`` — presumably decorated as a classmethod
    # upstream; the decorator is not visible in this excerpt. Confirm.
    match = re.match(cls._VALID_URL, url)
    return match is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # Delegate to the subclass-provided extraction routine.
    result = self._real_extract(url)
    return result
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Stored so helpers (to_screen, _download_webpage, ...) can reach it.
    self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    # Printed before posting the age-verification form.
    self.to_screen(u'Confirming age')
# Helper methods for issue #608.
# They set the correct value of the '_type' key in the info dictionary.
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 class SearchInfoExtractor(InfoExtractor):
196 Base class for paged search queries extractors.
197 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
198 Instances should define _SEARCH_KEY and _MAX_RESULTS.
202 def _make_valid_url(cls):
203 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Return True when *url* matches this extractor's search scheme."""
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
209 def _real_extract(self, query):
210 mobj = re.match(self._make_valid_url(), query)
212 raise ExtractorError(u'Invalid search query "%s"' % query)
214 prefix = mobj.group('prefix')
215 query = mobj.group('query')
217 return self._get_n_results(query, 1)
218 elif prefix == 'all':
219 return self._get_n_results(query, self._MAX_RESULTS)
223 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
224 elif n > self._MAX_RESULTS:
225 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
226 n = self._MAX_RESULTS
227 return self._get_n_results(query, n)
229 def _get_n_results(self, query, n):
230 """Get a specified number of results for a query"""
231 raise NotImplementedError("This method must be implemented by sublclasses")
234 class YoutubeIE(InfoExtractor):
235 """Information extractor for youtube.com."""
239 (?:https?://)? # http(s):// (optional)
240 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
241 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
242 (?:.*?\#/)? # handle anchor (#/) redirect urls
243 (?: # the various things that can precede the ID:
244 (?:(?:v|embed|e)/) # v/ or embed/ or e/
245 |(?: # or the v= param in all its forms
246 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
247 (?:\?|\#!?) # the params delimiter ? or # or #!
248 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
251 )? # optional -> youtube.com/xxxx is OK
252 )? # all until now is optional -> you can pass the naked ID
253 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
254 (?(1).+)? # if we found the ID, everything can follow
256 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
257 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
258 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
259 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
260 _NETRC_MACHINE = 'youtube'
261 # Listed in order of quality
262 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
263 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
264 _video_extensions = {
270 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
276 _video_dimensions = {
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs would also match the video regexp, so let the playlist
    # IE claim them first.
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Report attempt to set language."""
    message = u'Setting language'
    self.to_screen(message)
def report_login(self):
    """Report attempt to log in."""
    message = u'Logging in'
    self.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report that available subtitles are being checked."""
    # Docstring fixed: it previously duplicated the "download video info
    # webpage" text from a neighbouring method.
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report a subtitle download request for the given language/format."""
    # Docstring fixed: it previously duplicated the "download video info
    # webpage" text from a neighbouring method.
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report available subtitles."""
    # sub_lang_list maps language codes to names; only the codes are shown.
    langs = ",".join(list(sub_lang_list.keys()))
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, langs))
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available."""
    # Docstring fixed: it previously read "Report extracted video URL.",
    # which describes a different method.
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
341 def _get_available_subtitles(self, video_id):
342 self.report_video_subtitles_download(video_id)
343 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
345 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
346 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
347 return (u'unable to download video subtitles: %s' % compat_str(err), None)
348 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
349 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
350 if not sub_lang_list:
351 return (u'video doesn\'t have subtitles', None)
354 def _list_available_subtitles(self, video_id):
355 sub_lang_list = self._get_available_subtitles(video_id)
356 self.report_video_subtitles_available(video_id, sub_lang_list)
358 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
361 (error_message, sub_lang, sub)
363 self.report_video_subtitles_request(video_id, sub_lang, format)
364 params = compat_urllib_parse.urlencode({
370 url = 'http://www.youtube.com/api/timedtext?' + params
372 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
373 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
376 return (u'Did not fetch video subtitles', None, None)
377 return (None, sub_lang, sub)
379 def _request_automatic_caption(self, video_id, webpage):
380 """We need the webpage for getting the captions url, pass it as an
381 argument to speed up the process."""
382 sub_lang = self._downloader.params.get('subtitleslang')
383 sub_format = self._downloader.params.get('subtitlesformat')
384 self.to_screen(u'%s: Looking for automatic captions' % video_id)
385 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
386 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
388 return [(err_msg, None, None)]
389 player_config = json.loads(mobj.group(1))
391 args = player_config[u'args']
392 caption_url = args[u'ttsurl']
393 timestamp = args[u'timestamp']
394 params = compat_urllib_parse.urlencode({
401 subtitles_url = caption_url + '&' + params
402 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
403 return [(None, sub_lang, sub)]
405 return [(err_msg, None, None)]
407 def _extract_subtitle(self, video_id):
409 Return a list with a tuple:
410 [(error_message, sub_lang, sub)]
412 sub_lang_list = self._get_available_subtitles(video_id)
413 sub_format = self._downloader.params.get('subtitlesformat')
414 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
415 return [(sub_lang_list[0], None, None)]
416 if self._downloader.params.get('subtitleslang', False):
417 sub_lang = self._downloader.params.get('subtitleslang')
418 elif 'en' in sub_lang_list:
421 sub_lang = list(sub_lang_list.keys())[0]
422 if not sub_lang in sub_lang_list:
423 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
425 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
428 def _extract_all_subtitles(self, video_id):
429 sub_lang_list = self._get_available_subtitles(video_id)
430 sub_format = self._downloader.params.get('subtitlesformat')
431 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
432 return [(sub_lang_list[0], None, None)]
434 for sub_lang in sub_lang_list:
435 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
436 subtitles.append(subtitle)
439 def _print_formats(self, formats):
440 print('Available formats:')
442 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
444 def _real_initialize(self):
445 if self._downloader is None:
450 downloader_params = self._downloader.params
452 # Attempt to use provided username and password or .netrc data
453 if downloader_params.get('username', None) is not None:
454 username = downloader_params['username']
455 password = downloader_params['password']
456 elif downloader_params.get('usenetrc', False):
458 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
463 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
464 except (IOError, netrc.NetrcParseError) as err:
465 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
469 request = compat_urllib_request.Request(self._LANG_URL)
472 compat_urllib_request.urlopen(request).read()
473 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
474 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
477 # No authentication to be performed
481 request = compat_urllib_request.Request(self._LOGIN_URL)
483 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
484 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
485 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
490 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
492 galx = match.group(1)
494 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
500 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
504 u'PersistentCookie': u'yes',
506 u'bgresponse': u'js_disabled',
507 u'checkConnection': u'',
508 u'checkedDomains': u'youtube',
514 u'signIn': u'Sign in',
516 u'service': u'youtube',
520 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
522 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
523 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
524 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
527 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
528 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
529 self._downloader.report_warning(u'unable to log in: bad username or password')
531 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
532 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
538 'action_confirm': 'Confirm',
540 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
542 self.report_age_confirmation()
543 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
544 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
545 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
547 def _extract_id(self, url):
548 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
550 raise ExtractorError(u'Invalid URL: %s' % url)
551 video_id = mobj.group(2)
554 def _real_extract(self, url):
555 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
556 mobj = re.search(self._NEXT_URL_RE, url)
558 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
559 video_id = self._extract_id(url)
562 self.report_video_webpage_download(video_id)
563 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
564 request = compat_urllib_request.Request(url)
566 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
567 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
568 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
570 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
572 # Attempt to extract SWF player URL
573 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
575 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
580 self.report_video_info_webpage_download(video_id)
581 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
582 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
583 % (video_id, el_type))
584 video_info_webpage = self._download_webpage(video_info_url, video_id,
586 errnote='unable to download video info webpage')
587 video_info = compat_parse_qs(video_info_webpage)
588 if 'token' in video_info:
590 if 'token' not in video_info:
591 if 'reason' in video_info:
592 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
594 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
596 # Check for "rental" videos
597 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
598 raise ExtractorError(u'"rental" videos not supported')
600 # Start extracting information
601 self.report_information_extraction(video_id)
604 if 'author' not in video_info:
605 raise ExtractorError(u'Unable to extract uploader name')
606 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
609 video_uploader_id = None
610 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
612 video_uploader_id = mobj.group(1)
614 self._downloader.report_warning(u'unable to extract uploader nickname')
617 if 'title' not in video_info:
618 raise ExtractorError(u'Unable to extract video title')
619 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
622 if 'thumbnail_url' not in video_info:
623 self._downloader.report_warning(u'unable to extract video thumbnail')
625 else: # don't panic if we can't find it
626 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
630 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
632 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
633 upload_date = unified_strdate(upload_date)
636 video_description = get_element_by_id("eow-description", video_webpage)
637 if video_description:
638 video_description = clean_html(video_description)
640 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
642 video_description = unescapeHTML(fd_mobj.group(1))
644 video_description = u''
647 video_subtitles = None
649 if self._downloader.params.get('writesubtitles', False):
650 video_subtitles = self._extract_subtitle(video_id)
652 (sub_error, sub_lang, sub) = video_subtitles[0]
654 # We try with the automatic captions
655 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
656 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
660 # We report the original error
661 self._downloader.report_error(sub_error)
663 if self._downloader.params.get('allsubtitles', False):
664 video_subtitles = self._extract_all_subtitles(video_id)
665 for video_subtitle in video_subtitles:
666 (sub_error, sub_lang, sub) = video_subtitle
668 self._downloader.report_error(sub_error)
670 if self._downloader.params.get('listsubtitles', False):
671 sub_lang_list = self._list_available_subtitles(video_id)
674 if 'length_seconds' not in video_info:
675 self._downloader.report_warning(u'unable to extract video duration')
678 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
681 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
683 # Decide which formats to download
684 req_format = self._downloader.params.get('format', None)
686 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
687 self.report_rtmp_download()
688 video_url_list = [(None, video_info['conn'][0])]
689 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
691 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
692 url_data = compat_parse_qs(url_data_str)
693 if 'itag' in url_data and 'url' in url_data:
694 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
695 if not 'ratebypass' in url: url += '&ratebypass=yes'
696 url_map[url_data['itag'][0]] = url
698 format_limit = self._downloader.params.get('format_limit', None)
699 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
700 if format_limit is not None and format_limit in available_formats:
701 format_list = available_formats[available_formats.index(format_limit):]
703 format_list = available_formats
704 existing_formats = [x for x in format_list if x in url_map]
705 if len(existing_formats) == 0:
706 raise ExtractorError(u'no known formats available for video')
707 if self._downloader.params.get('listformats', None):
708 self._print_formats(existing_formats)
710 if req_format is None or req_format == 'best':
711 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
712 elif req_format == 'worst':
713 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
714 elif req_format in ('-1', 'all'):
715 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
717 # Specific formats. We pick the first in a slash-delimeted sequence.
718 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
719 req_formats = req_format.split('/')
720 video_url_list = None
721 for rf in req_formats:
723 video_url_list = [(rf, url_map[rf])]
725 if video_url_list is None:
726 raise ExtractorError(u'requested format not available')
728 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
731 for format_param, video_real_url in video_url_list:
733 video_extension = self._video_extensions.get(format_param, 'flv')
735 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
736 self._video_dimensions.get(format_param, '???'))
740 'url': video_real_url,
741 'uploader': video_uploader,
742 'uploader_id': video_uploader_id,
743 'upload_date': upload_date,
744 'title': video_title,
745 'ext': video_extension,
746 'format': video_format,
747 'thumbnail': video_thumbnail,
748 'description': video_description,
749 'player_url': player_url,
750 'subtitles': video_subtitles,
751 'duration': video_duration
756 class MetacafeIE(InfoExtractor):
757 """Information Extractor for metacafe.com."""
759 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
760 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
761 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
762 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Report disclaimer retrieval."""
    message = u'Retrieving disclaimer'
    self.to_screen(message)
768 def _real_initialize(self):
769 # Retrieve disclaimer
770 request = compat_urllib_request.Request(self._DISCLAIMER)
772 self.report_disclaimer()
773 disclaimer = compat_urllib_request.urlopen(request).read()
774 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
775 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
780 'submit': "Continue - I'm over 18",
782 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
784 self.report_age_confirmation()
785 disclaimer = compat_urllib_request.urlopen(request).read()
786 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
787 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
789 def _real_extract(self, url):
790 # Extract id and simplified title from URL
791 mobj = re.match(self._VALID_URL, url)
793 raise ExtractorError(u'Invalid URL: %s' % url)
795 video_id = mobj.group(1)
797 # Check if video comes from YouTube
798 mobj2 = re.match(r'^yt-(.*)$', video_id)
799 if mobj2 is not None:
800 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
802 # Retrieve video webpage to extract further information
803 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
805 # Extract URL, uploader and title from webpage
806 self.report_extraction(video_id)
807 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
809 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
810 video_extension = mediaURL[-3:]
812 # Extract gdaKey if available
813 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
817 gdaKey = mobj.group(1)
818 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
820 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
822 raise ExtractorError(u'Unable to extract media URL')
823 vardict = compat_parse_qs(mobj.group(1))
824 if 'mediaData' not in vardict:
825 raise ExtractorError(u'Unable to extract media URL')
826 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
828 raise ExtractorError(u'Unable to extract media URL')
829 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
830 video_extension = mediaURL[-3:]
831 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
833 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
835 raise ExtractorError(u'Unable to extract title')
836 video_title = mobj.group(1).decode('utf-8')
838 mobj = re.search(r'submitter=(.*?);', webpage)
840 raise ExtractorError(u'Unable to extract uploader nickname')
841 video_uploader = mobj.group(1)
844 'id': video_id.decode('utf-8'),
845 'url': video_url.decode('utf-8'),
846 'uploader': video_uploader.decode('utf-8'),
848 'title': video_title,
849 'ext': video_extension.decode('utf-8'),
852 class DailymotionIE(InfoExtractor):
853 """Information Extractor for Dailymotion"""
855 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
856 IE_NAME = u'dailymotion'
858 def _real_extract(self, url):
859 # Extract id and simplified title from URL
860 mobj = re.match(self._VALID_URL, url)
862 raise ExtractorError(u'Invalid URL: %s' % url)
864 video_id = mobj.group(1).split('_')[0].split('?')[0]
866 video_extension = 'mp4'
868 # Retrieve video webpage to extract further information
869 request = compat_urllib_request.Request(url)
870 request.add_header('Cookie', 'family_filter=off')
871 webpage = self._download_webpage(request, video_id)
873 # Extract URL, uploader and title from webpage
874 self.report_extraction(video_id)
875 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
877 raise ExtractorError(u'Unable to extract media URL')
878 flashvars = compat_urllib_parse.unquote(mobj.group(1))
880 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
883 self.to_screen(u'Using %s' % key)
886 raise ExtractorError(u'Unable to extract video URL')
888 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
890 raise ExtractorError(u'Unable to extract video URL')
892 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
894 # TODO: support choosing qualities
896 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
898 raise ExtractorError(u'Unable to extract title')
899 video_title = unescapeHTML(mobj.group('title'))
901 video_uploader = None
902 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
904 # lookin for official user
905 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
906 if mobj_official is None:
907 self._downloader.report_warning(u'unable to extract uploader nickname')
909 video_uploader = mobj_official.group(1)
911 video_uploader = mobj.group(1)
913 video_upload_date = None
914 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
916 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
921 'uploader': video_uploader,
922 'upload_date': video_upload_date,
923 'title': video_title,
924 'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Extracts a direct .flv/.mp4 media URL plus uploader/title metadata,
    first from the page's embedded ``Pb.Data.Shared`` JSON and, failing
    that, from the raw HTML.

    NOTE(review): this excerpt is missing several interior lines
    (``if ... is None`` guards, ``return`` statements); the gaps are
    flagged inline below and must be restored from the full file.
    """

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Return the information dictionary for the media at *url*."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): guard line (presumably `if mobj is None:`) missing
        # in this excerpt — the raise below should be conditional.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        # NOTE(review): the `if mobj is not None:` guard and the
        # `return [{` wrapper for the dict entries below are missing here.
        info = json.loads(mobj.group('json'))
        'url': info[u'downloadUrl'],
        'uploader': info[u'username'],
        # creationDate is a unix timestamp; rendered as YYYYMMDD per the
        # upload_date convention documented on InfoExtractor.
        'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
        'title': info[u'title'],
        'ext': video_extension,
        'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Unable to extract title')
        # .decode('utf-8') — this code path assumes Python 2 byte strings.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): `return [{` wrapper missing for the entries below.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    Queries Yahoo's cosmos.bcst.yahoo.com REST service (or, when the page
    declares a ``CONTENT_ID``, the public YQL endpoint) for stream metadata.

    NOTE(review): several interior lines (guards, the branch header that
    separates the two code paths, the final return) are missing from this
    excerpt and flagged inline.
    """
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Return the information dictionary for the video at *url*."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # Pages may override the numeric page id with a separate CONTENT_ID.
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
        # NOTE(review): the `if m_id is None:` branch header is missing —
        # the block below is the "no CONTENT_ID" path.
        # TODO: Check which url parameters are required
        info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
        webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
        # Verbose regex over the MRSS response; NOTE(review): the closing
        # ''' of this raw string is missing from this excerpt.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
<description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
<media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
<media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
        self.report_extraction(video_id)
        m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
        # NOTE(review): `if m_info is None:` guard missing before this raise.
        raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')
        video_date = m_info.group('date')
        # Normalise MM/DD/YYYY to the YYYYMMDD upload_date convention.
        video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

        # TODO: Find a way to get mp4 videos
        rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
        webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
        m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
        video_url = m_rest.group('url')
        video_path = m_rest.group('path')
        # NOTE(review): the `if m_rest is None:` guard for this raise is
        # missing; as visible, the raise would be unreachable/misplaced.
        raise ExtractorError(u'Unable to extract video url')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The YQL endpoint answers with a JSONP wrapper; strip it first.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            # NOTE(review): the `meta = res[...]` assignment is missing
            # from this excerpt — `meta` is used undefined below.
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        # NOTE(review): `return [{` wrapper (with 'id'/'url' entries)
        # missing for the dict entries below.
        'play_path': video_path,
        'title':video_title,
        'description': video_description,
        'thumbnail': video_thumb,
        'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the inline player config JSON out of the video page, then
    chooses a codec/quality pair and builds the play_redirect URL.

    NOTE(review): a few interior lines (a guard, the try/except around
    the config parse, an ``else:``, a ``break``, the return wrapper) are
    missing from this excerpt and flagged inline.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        """Return the information dictionary for the Vimeo video at *url*."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalise scheme-less URLs; player/pro links are canonicalised
        # to the plain vimeo.com watch page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): the `try:` header for this parse is missing here.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # NOTE(review): the `except:` header for the two statements below
        # is missing — they are the failure path of the parse above.
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')

        raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): `else:` header missing before this append.
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket, in preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): the `break` for the loop above and the `else:`
        # guarding this raise are missing from this excerpt.
        raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): `return [{` wrapper (with 'id'/'url' entries)
        # missing for the dict entries below.
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both the live-stream pages (URL ending in index-NN.html) and
    the "Plus 7" catch-up pages, chaining several regex lookups across
    intermediate pages via grep_webpage().

    NOTE(review): this excerpt is missing interior lines in most methods
    (try headers, dict initialisers, returns); flagged inline.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, wrapping network errors."""
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header missing before the download below.
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # NOTE(review): `return webpage` missing from this excerpt.

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from
        *matchTuples* — a list of (group_index, key, error_message).
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): `info = {}` initialiser and the `if mobj is None:`
        # guard are missing from this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # NOTE(review): `else:` header missing before this assignment.
            info[key] = mobj.group(i)
        # NOTE(review): `return info` missing from this excerpt.

    def extractLiveStream(self, url):
        """Resolve an rtmp live-stream URL from a live index page."""
        # Language code is encoded in the URL path (…/<lang>/…).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # NOTE(review): the `url,` argument line is missing here.
            r'src="(.*?/videothek_js.*?\.js)',
            # NOTE(review): the flags argument and list bracket lines are
            # missing around this match tuple.
            (1, 'url', u'Invalid URL: %s' % url)
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # NOTE(review): the `next_url,` argument line is missing here.
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            # NOTE(review): one regex fragment line and the flags/list
            # lines are missing around the tuples below.
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve the best-quality stream info for a "Plus 7" page."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # NOTE(review): the `url,` argument line is missing here.
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            # NOTE(review): the `next_url,` argument line is missing here.
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # NOTE(review): the `next_url,` argument line is missing here.
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)
        )

        # NOTE(review): `return {` wrapper missing for the entries below.
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': unified_strdate(info.get('date')),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus-7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # NOTE(review): the `return` after the live branch is missing.
        # NOTE(review): `else:` header missing before this call.
        info = self.extractPlus7Stream(url)
        # NOTE(review): final `return [info]` missing from this excerpt.
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First follows URL-shortener style redirects (via a HEAD request with
    GET fallback), then scrapes the page for common embedded-player
    patterns (JW Player flashvars, file=/source= parameters).

    NOTE(review): some interior lines (`if mobj is None:` guards, a
    redirect-loop check, the return wrapper) are missing from this
    excerpt and flagged inline.
    """

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we fell back to the generic extractor — except in
        # test mode, where the warning is suppressed.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Force the HTTP method to HEAD so no body is downloaded.
            def get_method(self):
                # NOTE(review): `return "HEAD"` missing from this excerpt.

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # NOTE(review): `headers=newheaders,` line missing here.
                                       origin_req_host=req.get_origin_req_host(),
                                       # NOTE(review): `unverifiable=True)` closing line missing here.
                # NOTE(review): `else:` header missing before this raise.
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): fp.read()/fp.close() lines missing here.
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue the request as a plain GET with cleaned headers.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      # NOTE(review): `headers=newheaders,` line missing here.
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      # NOTE(review): `unverifiable=True))` closing line missing here.

        # Build a bare opener with only the handlers we need.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # NOTE(review): the `if url == new_url: return False` check is
        # missing from this excerpt.
        self.report_following_redirect(new_url)
        # NOTE(review): `return new_url` missing from this excerpt.

    def _real_extract(self, url):
        """Scrape *url* for a directly-downloadable media file."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # NOTE(review): `try:` header missing before the download below.
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): `if mobj is None:` guards are missing before each
        # of the fallback searches and the raise below.
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        # NOTE(review): `return [{` wrapper (with 'id'/'url' entries)
        # missing for the dict entries below.
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries.

    Pages through the GData v2 JSON-C search API, 50 results at a time,
    until the requested count (or the API's totalItems) is reached.
    """
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): .decode() implies *query* is a byte string here
        # (Python 2 era code) — confirm against callers.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the initialisers (video_ids = [], pagenum = 0,
        # limit = n) are missing from this excerpt.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): `try:` header missing before the urlopen below.
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exists.
            limit = min(n, api_response['totalItems'])
            # NOTE(review): `pagenum += 1` missing from this excerpt.

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries.

    Scrapes the HTML result pages (tbm=vid) 10 results at a time and
    stops when *n* results are collected or no "next" link remains.
    """
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the surrounding `res = {` literal (with 'id',
        # 'title' and 'entries' keys) is missing around the line below.
        '_type': 'playlist',

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # NOTE(review): the `e = {` literal (with '_type': 'url')
                # is missing around the line below.
                'url': mobj.group(1)

                res['entries'].append(e)

            # Stop once we have enough results or Google shows no next page.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # NOTE(review): `return res` missing from this excerpt.
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Pages through the JSON search endpoint 30 results at a time and
    delegates each hit to the YahooIE extractor via url_result().
    """
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the surrounding `res = {` literal (with 'id',
        # 'title' and 'entries' keys) is missing around the line below.
        '_type': 'playlist',

        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # NOTE(review): the `m = info[u'm']` assignment appears to be
            # missing — `m` is used undefined in the loop-exit test below.
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    # NOTE(review): `break` missing from this excerpt.
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                # NOTE(review): `break` and the final `return res` are
                # missing from this excerpt.
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Accepts playlist/course/artist/watch URLs as well as bare PL/EC/UU
    ids, then pages through the GData JSON feed and returns the entries
    sorted by their playlist position.

    NOTE(review): the closing triple-quote of _VALID_URL and several
    interior lines of _real_extract are missing from this excerpt.
    """
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the base-class
        # suitable() (plain re.match) would not work here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return the playlist result for *url*."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # NOTE(review): initialisers (page_num = 1, videos = []) and the
        # `while True:` header are missing from this excerpt.
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

        # NOTE(review): `try:` header missing before the parse below.
        response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
            # NOTE(review): the `break` for this branch is missing.

        # Keep (position, url) pairs so we can sort by playlist order.
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # NOTE(review): `break` and `page_num += 1` are missing here.

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Downloads the channel's list view page by page, then switches to the
    channel_ajax JSON endpoint for subsequent pages, collecting watch
    ids until no "load more" widget remains.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the ordered, de-duplicated video ids found in *page*."""
        # NOTE(review): `ids_in_page = []` initialiser missing here.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        # NOTE(review): `return ids_in_page` missing from this excerpt.

    def _real_extract(self, url):
        """Return a playlist of all videos uploaded to the channel."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        # NOTE(review): initialisers (video_ids = [], pagenum = 1) missing.
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            # NOTE(review): `while True:` header missing above this line.
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                # NOTE(review): `break` missing from this excerpt.

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the user's uploads feed via the GData API,
    _GDATA_PAGE_SIZE ids at a time, until a short page signals the end.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return a playlist of all videos uploaded by the user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): end of this comment plus initialisers
        # (video_ids = [], pagenum = 0) and the loop header are missing.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1

        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` initialiser missing here.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # NOTE(review): end of this comment is missing in the excerpt.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # NOTE(review): `break` and `pagenum += 1` are missing here.

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from their page, then pages through
    the mobile episode-list AJAX endpoint collecting video paths.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of all videos posted by the user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # Substitute the numeric user id into the AJAX URL template.
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): end of this comment plus initialisers
        # (video_ids = [], pagenum = 1) and the loop header are missing.
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` initialiser missing here.
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # NOTE(review): end of this comment is missing in the excerpt.
        if len(ids_in_page) < self._PAGE_SIZE:
            # NOTE(review): `break` and `pagenum += 1` are missing here.

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the "Free download" button and scrapes the real
    file URL (and any restriction message) from the resulting page.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Return the information dictionary for the file at *url*."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POSTing gateway_result=1 emulates the button press.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): `try:` header missing before the download below.
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction message.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # NOTE(review): `else:` header missing before this raise.
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Unable to extract title')
        # .decode('utf-8') — Python 2 byte-string handling.
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): `return [{` wrapper missing for the entries below.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in with credentials from --username/--password or
    .netrc, then parses the video_data JSON embedded in the SWF setup
    script on the watch page (preferring HD over SD sources).
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied; otherwise no-op."""
        if self._downloader is None:
            # NOTE(review): `return` missing from this excerpt.

        # NOTE(review): initialisers (useremail = None, password = None)
        # are missing from this excerpt.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): `try:` header missing before the netrc lookup.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): the assignments from *info* and the
                # `else:` header are missing before this raise.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                # NOTE(review): `return` missing from this excerpt.

        if useremail is None:
            # NOTE(review): `return` and the login_form construction are
            # missing between this guard and the request below.

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): `try:` header (and report_login call) missing here.
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> in the response means we are still logged out.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # NOTE(review): `return` missing from this excerpt.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # NOTE(review): `return` missing from this excerpt.

    def _real_extract(self, url):
        """Return the information dictionary for the video at *url*."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video parameters live between these two script fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # NOTE(review): `if m is None:` guard missing before this raise.
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD.
        video_url = video_data.get('hd_src')
        # NOTE(review): `if not video_url:` guard missing before this line.
        video_url = video_data['sd_src']
        # NOTE(review): `if not video_url:` guard missing before this raise.
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # NOTE(review): `if m is None:` guard missing before this raise.
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # NOTE(review): `info = {` wrapper (with 'id'/'url' entries) and
        # the final `return [info]` are missing around the lines below.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Normalises api.swf/play URLs to the canonical form, then asks the
    site for JSON metadata (skin=json) with an iTunes User-Agent;
    handles the direct-download case where the server answers with the
    media file itself.

    NOTE(review): several interior lines (try headers, the `info = None`
    initialiser, parts of the direct-download dict) are missing from
    this excerpt and flagged inline.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Return the information dictionary for the blip.tv video at *url*."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard missing before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose fragment carries the
            # real file id; recurse with the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # NOTE(review): the `cchar` selection (based on '?' in url) and
        # `info = None` initialiser are missing from this excerpt.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # NOTE(review): `try:` header missing before the urlopen below.
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # NOTE(review): the `info = {` literal ('id'/'url'/'title'
            # etc.) is only partially visible below.
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # NOTE(review): `try:` header missing before the read below.
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            # NOTE(review): `try:` header missing before the parse below.
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # NOTE(review): `else: data = json_data` branch missing here.

            # datestamp is e.g. "05-21-13 09:30AM"; normalise to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # NOTE(review): `if umobj is None:` guard missing before this raise.
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # NOTE(review): `info = {` wrapper missing for the entries below.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
        # NOTE(review): the final `return [info]` is missing from this excerpt.
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Two paths: pages with a plain <source> flv tag are extracted
    directly; otherwise the encrypted flashvars XML is fetched,
    RC4-decrypted with a key derived from a double-base64 constant plus
    the video id, and rtmp / http / hls parameters are pulled from it.

    NOTE(review): this listing is elided — guard clauses, `try:` lines
    and parts of the key derivation / return dicts are missing.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # Standard RC4 stream cipher (key scheduling + PRGA).
        # NOTE(review): initialization of x (and y) elided in this listing.
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # PRGA loop (its `for char in data:` / x-update lines are elided):
        y = (y + box[x]) % 256
        box[x], box[y] = box[y], box[x]
        out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])

    # NOTE(review): `def __md5(self, s):` header elided above this return.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided.
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Double-base64-encoded RC4 key material (GK constant; opener elided).
        b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
        b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
        b'TnpsbA0KTVRkbU1tSTRNdz09'

        # Path 1: page already exposes a direct <source> flv URL.
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            mobj = re.search('<title>([^<]+)</title>', webpage)
            # NOTE(review): `if mobj is None:` guard elided.
            raise ExtractorError(u'Unable to extract title')
            video_title = mobj.group(1)

            mobj = re.search('[.](.+?)$', video_url)
            # NOTE(review): `if mobj is None:` guard elided.
            raise ExtractorError(u'Unable to extract extention')
            video_ext = mobj.group(1)

            # NOTE(review): direct-download return dict partially elided.
            'upload_date': None,
            'title': video_title,

        # Path 2: encrypted flashvars → XML with rtmp/http/hls data.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        # NOTE(review): `if mobj is None:` guard elided.
        raise ExtractorError(u'Unable to extract video')

        # Collect flashvars into `params`; '_encxml' carries the XML URL.
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                # NOTE(review): `params[a] = b` / else branch elided.
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            # The MTV player variant is not supported; fall back to player D.
            self._downloader.report_warning(u'avoiding MTV player')
            'http://www.myvideo.de/dynamic/get_player_video_xml.php'
            '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'

        # Payload arrives hex-encoded after an '=' separator.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(double-b64-decoded GK + video id); sk assembly partly elided.
        base64.b64decode(base64.b64decode(GK)) +
        str(video_id).encode('utf-8')
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        # NOTE(review): `if mobj is None:` guard elided.
        raise ExtractorError(u'unable to extract rtmpurl')
        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_rtmpurl:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        if (video_rtmpurl is None) or (video_rtmpurl == ''):
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            # NOTE(review): `if mobj is None:` guard elided.
            raise ExtractorError(u'unable to extract url')
            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        mobj = re.search('source=\'(.*?)\'', dec_data)
        # NOTE(review): `if mobj is None:` guard elided.
        raise ExtractorError(u'unable to extract swfobj')
        video_file = compat_urllib_parse.unquote(mobj.group(1))

        if not video_file.endswith('f4m'):
            # rtmp play path is "<ext>:<path-without-ext>"
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        # NOTE(review): else-branch header and video_filepath extraction elided.
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
        # NOTE(review): `if mobj is None:` guard elided.
        raise ExtractorError(u'unable to extract swfobj')
        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
        # NOTE(review): `if mobj is None:` guard elided.
        raise ExtractorError(u'unable to extract title')
        video_title = mobj.group(1)

        # NOTE(review): return-dict opener elided; visible entries follow.
        'url': video_rtmpurl,
        'tc_url': video_rtmpurl,
        'upload_date': None,
        'title': video_title,
        'play_path': video_playpath,
        'video_file': video_file,
        'video_hls_playlist': video_hls_playlist,
        'player_url': video_swfobj,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Resolves shortname/clip/episode URLs, finds the mtvnservices media
    URI in the page, downloads the MRSS show index, then one mediaGen
    config per part, and picks a bitrate rendition whose RTMP URL is
    rewritten into a direct HTTP URL.

    NOTE(review): this listing is elided — parts of the _VALID_URL
    verbose regex (the `clip` group), the extension/dimension tables,
    guard clauses and `try:` lines are missing from the visible text.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     """
    # Known bitrates, lowest to highest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        # NOTE(review): bitrate -> extension table elided in this listing.
    _video_dimensions = {
        # NOTE(review): bitrate -> WxH table elided in this listing.

    # NOTE(review): @classmethod decorator elided above.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regex, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        # NOTE(review): `for x in formats:` loop header elided.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): `if mobj is None:` guard elided.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Shortnames (":tds", ":colbert", ...) map to the full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # NOTE(review): else branch header elided.
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # NOTE(review): else branch header elided.
                epTitle = mobj.group('cntitle')
        # NOTE(review): else branch header elided; episode-page handling follows.
            dlNewest = not mobj.group('episode')
            # NOTE(review): `if dlNewest:` branch header elided.
            epTitle = mobj.group('showname')
            # NOTE(review): else branch header elided.
            epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        # Follow the redirect (if any) and re-match to get the final episode.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): `if mobj is None:` guard elided.
        raise ExtractorError(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            raise ExtractorError(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            # NOTE(review): else branch header elided.
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        # NOTE(review): `results = []` initialization elided.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, url) tuples; `turls = []` init elided.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            # NOTE(review): `if len(turls) == 0:` guard elided.
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # NOTE(review): `for f, v in turls: if f == req_format:` elided.
                format, rtmp_video_url = f, v

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # NOTE(review): `if not m:` guard elided.
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # NOTE(review): info-dict opener elided; visible entries follow.
            'upload_date': officialDate,
            'description': officialTitle,

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads description / og:image / og:video meta tags from the page,
    follows the player's `config=` query parameter to a JS-flavoured
    JSON config, and takes the second playlist entry as the video URL.

    NOTE(review): this listing is elided — the `if mobj is None:` guard,
    the `try:` line and the info-dict assembly are missing.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # NOTE(review): enclosing `try:` elided.
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual video.
        videoUrl = playlist[1]['url']

        # NOTE(review): info-dict opener elided; visible entries follow.
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for the video id, then the Adobe
    f4m manifest it points to, and synthesizes the final fragment URL
    from the manifest's media/id nodes.

    NOTE(review): this listing is elided — guard clauses, `try:` lines
    and the info-dict initialization are missing from the visible text.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): `info = {...}` initialization elided; one fragment visible:
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # NOTE(review): enclosing `try:` elided.
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # NOTE(review): enclosing `try:` elided (IndexError handler below).
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        # NOTE(review): `except IndexError:` elided.
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'  # required by the HDS server
        self.report_manifest(video_id)
        # NOTE(review): enclosing `try:` elided.
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Pulls the flv URL, page title and thumbnail out of the video page
    and returns a single-entry result list.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        def scan(pattern, errmsg, group=1):
            # Search the page, raising ExtractorError(errmsg) on a miss.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(errmsg)
            return found.group(group)

        # Video URL arrives percent-encoded in the flv_url parameter.
        video_url = compat_urllib_parse.unquote(
            scan(r'flv_url=(.+?)&', u'Unable to extract video url'))

        # Title is the <title> text up to the " - XVID" suffix.
        video_title = scan(r'<title>(.*?)\s+-\s+XVID',
                           u'Unable to extract video title')

        # Thumbnail: the whole matched URL is the thumbnail address.
        video_thumbnail = scan(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            u'Unable to extract video thumbnail', group=0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): this listing is elided — the closing of the docstring
    and the return-list assembly are partially missing.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL to the track's API record (fixed client_id).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Stream definitions give the actual downloadable mp3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        # NOTE(review): return-list opener elided; visible entries follow.
        'uploader': info['user']['username'],
        'upload_date': upload_date,
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): this listing is elided — guard clauses and the
    per-track result-list assembly are partially missing.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set URL to its API record (fixed client_id).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            # NOTE(review): the early `return` after reporting is elided.

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # NOTE(review): per-track dict opener elided; visible entries follow.
            'uploader': track['user']['username'],
            'upload_date': unified_strdate(track['created_at']),
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    Decodes the base64 `jsclassref` value from the page into the rtmpe
    path, and scrapes title/description from the page markup.

    NOTE(review): this listing is elided — guard clauses and the
    info-dict assembly are partially missing.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real id is base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # NOTE(review): info-dict opener elided; visible entries follow.
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Fetches the cloudcast JSON, negotiates a format/bitrate from its
    'audio_formats' section, and probes candidate URLs for the first
    one that answers.

    NOTE(review): this listing is elided — `try:` lines, loop bodies and
    parts of the format-negotiation flow are missing from the visible
    text. Note also the Python-2-only `.decode('utf-8')` calls on str.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): enclosing `try:` elided.
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # NOTE(review): `return self.check_urls(url_list)` elided.

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # NOTE(review): `try:` and `return url` on success elided.
                compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): failure handling and final `return None` elided.

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # NOTE(review): `try:` elided.
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # NOTE(review): `try:` elided.
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # NOTE(review): early `return` after listing elided.

        if req_format is None or req_format == 'best':
            # Probe every format; first live URL wins.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # NOTE(review): `break` elided.
        # NOTE(review): else branch header elided.
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): return-list opener elided; visible entries follow.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Three URL shapes: a specific video (course + video), a course page
    (list of VideoPage references), or the root page (list of
    CoursePage references). List entries are re-dispatched through
    self.extract().

    NOTE(review): this listing is elided — `try:` lines, list
    comprehension openers and some dict fields are missing.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): `info = {` opener elided.
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # NOTE(review): `try:` elided.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # NOTE(review): `try:` elided (IndexError handler below).
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            # NOTE(review): `except IndexError:` elided.
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            # NOTE(review): `return [info]` elided.
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE(review): `info = {` opener (with 'id': course, 'type': 'playlist') elided.
            'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                                note='Downloading course info page',
                                                errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            # NOTE(review): `if m:` header elided.
                info['title'] = unescapeHTML(m.group(1))
            # NOTE(review): `else:` header elided.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            # NOTE(review): `if m:` header elided.
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # NOTE(review): `info['list'] = [{` comprehension opener elided.
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recurse into every referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # NOTE(review): `return results` and root-page branch header elided.
            'id': 'Stanford OpenClassroom',
            'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # NOTE(review): `try:` elided.
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # NOTE(review): `info['list'] = [{` comprehension opener elided.
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            # Recurse into every referenced course page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads mtv_vt / mtv_an / mtvn_uri meta tags and the default playlist
    id from the page, fetches the mediaGen metadata XML and picks the
    last (highest-quality) rendition.

    NOTE(review): this listing is elided — `if mobj is None:` guards,
    `try:` lines and the info-dict assembly are partially missing.
    Note the Python-2-only `.decode('iso-8859-1')` calls on str.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # NOTE(review): `try:` elided.
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # NOTE(review): `try:` elided (KeyError handler implied by raise below).
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        # NOTE(review): `except KeyError:` elided.
            raise ExtractorError('Invalid rendition field.')

        # NOTE(review): info-dict opener elided; visible entries follow.
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Downloads the getPlayList JSON, de-obfuscates the segment file id
    with a seeded shuffle of a fixed alphabet, and emits one info dict
    per video segment.

    NOTE(review): this listing is elided — the `_gen_sid` def line, the
    format-selection ladder (hd2/flv/mp4) and parts of the per-segment
    dict are missing from the visible text.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    # NOTE(review): `def _gen_sid(self):` header elided above.
        # Session id: millisecond timestamp + two random blocks.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic seeded shuffle of the alphabet; the seed comes
        # from the playlist JSON, so both sides derive the same order.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # NOTE(review): `mixed = []` initialization elided.
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        # NOTE(review): actual `return mixed` elided.

    def _get_file_id(self, fileId, seed):
        # Map each '*'-separated index through the shuffled alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): `realId = []` and loop header elided.
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        # NOTE(review): enclosing `try:` elided (handlers below).
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Format ladder: prefer hd2, else fall back (elided branches).
            if format is None or format == 'best':
                if 'hd2' in supported_format:
            elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        # NOTE(review): `files_info = []` initialization elided.
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # NOTE(review): per-segment `info = {` opener elided.
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
3052 class XNXXIE(InfoExtractor):
3053 """Information extractor for xnxx.com"""
# NOTE(review): paste-mangled chunk — upstream line numbers are fused into the
# text and `if result is None:` guard lines are elided before each raise.
# Code left byte-identical; comments only.
3055 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?' if False else r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)' if False else r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3097 class GooglePlusIE(InfoExtractor):
3098 """Information extractor for plus.google.com."""
# NOTE(review): paste-mangled chunk — upstream line numbers fused into every
# line; `if mobj is None:` guards and the `try:` around the unicode-escape
# decode are elided. Code left byte-identical; comments/docstrings only.
3100 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3101 IE_NAME = u'plus.google'
3103 def report_extract_entry(self, url):
3104 """Report downloading entry"""
3105 self.to_screen(u'Downloading entry: %s' % url)
3107 def report_date(self, upload_date):
3108 """Report entry upload date"""
3109 self.to_screen(u'Entry date: %s' % upload_date)
3111 def report_uploader(self, uploader):
3112 """Report entry uploader"""
3113 self.to_screen(u'Uploader: %s' % uploader)
3115 def report_title(self, video_title):
3116 """Report entry title"""
3117 self.to_screen(u'Title: %s' % video_title)
3119 def report_extract_vid_page(self, video_page):
3120 """Report information extraction."""
3121 self.to_screen(u'Extracting video page: %s' % video_page)
3123 def _real_extract(self, url):
3124 # Extract id from URL
3125 mobj = re.match(self._VALID_URL, url)
3127 raise ExtractorError(u'Invalid URL: %s' % url)
3129 post_url = mobj.group(0)
3130 video_id = mobj.group(1)
3132 video_extension = 'flv'
3134 # Step 1, Retrieve post webpage to extract further information
3135 self.report_extract_entry(post_url)
3136 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3138 # Extract update date
3140 pattern = 'title="Timestamp">(.*?)</a>'
3141 mobj = re.search(pattern, webpage)
3143 upload_date = mobj.group(1)
3144 # Convert timestring to a format suitable for filename
3145 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3146 upload_date = upload_date.strftime('%Y%m%d')
3147 self.report_date(upload_date)
# Extract the uploader name from the rel="author" anchor.
3151 pattern = r'rel\="author".*?>(.*?)</a>'
3152 mobj = re.search(pattern, webpage)
3154 uploader = mobj.group(1)
3155 self.report_uploader(uploader)
3158 # Get the first line for title
3160 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3161 mobj = re.search(pattern, webpage)
3163 video_title = mobj.group(1)
3164 self.report_title(video_title)
3166 # Step 2, Stimulate clicking the image box to launch video
3167 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3168 mobj = re.search(pattern, webpage)
3170 raise ExtractorError(u'Unable to extract video page URL')
3172 video_page = mobj.group(1)
3173 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3174 self.report_extract_vid_page(video_page)
3177 # Extract video links on video page
3178 """Extract video links of all sizes"""
3179 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3180 mobj = re.findall(pattern, webpage)
3182 raise ExtractorError(u'Unable to extract video links')
3184 # Sort in resolution
3185 links = sorted(mobj)
3187 # Choose the lowest of the sort, i.e. highest resolution
3188 video_url = links[-1]
3189 # Only get the url. The resolution part in the tuple has no use anymore
3190 video_url = video_url[-1]
3191 # Treat escaped \u0026 style hex
# On Python 2, str.decode("unicode_escape") works; on Python 3 str has no
# .decode, so the AttributeError branch re-decodes via bytes.
3193 video_url = video_url.decode("unicode_escape")
3194 except AttributeError: # Python 3
3195 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3201 'uploader': uploader,
3202 'upload_date': upload_date,
3203 'title': video_title,
3204 'ext': video_extension,
3207 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages; the media URL is built
# directly from the path component of the page URL.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text, guard lines and part of the result dict elided. Code byte-identical.
3208 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3211 def _real_extract(self, url):
3212 mobj = re.match(self._VALID_URL, url)
3214 raise ExtractorError(u'Invalid URL: %s' % url)
3216 video_id = mobj.group(1)
3217 if video_id.endswith('/index.html'):
3218 video_id = video_id[:-len('/index.html')]
3220 webpage = self._download_webpage(url, video_id)
# The CDN URL is derived from the page path, not scraped from the page.
3222 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small helper: regex-search the page, unescape the first group, or return
# `default` when the pattern is absent.
3223 def _findProp(rexp, default=None):
3224 m = re.search(rexp, webpage)
3226 return unescapeHTML(m.group(1))
3230 shortened_video_id = video_id.rpartition('/')[2]
3231 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3233 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' (the
# documented optional field) — confirm against upstream before changing.
3237 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3238 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3242 class JustinTVIE(InfoExtractor):
3243 """Information extractor for justin.tv and twitch.tv"""
# NOTE(review): paste-mangled chunk — upstream line numbers fused into every
# line; several guard/else lines and parts of the paging loop are elided.
# Code left byte-identical; comments only.
3244 # TODO: One broadcast may be split into multiple videos. The key
3245 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3246 # starts at 1 and increases. Can we treat all parts as one video?
# Verbose regex: channel page, /b/<videoid> archive, or /c/<chapterid> chapter.
3248 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3250 (?P<channelid>[^/]+)|
3251 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3252 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3256 _JUSTIN_PAGE_LIMIT = 100
3257 IE_NAME = u'justin.tv'
3259 def report_download_page(self, channel, offset):
3260 """Report attempt to download a single page of videos."""
3261 self.to_screen(u'%s: Downloading video information from %d to %d' %
3262 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3264 # Return count of items, list of *valid* items
3265 def _parse_page(self, url, video_id):
3266 webpage = self._download_webpage(url, video_id,
3267 u'Downloading video info JSON',
3268 u'unable to download video info JSON')
3270 response = json.loads(webpage)
# The API returns a dict (not a list) on error; surface its message.
3271 if type(response) != list:
3272 error_text = response.get('error', 'unknown error')
3273 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3275 for clip in response:
3276 video_url = clip['video_file_url']
3278 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip the dashes from YYYY-MM-DD -> YYYYMMDD.
3279 video_date = re.sub('-', '', clip['start_time'][:10])
3280 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3281 video_id = clip['id']
3282 video_title = clip.get('title', video_id)
3286 'title': video_title,
3287 'uploader': clip.get('channel_name', video_uploader_id),
3288 'uploader_id': video_uploader_id,
3289 'upload_date': video_date,
3290 'ext': video_extension,
3292 return (len(response), info)
3294 def _real_extract(self, url):
3295 mobj = re.match(self._VALID_URL, url)
3297 raise ExtractorError(u'invalid URL: %s' % url)
3299 api_base = 'http://api.justin.tv'
3301 if mobj.group('channelid'):
3303 video_id = mobj.group('channelid')
3304 api = api_base + '/channel/archives/%s.json' % video_id
3305 elif mobj.group('chapterid'):
# Chapter URLs: resolve chapter -> archive via the page's PP.archive_id,
# then fetch per-chapter XML and kraken JSON metadata.
3306 chapter_id = mobj.group('chapterid')
3308 webpage = self._download_webpage(url, chapter_id)
3309 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3311 raise ExtractorError(u'Cannot find archive of a chapter')
3312 archive_id = m.group(1)
3314 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3315 chapter_info_xml = self._download_webpage(api, chapter_id,
3316 note=u'Downloading chapter information',
3317 errnote=u'Chapter information download failed')
3318 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3319 for a in doc.findall('.//archive'):
3320 if archive_id == a.find('./id').text:
3323 raise ExtractorError(u'Could not find chapter in chapter information')
3325 video_url = a.find('./video_file_url').text
3326 video_ext = video_url.rpartition('.')[2] or u'flv'
3328 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3329 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3330 note='Downloading chapter metadata',
3331 errnote='Download of chapter metadata failed')
3332 chapter_info = json.loads(chapter_info_json)
3334 bracket_start = int(doc.find('.//bracket_start').text)
3335 bracket_end = int(doc.find('.//bracket_end').text)
3337 # TODO determine start (and probably fix up file)
3338 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3339 #video_url += u'?start=' + TODO:start_timestamp
3340 # bracket_start is 13290, but we want 51670615
3341 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3342 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3345 'id': u'c' + chapter_id,
3348 'title': chapter_info['title'],
3349 'thumbnail': chapter_info['preview'],
3350 'description': chapter_info['description'],
3351 'uploader': chapter_info['channel']['display_name'],
3352 'uploader_id': chapter_info['channel']['name'],
3356 video_id = mobj.group('videoid')
3357 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3359 self.report_extraction(video_id)
# Page through the API (channels are paged; single archives are not).
3363 limit = self._JUSTIN_PAGE_LIMIT
3366 self.report_download_page(video_id, offset)
3367 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3368 page_count, page_info = self._parse_page(page_url, video_id)
3369 info.extend(page_info)
# A short page means we've reached the end of the archive list.
3370 if not paged or page_count != limit:
3375 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com video pages.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; `if m is None:` guards and the result dict head are elided.
3376 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3378 def _real_extract(self, url):
3379 mobj = re.match(self._VALID_URL, url)
3381 raise ExtractorError(u'invalid URL: %s' % url)
3383 video_id = mobj.group('id')
3384 webpage = self._download_webpage(url, video_id)
# The second <source> tag inside <video> carries the direct media URL.
3386 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3388 raise ExtractorError(u'Unable to find video information')
3389 video_url = unescapeHTML(m.group('url'))
# Prefer the player-page <h1>; fall back to the document <title>.
3391 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3393 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3395 raise ExtractorError(u'Cannot find video title')
3396 title = clean_html(m.group('title'))
3398 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3400 desc = unescapeHTML(m.group('desc'))
3409 'description': desc,
3413 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com video/app pages; yields a
# playlist of all movies found on the game page.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; parts of the regex and the per-video dict are elided.
3414 _VALID_URL = r"""http://store\.steampowered\.com/
3416 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3418 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3422 def suitable(cls, url):
3423 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3424 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3426 def _real_extract(self, url):
3427 m = re.match(self._VALID_URL, url, re.VERBOSE)
3428 gameID = m.group('gameID')
# The agecheck URL with a fixed DOB bypasses Steam's age gate.
3429 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3430 self.report_age_confirmation()
3431 webpage = self._download_webpage(videourl, gameID)
3432 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3434 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3435 mweb = re.finditer(urlRE, webpage)
3436 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3437 titles = re.finditer(namesRE, webpage)
3438 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3439 thumbs = re.finditer(thumbsRE, webpage)
# Walk the three iterators in lockstep: movie JS blob, title span, thumb img.
3441 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3442 video_id = vid.group('videoID')
3443 title = vtitle.group('videoName')
3444 video_url = vid.group('videoURL')
3445 video_thumb = thumb.group('thumbnail')
3447 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3452 'title': unescapeHTML(title),
3453 'thumbnail': video_thumb
3456 return [self.playlist_result(videos, gameID, game_title)]
3458 class UstreamIE(InfoExtractor):
# Information extractor for www.ustream.tv recorded videos.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; the `try:` opening the AttributeError block and the dict head are
# elided. Code left byte-identical.
3459 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3460 IE_NAME = u'ustream'
3462 def _real_extract(self, url):
3463 m = re.match(self._VALID_URL, url)
3464 video_id = m.group('videoID')
# Direct CDN URL derived from the video id; the page is only scraped for
# metadata (title, uploader, thumbnail).
3465 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3466 webpage = self._download_webpage(url, video_id)
3467 self.report_extraction(video_id)
3469 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3470 title = m.group('title')
3471 m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3473 uploader = unescapeHTML(m.group('uploader').strip())
3474 m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
3475 thumb = m.group('thumb')
# Any failed .group() above lands here via m being None.
3476 except AttributeError:
3477 raise ExtractorError(u'Unable to extract info')
3483 'uploader': uploader,
3488 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; the ext-selection branches and dict head are elided.
3489 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3490 IE_NAME = u'WorldStarHipHop'
3492 def _real_extract(self, url):
# The flash player's addVariable("file", ...) call carries the media URL.
3493 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3495 m = re.match(self._VALID_URL, url)
3496 video_id = m.group('id')
3498 webpage_src = self._download_webpage(url, video_id)
3500 mobj = re.search(_src_url, webpage_src)
3502 if mobj is not None:
3503 video_url = mobj.group(1)
3504 if 'mp4' in video_url:
3509 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3511 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3514 raise ExtractorError(u'Cannot determine title')
3515 title = mobj.group(1)
3517 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3518 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3519 if mobj is not None:
3520 thumbnail = mobj.group(1)
# Candy pages lack image_src; fall back to the candytitles span for the title.
3522 _title = r"""candytitles.*>(.*)</span>"""
3523 mobj = re.search(_title, webpage_src)
3524 if mobj is not None:
3525 title = mobj.group(1)
3532 'thumbnail' : thumbnail,
3537 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows; metadata comes from a JSON
# blob embedded in an inline <script> (window.gon).
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; `if m is None:`, `try:` and the result-dict head are elided.
3538 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3540 def _real_extract(self, url):
3541 m = re.match(self._VALID_URL, url)
3542 video_id = m.group('videoID')
3544 webpage = self._download_webpage(url, video_id)
3545 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3547 raise ExtractorError(u'Cannot find metadata')
3548 json_data = m.group(1)
3551 data = json.loads(json_data)
3552 except ValueError as e:
3553 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbit/s rendition on the Akamai URL.
3555 video_url = data['akamai_url'] + '&cbr=256'
3556 url_parts = compat_urllib_parse_urlparse(video_url)
3557 video_ext = url_parts.path.rpartition('.')[2]
# Nested gets tolerate missing host/image objects in the JSON.
3562 'title': data['title'],
3563 'description': data.get('teaser_text'),
3564 'location': data.get('country_of_origin'),
3565 'uploader': data.get('host', {}).get('name'),
3566 'uploader_id': data.get('host', {}).get('slug'),
3567 'thumbnail': data.get('image', {}).get('large_url_2x'),
3568 'duration': data.get('duration'),
3573 class YouPornIE(InfoExtractor):
3574 """Information extractor for youporn.com."""
# NOTE(review): paste-mangled chunk — upstream line numbers fused into every
# line; guard lines, the per-link loop header, and parts of the format dict
# are elided. Code left byte-identical; comments only.
3575 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3577 def _print_formats(self, formats):
3578 """Print all available formats"""
3579 print(u'Available formats:')
3580 print(u'ext\t\tformat')
3581 print(u'---------------------------------')
3582 for format in formats:
3583 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the single format dict matching req_format (elided loop body).
3585 def _specific(self, req_format, formats):
3587 if(x["format"]==req_format):
3591 def _real_extract(self, url):
3592 mobj = re.match(self._VALID_URL, url)
3594 raise ExtractorError(u'Invalid URL: %s' % url)
3596 video_id = mobj.group('videoid')
# Pre-set the age_verified cookie to skip the age gate.
3598 req = compat_urllib_request.Request(url)
3599 req.add_header('Cookie', 'age_verified=1')
3600 webpage = self._download_webpage(req, video_id)
3602 # Get the video title
3603 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3605 raise ExtractorError(u'Unable to extract video title')
3606 video_title = result.group('title').strip()
3608 # Get the video date
3609 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3611 self._downloader.report_warning(u'unable to extract video date')
3614 upload_date = unified_strdate(result.group('date').strip())
3616 # Get the video uploader
3617 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3619 self._downloader.report_warning(u'unable to extract uploader')
3620 video_uploader = None
3622 video_uploader = result.group('uploader').strip()
3623 video_uploader = clean_html( video_uploader )
3625 # Get all of the formats available
3626 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3627 result = re.search(DOWNLOAD_LIST_RE, webpage)
3629 raise ExtractorError(u'Unable to extract download list')
3630 download_list_html = result.group('download_list').strip()
3632 # Get all of the links from the page
3633 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3634 links = re.findall(LINK_RE, download_list_html)
3635 if(len(links) == 0):
3636 raise ExtractorError(u'ERROR: no known formats available for video')
3638 self.to_screen(u'Links found: %d' % len(links))
3643 # A link looks like this:
3644 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3645 # A path looks like this:
3646 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Derive extension and "size_bitrate" format label from the URL path.
3647 video_url = unescapeHTML( link )
3648 path = compat_urllib_parse_urlparse( video_url ).path
3649 extension = os.path.splitext( path )[1][1:]
3650 format = path.split('/')[4].split('_')[:2]
3653 format = "-".join( format )
3654 title = u'%s-%s-%s' % (video_title, size, bitrate)
3659 'uploader': video_uploader,
3660 'upload_date': upload_date,
3665 'description': None,
3669 if self._downloader.params.get('listformats', None):
3670 self._print_formats(formats)
3673 req_format = self._downloader.params.get('format', None)
3674 self.to_screen(u'Format: %s' % req_format)
# Formats are ordered best-first, so 'worst' is the last entry.
3676 if req_format is None or req_format == 'best':
3678 elif req_format == 'worst':
3679 return [formats[-1]]
3680 elif req_format in ('-1', 'all'):
3683 format = self._specific( req_format, formats )
3685 raise ExtractorError(u'Requested format not available')
3690 class PornotubeIE(InfoExtractor):
3691 """Information extractor for pornotube.com."""
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; `if result is None:` guards and the tail of the info dict are elided.
3692 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3694 def _real_extract(self, url):
3695 mobj = re.match(self._VALID_URL, url)
3697 raise ExtractorError(u'Invalid URL: %s' % url)
# Title comes straight from the URL, not from the page.
3699 video_id = mobj.group('videoid')
3700 video_title = mobj.group('title')
3702 # Get webpage content
3703 webpage = self._download_webpage(url, video_id)
3706 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3707 result = re.search(VIDEO_URL_RE, webpage)
3709 raise ExtractorError(u'Unable to extract video url')
3710 video_url = compat_urllib_parse.unquote(result.group('url'))
3712 #Get the uploaded date
3713 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3714 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): message says "title" but this failure is about the upload
# date — likely a copy-paste slip; confirm before changing the string.
3716 raise ExtractorError(u'Unable to extract video title')
3717 upload_date = unified_strdate(result.group('date'))
3719 info = {'id': video_id,
3722 'upload_date': upload_date,
3723 'title': video_title,
3729 class YouJizzIE(InfoExtractor):
3730 """Information extractor for youjizz.com."""
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; `if result is None:` guards and parts of the info dict are elided.
3731 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3733 def _real_extract(self, url):
3734 mobj = re.match(self._VALID_URL, url)
3736 raise ExtractorError(u'Invalid URL: %s' % url)
3738 video_id = mobj.group('videoid')
3740 # Get webpage content
3741 webpage = self._download_webpage(url, video_id)
3743 # Get the video title
3744 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3746 raise ExtractorError(u'ERROR: unable to extract video title')
3747 video_title = result.group('title').strip()
3749 # Get the embed page
# The real media URL only appears on the embed page, so fetch that next;
# note video_id is re-read from the embed URL here.
3750 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3752 raise ExtractorError(u'ERROR: unable to extract embed page')
3754 embed_page_url = result.group(0).strip()
3755 video_id = result.group('videoid')
3757 webpage = self._download_webpage(embed_page_url, video_id)
3760 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3762 raise ExtractorError(u'ERROR: unable to extract video url')
3763 video_url = result.group('source')
3765 info = {'id': video_id,
3767 'title': video_title,
3770 'player_url': embed_page_url}
3774 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes; iterates the play/next API
# until at_last_track, collecting one entry per song.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; mix_id assignment, res.append and the break line are elided.
3776 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3778 def _real_extract(self, url):
3779 mobj = re.match(self._VALID_URL, url)
3781 raise ExtractorError(u'Invalid URL: %s' % url)
3782 playlist_id = mobj.group('id')
3784 webpage = self._download_webpage(url, playlist_id)
# The page embeds the mix metadata as a JS assignment (PAGE.mix = {...};).
3786 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3788 raise ExtractorError(u'Cannot find trax information')
3789 json_like = m.group(1)
3790 data = json.loads(json_like)
# Random session token for the play API.
3792 session = str(random.randint(0, 1000000000))
3794 track_count = data['tracks_count']
3795 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3796 next_url = first_url
3798 for i in itertools.count():
3799 api_json = self._download_webpage(next_url, playlist_id,
3800 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3801 errnote=u'Failed to download song information')
3802 api_data = json.loads(api_json)
3803 track_data = api_data[u'set']['track']
3805 'id': track_data['id'],
3806 'url': track_data['track_file_stream_url'],
3807 'title': track_data['performer'] + u' - ' + track_data['name'],
3808 'raw_title': track_data['name'],
3809 'uploader_id': data['user']['login'],
# Stop when the API flags the final track; otherwise chase the next URL.
3813 if api_data['set']['at_last_track']:
3815 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3818 class KeekIE(InfoExtractor):
# Information extractor for keek.com; media and thumbnail URLs are derived
# directly from the video id on cdn.keek.com.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; the info-dict head is elided.
3819 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3822 def _real_extract(self, url):
3823 m = re.match(self._VALID_URL, url)
3824 video_id = m.group('videoID')
3825 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3826 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3827 webpage = self._download_webpage(url, video_id)
3828 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3829 title = unescapeHTML(m.group('title'))
3830 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3831 uploader = clean_html(m.group('uploader'))
3837 'thumbnail': thumbnail,
3838 'uploader': uploader
3842 class TEDIE(InfoExtractor):
# Information extractor for www.ted.com talks and playlists.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into every
# line; alternation/grouping lines of the verbose regexes and several loop
# lines are elided. Code left byte-identical; comments only.
3843 _VALID_URL=r'''http://www\.ted\.com/
3845 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3847 ((?P<type_talk>talks)) # We have a simple talk
3849 (/lang/(.*?))? # The url may contain the language
3850 /(?P<name>\w+) # Here goes the name and then ".html"
3854 def suitable(cls, url):
3855 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3856 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3858 def _real_extract(self, url):
# Dispatch: single talk vs. playlist of talks.
3859 m=re.match(self._VALID_URL, url, re.VERBOSE)
3860 if m.group('type_talk'):
3861 return [self._talk_info(url)]
3863 playlist_id=m.group('playlist_id')
3864 name=m.group('name')
3865 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3866 return [self._playlist_videos_info(url,name,playlist_id)]
3868 def _talk_video_link(self,mediaSlug):
3869 '''Returns the video link for that mediaSlug'''
3870 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3872 def _playlist_videos_info(self,url,name,playlist_id=0):
3873 '''Returns the videos of the playlist'''
3875 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3876 ([.\s]*?)data-playlist_item_id="(\d+)"
3877 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3879 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3880 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3881 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3882 m_names=re.finditer(video_name_RE,webpage)
3884 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3885 m_playlist = re.search(playlist_RE, webpage)
3886 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is deferred to the TED extractor via url_result.
3888 playlist_entries = []
3889 for m_video, m_name in zip(m_videos,m_names):
3890 video_id=m_video.group('video_id')
3891 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3892 playlist_entries.append(self.url_result(talk_url, 'TED'))
3893 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3895 def _talk_info(self, url, video_id=0):
3896 """Return the video for the talk in the url"""
3897 m=re.match(self._VALID_URL, url,re.VERBOSE)
3898 videoName=m.group('name')
3899 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3900 # If the url includes the language we get the title translated
3901 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3902 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and the download media slug.
3903 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3904 "id":(?P<videoID>[\d]+).*?
3905 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3906 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3907 thumb_match=re.search(thumb_RE,webpage)
3908 info_match=re.search(info_RE,webpage,re.VERBOSE)
3909 video_id=info_match.group('videoID')
3910 mediaSlug=info_match.group('mediaSlug')
3911 video_url=self._talk_video_link(mediaSlug)
3917 'thumbnail': thumb_match.group('thumbnail')
3921 class MySpassIE(InfoExtractor):
# Information extractor for www.myspass.de; metadata comes from an XML
# endpoint keyed by the video id taken from the URL path.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; guard lines, default assignments and the result-dict head are elided.
3922 _VALID_URL = r'http://www.myspass.de/.*'
3924 def _real_extract(self, url):
3925 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3927 # video id is the last path element of the URL
3928 # usually there is a trailing slash, so also try the second but last
3929 url_path = compat_urllib_parse_urlparse(url).path
3930 url_parent_path, video_id = os.path.split(url_path)
3932 _, video_id = os.path.split(url_parent_path)
3935 metadata_url = META_DATA_URL_TEMPLATE % video_id
3936 metadata_text = self._download_webpage(metadata_url, video_id)
3937 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3939 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are optional.
3940 url_flv_el = metadata.find('url_flv')
3941 if url_flv_el is None:
3942 raise ExtractorError(u'Unable to extract download url')
3943 video_url = url_flv_el.text
3944 extension = os.path.splitext(video_url)[1][1:]
3945 title_el = metadata.find('title')
3946 if title_el is None:
3947 raise ExtractorError(u'Unable to extract title')
3948 title = title_el.text
3949 format_id_el = metadata.find('format_id')
3950 if format_id_el is None:
3953 format = format_id_el.text
3954 description_el = metadata.find('description')
3955 if description_el is not None:
3956 description = description_el.text
3959 imagePreview_el = metadata.find('imagePreview')
3960 if imagePreview_el is not None:
3961 thumbnail = imagePreview_el.text
3970 'thumbnail': thumbnail,
3971 'description': description
3975 class SpiegelIE(InfoExtractor):
# Information extractor for www.spiegel.de videos; stream info comes from a
# per-video XML file on video2.spiegel.de.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; `if m is None:` and the info-dict head are elided.
3976 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3978 def _real_extract(self, url):
3979 m = re.match(self._VALID_URL, url)
3980 video_id = m.group('videoID')
3982 webpage = self._download_webpage(url, video_id)
3983 m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
3985 raise ExtractorError(u'Cannot find title')
3986 video_title = unescapeHTML(m.group(1))
3988 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3989 xml_code = self._download_webpage(xml_url, video_id,
3990 note=u'Downloading XML', errnote=u'Failed to download XML')
3992 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last <type> element in the XML is taken as the best variant.
3993 last_type = idoc[-1]
3994 filename = last_type.findall('./filename')[0].text
3995 duration = float(last_type.findall('./duration')[0].text)
3997 video_url = 'http://video2.spiegel.de/flash/' + filename
3998 video_ext = filename.rpartition('.')[2]
4003 'title': video_title,
4004 'duration': duration,
4008 class LiveLeakIE(InfoExtractor):
# Information extractor for liveleak.com view pages.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; `if m is None:`/else lines and the info-dict head are elided.
4010 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4011 IE_NAME = u'liveleak'
4013 def _real_extract(self, url):
4014 mobj = re.match(self._VALID_URL, url)
4016 raise ExtractorError(u'Invalid URL: %s' % url)
4018 video_id = mobj.group('video_id')
4020 webpage = self._download_webpage(url, video_id)
# The player config's `file:` entry carries the direct media URL.
4022 m = re.search(r'file: "(.*?)",', webpage)
4024 raise ExtractorError(u'Unable to find video url')
4025 video_url = m.group(1)
4027 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4029 raise ExtractorError(u'Cannot find video title')
# Strip the site-name prefix LiveLeak puts in og:title.
4030 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4032 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4034 desc = unescapeHTML(m.group('desc'))
4038 m = re.search(r'By:.*?(\w+)</a>', webpage)
4040 uploader = clean_html(m.group(1))
4049 'description': desc,
4050 'uploader': uploader
4055 class ARDIE(InfoExtractor):
# Information extractor for ardmediathek.de / mediathek.daserste.de; picks the
# highest-quality default stream and distinguishes RTMP vs. plain HTTP.
# NOTE(review): paste-mangled chunk — upstream line numbers fused into the
# text; `if numid:`/else lines and the empty-streams guard are elided.
4056 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4057 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4058 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4060 def _real_extract(self, url):
4061 # determine video id from url
4062 m = re.match(self._VALID_URL, url)
# Prefer an explicit documentId query parameter over the path segment.
4064 numid = re.search(r'documentId=([0-9]+)', url)
4066 video_id = numid.group(1)
4068 video_id = m.group('video_id')
4070 # determine title and media streams from webpage
4071 html = self._download_webpage(url, video_id)
4072 title = re.search(self._TITLE, html).group('title')
4073 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker in the page means the video is age-restricted
# until 20:00 German time.
4075 assert '"fsk"' in html
4076 raise ExtractorError(u'This video is only available after 8:00 pm')
4078 # choose default media type and highest quality for now
4079 stream = max([s for s in streams if int(s["media_type"]) == 0],
4080 key=lambda s: int(s["quality"]))
4082 # there's two possibilities: RTMP stream or HTTP download
4083 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4084 if stream['rtmp_url']:
4085 self.to_screen(u'RTMP download detected')
4086 assert stream['video_url'].startswith('mp4:')
4087 info["url"] = stream["rtmp_url"]
4088 info["play_path"] = stream['video_url']
4090 assert stream["video_url"].endswith('.mp4')
4091 info["url"] = stream["video_url"]
# ZDF Mediathek extractor. Collects candidate <a class="play"> stream
# links from the page, prefers 'wstreaming' (Windows Media / mms) at
# quality '300', falling back to 'veryhigh'; the chosen stream's playlist
# page is then fetched and the final mms:// (or rtsp://) URL extracted,
# with the file extension taken from the media URL itself.
# NOTE(review): elided listing -- the `stream_ = s` / `break` lines inside
# the two selection loops, the `if stream_ is None:` guard, and the tail
# of the returned info dict are missing here; verify before editing.
4094 class ZDFIE(InfoExtractor):
4095 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4096 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4097 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4098 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4099 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4101 def _real_extract(self, url):
4102 mobj = re.match(self._VALID_URL, url)
4104 raise ExtractorError(u'Invalid URL: %s' % url)
4105 video_id = mobj.group('video_id')
4107 html = self._download_webpage(url, video_id)
4108 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4110 raise ExtractorError(u'No media url found.')
4112 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4113 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4114 # choose first/default media type and highest quality for now
4115 for s in streams: #find 300 - dsl1000mbit
4116 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4119 for s in streams: #find veryhigh - dsl2000mbit
4120 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4124 raise ExtractorError(u'No stream found.')
# The selected link points at a playlist/ASX page, not the media itself.
4126 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4128 self.report_extraction(video_id)
4129 mobj = re.search(self._TITLE, html)
4131 raise ExtractorError(u'Cannot extract title')
4132 title = unescapeHTML(mobj.group('title'))
# Prefer mms://; fall back to rtsp:// when no mms link is present.
4134 mobj = re.search(self._MMS_STREAM, media_link)
4136 mobj = re.search(self._RTSP_STREAM, media_link)
4138 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4139 mms_url = mobj.group('video_url')
# File extension comes from the media URL's final dotted suffix.
4141 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4143 raise ExtractorError(u'Cannot extract extention')
4144 ext = mobj.group('ext')
4146 return [{'id': video_id,
# Tumblr post/video extractor. Rebuilds a canonical post URL from the blog
# name and numeric post id, then pulls the escaped (`\x22`-quoted) video
# player markup out of the page for the media URL and extension, the first
# poster image as thumbnail, and the <title> tag as the (partial) title.
# NOTE(review): elided listing -- the `if video is None: return` handling
# after the "No video found" message and the remainder of the returned
# info dict are not shown here.
4152 class TumblrIE(InfoExtractor):
4153 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4155 def _real_extract(self, url):
4156 m_url = re.match(self._VALID_URL, url)
4157 video_id = m_url.group('id')
4158 blog = m_url.group('blog_name')
# Normalize /video/ URLs to the canonical /post/ form before fetching.
4160 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4161 webpage = self._download_webpage(url, video_id)
# The player markup is JS-escaped, hence the literal \x22 quote sequences.
4163 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4164 video = re.search(re_video, webpage)
4166 self.to_screen("No video found")
4168 video_url = video.group('video_url')
4169 ext = video.group('ext')
4171 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4172 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4174 # The only place where you can get a title, it's not complete,
4175 # but searching in other places doesn't work for all videos
4176 re_title = r'<title>(?P<title>.*?)</title>'
4177 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4179 return [{'id': video_id,
# Bandcamp free-track extractor. Follows the track page's
# `freeDownloadPage` link, reads the track id from the TralbumData JS
# blob, parses the downloads page's `items:` JSON, and rebuilds the
# statdownload URL for the mp3-320 format (the url given directly in the
# JSON expires, so it is reassembled with a fixed `.rand` value and the
# final URL read back from the `retry_url` JSON field).
# NOTE(review): the local `id` shadows the builtin -- worth renaming when
# the full source is in hand. The hard-coded `.rand=665028774616` mirrors
# Bandcamp's download_bunde_*.js and is fragile by nature.
4186 class BandcampIE(InfoExtractor):
4187 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4189 def _real_extract(self, url):
4190 mobj = re.match(self._VALID_URL, url)
4191 title = mobj.group('title')
4192 webpage = self._download_webpage(url, title)
4193 # We get the link to the free download page
4194 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4195 if m_download is None:
4196 raise ExtractorError(u'No free songs founded')
4198 download_link = m_download.group(1)
4199 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4200 webpage, re.MULTILINE|re.DOTALL).group('id')
4202 download_webpage = self._download_webpage(download_link, id,
4203 'Downloading free downloads page')
4204 # We get the dictionary of the track from some javascrip code
4205 info = re.search(r'items: (.*?),$',
4206 download_webpage, re.MULTILINE).group(1)
4207 info = json.loads(info)[0]
4208 # We pick mp3-320 for now, until format selection can be easily implemented.
4209 mp3_info = info[u'downloads'][u'mp3-320']
4210 # If we try to use this url it says the link has expired
4211 initial_url = mp3_info[u'url']
4212 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4213 m_url = re.match(re_url, initial_url)
4214 #We build the url we will use to get the final track url
4215 # This url is build in Bandcamp in the script download_bunde_*.js
4216 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4217 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4218 # If we could correctly generate the .rand field the url would be
4219 #in the "download_url" key
4220 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4222 track_info = {'id':id,
4223 'title' : info[u'title'],
4226 'thumbnail' : info[u'thumb_url'],
4227 'uploader' : info[u'artist']
# RedTube extractor: media URL from the page's <source type="video/mp4">
# tag, title from the <h1 class="videoTitle ...ern"> heading; extension is
# always mp4.
# NOTE(review): elided listing -- the `if mobj is None:` guards before each
# raise and the head of the returned info dict are missing here.
4232 class RedTubeIE(InfoExtractor):
4233 """Information Extractor for redtube"""
4234 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4236 def _real_extract(self,url):
4237 mobj = re.match(self._VALID_URL, url)
4239 raise ExtractorError(u'Invalid URL: %s' % url)
4241 video_id = mobj.group('id')
4242 video_extension = 'mp4'
4243 webpage = self._download_webpage(url, video_id)
4244 self.report_extraction(video_id)
4245 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4248 raise ExtractorError(u'Unable to extract media URL')
4250 video_url = mobj.group(1)
4251 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4253 raise ExtractorError(u'Unable to extract title')
4254 video_title = mobj.group(1)
4259 'ext': video_extension,
4260 'title': video_title,
# Ina.fr extractor. Instead of scraping the HTML page, it fetches the
# player's MRSS feed for the video id and reads the mp4 URL from the
# <media:player> element and the title from the CDATA <title>.
# NOTE(review): elided listing -- `if mobj is None:` guards and the head
# of the returned info dict are missing from this view.
4263 class InaIE(InfoExtractor):
4264 """Information Extractor for Ina.fr"""
4265 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4267 def _real_extract(self,url):
4268 mobj = re.match(self._VALID_URL, url)
4270 video_id = mobj.group('id')
# Metadata lives in a separate MRSS feed keyed by the video id.
4271 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4272 video_extension = 'mp4'
4273 webpage = self._download_webpage(mrss_url, video_id)
4275 mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4277 raise ExtractorError(u'Unable to extract media URL')
4278 video_url = mobj.group(1)
4280 mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4282 raise ExtractorError(u'Unable to extract title')
4283 video_title = mobj.group(1)
4288 'ext': video_extension,
4289 'title': video_title,
# Howcast.com extractor: mobile media mp4 URL from the player config,
# title/description from og:/name meta tags (description is optional and
# only warned about), thumbnail from og:image.
# NOTE(review): elided listing -- the `if mobj is None:` / else branches
# around each extraction and the head of the returned info dict are
# missing from this view.
4292 class HowcastIE(InfoExtractor):
4293 """Information Extractor for Howcast.com"""
4294 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4296 def _real_extract(self, url):
4297 mobj = re.match(self._VALID_URL, url)
4299 video_id = mobj.group('id')
# Canonicalize to the plain /videos/<id> page regardless of input URL.
4300 webpage_url = 'http://www.howcast.com/videos/' + video_id
4301 webpage = self._download_webpage(webpage_url, video_id)
4303 self.report_extraction(video_id)
4305 mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
4307 raise ExtractorError(u'Unable to extract video URL')
4308 video_url = mobj.group(1)
# Meta content may be double- or single-quoted; try both capture groups.
4310 mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
4312 raise ExtractorError(u'Unable to extract title')
4313 video_title = mobj.group(1) or mobj.group(2)
# Description is optional: warn and continue with None when absent.
4315 mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
4317 self._downloader.report_warning(u'unable to extract description')
4318 video_description = None
4320 video_description = mobj.group(1) or mobj.group(2)
4322 mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
4324 raise ExtractorError(u'Unable to extract thumbnail')
4325 thumbnail = mobj.group(1)
4331 'title': video_title,
4332 'description': video_description,
4333 'thumbnail': thumbnail,
# Vine.co extractor: stream URL from the twitter:player:stream meta tag,
# title from og:title, thumbnail from og:image (query string stripped),
# uploader from the <div class="user"> block.
# NOTE(review): elided listing -- `if mobj is None:` guards and the head
# of the returned info dict are missing from this view.
4336 class VineIE(InfoExtractor):
4337 """Information Extractor for Vine.co"""
4338 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4340 def _real_extract(self, url):
4342 mobj = re.match(self._VALID_URL, url)
4344 video_id = mobj.group('id')
# Canonicalize to the https page before fetching.
4345 webpage_url = 'https://vine.co/v/' + video_id
4346 webpage = self._download_webpage(webpage_url, video_id)
4348 self.report_extraction(video_id)
4350 mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
4352 raise ExtractorError(u'Unable to extract video URL')
4353 video_url = mobj.group(1)
4355 mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
4357 raise ExtractorError(u'Unable to extract title')
4358 video_title = mobj.group(1)
# The optional (\?.*?)? group drops any query string from the image URL.
4360 mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
4362 raise ExtractorError(u'Unable to extract thumbnail')
4363 thumbnail = mobj.group(1)
4365 mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
4367 raise ExtractorError(u'Unable to extract uploader')
4368 uploader = mobj.group(1)
4374 'title': video_title,
4375 'thumbnail': thumbnail,
4376 'uploader': uploader,
# Flickr video extractor. Two-step API dance: first read the photo secret
# from the page, fetch the video_mtl XML for the node id, then fetch the
# playlist XML whose <STREAM> element carries the app host + full path
# making up the final video URL. Title/description/thumbnail come from
# the original page's og: meta tags.
# NOTE(review): elided listing -- `if mobj is None:` / else branches and
# the head of the returned info dict are missing from this view.
4379 class FlickrIE(InfoExtractor):
4380 """Information Extractor for Flickr videos"""
4381 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4383 def _real_extract(self, url):
4384 mobj = re.match(self._VALID_URL, url)
4386 video_id = mobj.group('id')
4387 video_uploader_id = mobj.group('uploader_id')
4388 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4389 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo secret is required by both follow-up API requests.
4391 mobj = re.search(r"photo_secret: '(\w+)'", webpage)
4393 raise ExtractorError(u'Unable to extract video secret')
4394 secret = mobj.group(1)
4396 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4397 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4399 mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
4401 raise ExtractorError(u'Unable to extract node_id')
4402 node_id = mobj.group(1)
4404 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4405 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4407 self.report_extraction(video_id)
# Final media URL = APP host + HTML-unescaped FULLPATH from the playlist.
4409 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4411 raise ExtractorError(u'Unable to extract video url')
4412 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# Meta content may be double- or single-quoted; try both capture groups.
4414 mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4416 raise ExtractorError(u'Unable to extract title')
4417 video_title = mobj.group(1) or mobj.group(2)
# Description is optional: warn and continue with None when absent.
4419 mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4421 self._downloader.report_warning(u'unable to extract description')
4422 video_description = None
4424 video_description = mobj.group(1) or mobj.group(2)
4426 mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4428 raise ExtractorError(u'Unable to extract thumbnail')
4429 thumbnail = mobj.group(1) or mobj.group(2)
4435 'title': video_title,
4436 'description': video_description,
4437 'thumbnail': thumbnail,
4438 'uploader_id': video_uploader_id,
# Teamcoco.com extractor. The numeric video id comes from the article's
# data-id attribute; metadata from og: meta tags; the actual media URL is
# read from the high-quality <file> entry of a separate cvp XML document.
# NOTE(review): elided listing -- `if mobj is None:` guards and the head
# of the returned info dict are missing from this view.
4441 class TeamcocoIE(InfoExtractor):
4442 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4444 def _real_extract(self, url):
4445 mobj = re.match(self._VALID_URL, url)
4447 raise ExtractorError(u'Invalid URL: %s' % url)
4448 url_title = mobj.group('url_title')
4449 webpage = self._download_webpage(url, url_title)
# The slug in the URL is not the real id; read it from the article tag.
4451 mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
4452 video_id = mobj.group(1)
4454 self.report_extraction(video_id)
4456 mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
4458 raise ExtractorError(u'Unable to extract title')
4459 video_title = mobj.group(1)
4461 mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
4463 raise ExtractorError(u'Unable to extract thumbnail')
4464 thumbnail = mobj.group(1)
4466 mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
4468 raise ExtractorError(u'Unable to extract description')
4469 description = mobj.group(1)
# Media URLs live in a per-video cvp XML document, not in the HTML page.
4471 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4472 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4473 mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
4475 raise ExtractorError(u'Unable to extract video url')
4476 video_url = mobj.group(1)
4482 'title': video_title,
4483 'thumbnail': thumbnail,
4484 'description': description,
# xHamster extractor. Parses the player's 'srv'/'file' config pair: an
# empty server means 'file' is a URL-encoded direct link, otherwise the
# URL is server + '/key=' + file. Also pulls title, optional description,
# upload date (from a tooltip hint), optional uploader id, and thumbnail.
# NOTE(review): elided listing -- `if mobj is None:` / else branches and
# the head of the returned info dict are missing from this view.
4487 class XHamsterIE(InfoExtractor):
4488 """Information Extractor for xHamster"""
4489 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4491 def _real_extract(self,url):
4492 mobj = re.match(self._VALID_URL, url)
4494 video_id = mobj.group('id')
# Canonical page URL works with an empty title slug.
4495 mrss_url='http://xhamster.com/movies/%s/.html' % video_id
4496 webpage = self._download_webpage(mrss_url, video_id)
4497 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4499 raise ExtractorError(u'Unable to extract media URL')
# Empty server => 'file' is already a (URL-encoded) absolute URL.
4500 if len(mobj.group('server')) == 0:
4501 video_url = compat_urllib_parse.unquote(mobj.group('file'))
4503 video_url = mobj.group('server')+'/key='+mobj.group('file')
4504 video_extension = video_url.split('.')[-1]
4506 mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
4508 raise ExtractorError(u'Unable to extract title')
4509 video_title = unescapeHTML(mobj.group('title'))
# Description is optional; default to the empty string.
4511 mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
4513 video_description = u''
4515 video_description = unescapeHTML(mobj.group('description'))
# Upload date is parsed out of a tooltip hint and joined into YYYYMMDD.
4517 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4519 raise ExtractorError(u'Unable to extract upload date')
4520 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
# Uploader is optional; anonymous uploads have no profile link.
4522 mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
4524 video_uploader_id = u'anonymous'
4526 video_uploader_id = mobj.group('uploader_id')
4528 mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
4530 raise ExtractorError(u'Unable to extract thumbnail URL')
4531 video_thumbnail = mobj.group('thumbnail')
4536 'ext': video_extension,
4537 'title': video_title,
4538 'description': video_description,
4539 'upload_date': video_upload_date,
4540 'uploader_id': video_uploader_id,
4541 'thumbnail': video_thumbnail
# Hype Machine (hypem.com) extractor. Fetches the track page with an
# ax/ts query (capturing the Set-Cookie header), reads the embedded
# displayList-data JSON for the first track's id/artist/title/key, then
# requests the serve/source endpoint (cookie re-sent) whose JSON carries
# the final media URL.
# NOTE(review): elided listing -- the `if mobj is None:` guards, the
# try/except around both json.loads calls, the `key = track[u"key"]`
# assignment, and the final return dict are missing from this view.
4544 class HypemIE(InfoExtractor):
4545 """Information Extractor for hypem"""
4546 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4548 def _real_extract(self, url):
4549 mobj = re.match(self._VALID_URL, url)
4551 raise ExtractorError(u'Invalid URL: %s' % url)
4552 track_id = mobj.group(1)
# The site expects ax=1 and a current timestamp as query parameters.
4554 data = { 'ax': 1, 'ts': time.time() }
4555 data_encoded = compat_urllib_parse.urlencode(data)
4556 complete_url = url + "?" + data_encoded
4557 request = compat_urllib_request.Request(complete_url)
4558 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The session cookie must be replayed on the serve/source request below.
4559 cookie = urlh.headers.get('Set-Cookie', '')
4561 self.report_extraction(track_id)
4562 mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
4564 raise ExtractorError(u'Unable to extrack tracks')
4565 html_tracks = mobj.group(1).strip()
4567 track_list = json.loads(html_tracks)
4568 track = track_list[u'tracks'][0]
4570 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4573 track_id = track[u"id"]
4574 artist = track[u"artist"]
4575 title = track[u"song"]
4577 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4578 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4579 request.add_header('cookie', cookie)
4580 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4582 song_data = json.loads(song_data_json)
4584 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4585 final_url = song_data[u"url"]
# Vbox7 extractor. The play page JS-redirects (window.location); the
# redirect target is followed, the title is read from <title> (up to the
# first '/'), and the media/thumbnail URLs are obtained by POSTing the
# video id to the magare.do info endpoint, whose &-separated key=value
# response is split into (final_url, thumbnail_url).
# NOTE(review): elided listing -- the final return dict head is missing
# from this view.
4595 class Vbox7IE(InfoExtractor):
4596 """Information Extractor for Vbox7"""
4597 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4599 def _real_extract(self,url):
4600 mobj = re.match(self._VALID_URL, url)
4602 raise ExtractorError(u'Invalid URL: %s' % url)
4603 video_id = mobj.group(1)
# Follow the JavaScript window.location redirect embedded in the page.
4605 redirect_page, urlh = self._download_webpage_handle(url, video_id)
4606 redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
4607 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4609 title = re.search(r'<title>(.*)</title>', webpage)
4610 title = (title.group(1)).split('/')[0].strip()
# Media info comes from a form-encoded POST to the magare.do endpoint.
4613 info_url = "http://vbox7.com/play/magare.do"
4614 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4615 info_request = compat_urllib_request.Request(info_url, data)
4616 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4617 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4618 if info_response is None:
4619 raise ExtractorError(u'Unable to extract the media url')
# Response is "k=final_url&k=thumb_url"; keep the value after each '='.
4620 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4627 'thumbnail': thumbnail_url,
# Factory for the full ordered extractor list. Order is significant: the
# downloader uses the first extractor whose suitable() matches the URL, so
# specific extractors must precede generic ones.
# NOTE(review): the returned list is almost entirely elided in this
# listing (only three of the entries between original lines 4633-4697 are
# shown); do not edit without the complete source.
4630 def gen_extractors():
4631 """ Return a list of an instance of every supported extractor.
4632 The order does matter; the first extractor matched is the one handling the URL.
4635 YoutubePlaylistIE(),
4660 StanfordOpenClassroomIE(),
4670 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name

    Resolves the class by its conventional ``<ie_name>IE`` spelling in
    this module's namespace (e.g. ``'Vine'`` -> ``VineIE``).
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]