2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title: Video title, unescaped.
    ext: Video filename extension.

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    uploader: Full name of the video uploader.
    upload_date: Video upload date (YYYYMMDD).
    uploader_id: Nickname or id of the video uploader.
    location: Physical location of the video.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The subtitle file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): takes ``cls`` — presumably decorated with
        # @classmethod in the full source; confirm against the repository.
        return re.match(cls._VALID_URL, url) is not None

        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # Body of the IE_NAME property: the class name minus the trailing
        # "IE" suffix (e.g. YoutubeIE -> "Youtube").
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = u'Unable to download webpage'
            # Re-raise with the original traceback so the network failure stays visible.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of e.g. "text/html; charset=utf-8".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump binary-safe regardless of page encoding.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids raising on pages whose declared charset is wrong.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    def _make_valid_url(cls):
        # prefix is empty (first result only), a positive count, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Cap the request at the extractor's maximum instead of failing.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): "sublclasses" below is a typo for "subclasses" in a
        # runtime string; left untouched here.
        raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

                     (?:https?://)? # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                         )? # optional -> youtube.com/xxxx is OK
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; let the playlist IE take them.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check the available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Map lang_code -> human-readable track name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)

    def _list_available_subtitles(self, video_id):
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return a tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives inside the page's ytplayer.config JSON blob.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
            return [(err_msg, None, None)]

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # Fall back to the first available language.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)

    def _extract_all_subtitles(self, video_id):
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)

    def _print_formats(self, formats):
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best effort: missing/broken .netrc only produces a warning.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Google's login form embeds anti-forgery tokens (GALX / dsh).
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 of _VALID_URL is the bare 11-char-style video ID.
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the backslash-escaped URL found in the page's JS config.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info
        self.report_video_info_webpage_download(video_id)
        # Try several 'el' variants; stop at the first response with a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize separators to spaces before parsing the date string.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # Fall back to the meta description tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                # We try with the automatic captions
                video_subtitles = self._request_automatic_caption(video_id, video_webpage)
                (sub_error_auto, sub_lang, sub) = video_subtitles[0]
                    # We report the original error
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    # 'sig' carries the URL signature; it must be appended explicitly.
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted videos to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fall back to the flashvars blob when no direct mediaURL param is present.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                raise ExtractorError(u'Unable to extract media URL')
            # The page escapes slashes in the media URL; undo that.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still load.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Qualities listed best-first; the first key present wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for the official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
            info = json.loads(mobj.group('json'))
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
991 class YahooIE(InfoExtractor):
992 """Information extractor for screen.yahoo.com."""
# NOTE(review): elided capture — "if ... is None:" guards and the
# "return [{ ... }]" opener are missing from view.
993 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
995 def _real_extract(self, url):
996 mobj = re.match(self._VALID_URL, url)
998 raise ExtractorError(u'Invalid URL: %s' % url)
999 video_id = mobj.group('id')
1000 webpage = self._download_webpage(url, video_id)
# Two strategies: when the page defines no Media CONTENT_ID, query the
# legacy cosmos.bcst.yahoo.com MRSS endpoints; otherwise (else branch) use
# the newer YQL JSON API keyed by the long id.
1001 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1004 # TODO: Check which url parameters are required
1005 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1006 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose/DOTALL regex over the MRSS XML: title, description, date and
# large-thumbnail URL extracted in one pass.
1007 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1008 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1009 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1010 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1012 self.report_extraction(video_id)
1013 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1015 raise ExtractorError(u'Unable to extract video info')
1016 video_title = m_info.group('title')
1017 video_description = m_info.group('description')
1018 video_thumb = m_info.group('thumb')
1019 video_date = m_info.group('date')
# Normalise MM/DD/YYYY to the YYYYMMDD upload_date convention.
1020 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1022 # TODO: Find a way to get mp4 videos
1023 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1024 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
# 'url' is the rtmp host and 'path' the rtmp play path (see 'play_path'
# in the returned dict).
1025 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1026 video_url = m_rest.group('url')
1027 video_path = m_rest.group('path')
1029 raise ExtractorError(u'Unable to extract video url')
1031 else: # We have to use a different method if another id is defined
1032 long_id = m_id.group('new_id')
1033 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1034 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# The YQL response is JSONP; strip the callback wrapper before json.loads.
1035 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1036 info = json.loads(json_str)
1037 res = info[u'query'][u'results'][u'mediaObj'][0]
1038 stream = res[u'streams'][0]
1039 video_path = stream[u'path']
1040 video_url = stream[u'host']
# NOTE(review): `meta` is presumably assigned from `res` in a line elided
# from this capture — verify against the full source.
1042 video_title = meta[u'title']
1043 video_description = meta[u'description']
1044 video_thumb = meta[u'thumbnail']
1045 video_date = None # I can't find it
1050 'play_path': video_path,
1051 'title':video_title,
1052 'description': video_description,
1053 'thumbnail': video_thumb,
1054 'upload_date': video_date,
1059 class VimeoIE(InfoExtractor):
1060 """Information extractor for vimeo.com."""
# NOTE(review): elided capture — the try/except around the config split,
# several "if ... is None:" guards and the "return [{ ... }]" opener are
# missing from view.
1062 # _VALID_URL matches Vimeo URLs
1063 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1066 def _real_extract(self, url, new_video=True):
1067 # Extract ID from URL
1068 mobj = re.match(self._VALID_URL, url)
1070 raise ExtractorError(u'Invalid URL: %s' % url)
1072 video_id = mobj.group('id')
# Canonicalise the URL: force https, and rewrite pro/direct-link forms to
# the plain vimeo.com/<id> page which embeds the config JSON.
1073 if not mobj.group('proto'):
1074 url = 'https://' + url
1075 if mobj.group('direct_link') or mobj.group('pro'):
1076 url = 'https://vimeo.com/' + video_id
1078 # Retrieve video webpage to extract further information
1079 request = compat_urllib_request.Request(url, None, std_headers)
1080 webpage = self._download_webpage(request, video_id)
1082 # Now we begin extracting as much information as we can from what we
1083 # retrieved. First we extract the information common to all extractors,
1084 # and latter we extract those that are Vimeo specific.
1085 self.report_extraction(video_id)
1087 # Extract the config JSON
# Crude but effective: slice the page between ' = {config:' and ',assets:'
# to recover the embedded player-config JSON.
1089 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1090 config = json.loads(config)
1092 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1093 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1095 raise ExtractorError(u'Unable to extract info section')
1098 video_title = config["video"]["title"]
1100 # Extract uploader and uploader_id
1101 video_uploader = config["video"]["owner"]["name"]
1102 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1104 # Extract video thumbnail
1105 video_thumbnail = config["video"]["thumbnail"]
1107 # Extract video description
1108 video_description = get_element_by_attribute("itemprop", "description", webpage)
1109 if video_description: video_description = clean_html(video_description)
1110 else: video_description = u''
1112 # Extract upload date
1113 video_upload_date = None
1114 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1115 if mobj is not None:
1116 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1118 # Vimeo specific: extract request signature and timestamp
# Both are required query parameters of the play_redirect URL built below.
1119 sig = config['request']['signature']
1120 timestamp = config['request']['timestamp']
1122 # Vimeo specific: extract video codec and quality information
1123 # First consider quality, then codecs, then take everything
1124 # TODO bind to format param
# Bucket each available codec by best quality (hd > sd > other), then take
# the first non-empty bucket in that order.
1125 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1126 files = { 'hd': [], 'sd': [], 'other': []}
1127 for codec_name, codec_extension in codecs:
1128 if codec_name in config["video"]["files"]:
1129 if 'hd' in config["video"]["files"][codec_name]:
1130 files['hd'].append((codec_name, codec_extension, 'hd'))
1131 elif 'sd' in config["video"]["files"][codec_name]:
1132 files['sd'].append((codec_name, codec_extension, 'sd'))
1134 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1136 for quality in ('hd', 'sd', 'other'):
1137 if len(files[quality]) > 0:
1138 video_quality = files[quality][0][2]
1139 video_codec = files[quality][0][0]
1140 video_extension = files[quality][0][1]
1141 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1144 raise ExtractorError(u'No known codec found')
1146 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1147 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1152 'uploader': video_uploader,
1153 'uploader_id': video_uploader_id,
1154 'upload_date': video_upload_date,
1155 'title': video_title,
1156 'ext': video_extension,
1157 'thumbnail': video_thumbnail,
1158 'description': video_description,
1162 class ArteTvIE(InfoExtractor):
1163 """arte.tv information extractor."""
# NOTE(review): elided capture — "try:" openers, "if mobj is None:" guards,
# the "info = {}" initialisation in grep_webpage, return statements and
# several call-site argument lines are missing from view.
1165 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1166 _LIVE_URL = r'index-[0-9]+\.html$'
1168 IE_NAME = u'arte.tv'
1170 def fetch_webpage(self, url):
# Download `url` and return the raw page body, wrapping network/URL
# failures in ExtractorError.
1171 request = compat_urllib_request.Request(url)
1173 self.report_download_webpage(url)
1174 webpage = compat_urllib_request.urlopen(request).read()
1175 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1176 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1177 except ValueError as err:
1178 raise ExtractorError(u'Invalid URL: %s' % url)
1181 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex`, and return a dict mapping each `key` from
# matchTuples (i, key, err) to group(i); raises `err` if a group is empty.
1182 page = self.fetch_webpage(url)
1183 mobj = re.search(regex, page, regexFlags)
1187 raise ExtractorError(u'Invalid URL: %s' % url)
1189 for (i, key, err) in matchTuples:
1190 if mobj.group(i) is None:
1191 raise ExtractorError(err)
1193 info[key] = mobj.group(i)
1197 def extractLiveStream(self, url):
# Live streams: locate the videothek JS, then grep it for the
# geo-restricted rtmp path, swf player and stream base url.
1198 video_lang = url.split('/')[-4]
1199 info = self.grep_webpage(
1201 r'src="(.*?/videothek_js.*?\.js)',
1204 (1, 'url', u'Invalid URL: %s' % url)
1207 http_host = url.split('/')[2]
1208 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1209 info = self.grep_webpage(
1211 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1212 '(http://.*?\.swf).*?' +
1216 (1, 'path', u'could not extract video path: %s' % url),
1217 (2, 'player', u'could not extract video player: %s' % url),
1218 (3, 'url', u'could not extract video url: %s' % url)
1221 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1223 def extractPlus7Stream(self, url):
# Arte+7 (catch-up) videos: follow two levels of indirection (movie param
# -> language-specific ref) before grepping the final XML for metadata.
1224 video_lang = url.split('/')[-3]
1225 info = self.grep_webpage(
1227 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1230 (1, 'url', u'Invalid URL: %s' % url)
1233 next_url = compat_urllib_parse.unquote(info.get('url'))
1234 info = self.grep_webpage(
1236 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1239 (1, 'url', u'Could not find <video> tag: %s' % url)
1242 next_url = compat_urllib_parse.unquote(info.get('url'))
1244 info = self.grep_webpage(
1246 r'<video id="(.*?)".*?>.*?' +
1247 '<name>(.*?)</name>.*?' +
1248 '<dateVideo>(.*?)</dateVideo>.*?' +
1249 '<url quality="hd">(.*?)</url>',
1252 (1, 'id', u'could not extract video id: %s' % url),
1253 (2, 'title', u'could not extract video title: %s' % url),
1254 (3, 'date', u'could not extract video date: %s' % url),
1255 (4, 'url', u'could not extract video url: %s' % url)
1260 'id': info.get('id'),
1261 'url': compat_urllib_parse.unquote(info.get('url')),
1262 'uploader': u'arte.tv',
1263 'upload_date': unified_strdate(info.get('date')),
1264 'title': info.get('title').decode('utf-8'),
1270 def _real_extract(self, url):
# Dispatch on URL shape: live streams vs regular Plus7 pages.
1271 video_id = url.split('/')[-1]
1272 self.report_extraction(video_id)
1274 if re.search(self._LIVE_URL, video_id) is not None:
# NOTE(review): extractLiveStream's result is discarded here — presumably
# the live branch raises/returns in elided lines; verify in full source.
1275 self.extractLiveStream(url)
1278 info = self.extractPlus7Stream(url)
1283 class GenericIE(InfoExtractor):
1284 """Generic last-resort information extractor."""
# NOTE(review): elided capture — "if mobj is None:" guards between the
# successive regex fallbacks, exception handling around the HEAD request
# and the "return [{ ... }]" opener are missing from view.
1287 IE_NAME = u'generic'
1289 def report_download_webpage(self, video_id):
1290 """Report webpage download."""
# Warn (outside tests) that we are falling back to the generic extractor.
1291 if not self._downloader.params.get('test', False):
1292 self._downloader.report_warning(u'Falling back on generic information extractor.')
1293 super(GenericIE, self).report_download_webpage(video_id)
1295 def report_following_redirect(self, new_url):
1296 """Report information extraction."""
1297 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1299 def _test_redirect(self, url):
1300 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Issue a HEAD request (cheap — no body) to resolve shortener redirects.
1301 class HeadRequest(compat_urllib_request.Request):
1302 def get_method(self):
1305 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1307 Subclass the HTTPRedirectHandler to make it use our
1308 HeadRequest also on the redirected URL
1310 def redirect_request(self, req, fp, code, msg, headers, newurl):
1311 if code in (301, 302, 303, 307):
1312 newurl = newurl.replace(' ', '%20')
# Body-describing headers are dropped: a HEAD request carries no body.
1313 newheaders = dict((k,v) for k,v in req.headers.items()
1314 if k.lower() not in ("content-length", "content-type"))
1315 return HeadRequest(newurl,
1317 origin_req_host=req.get_origin_req_host(),
1320 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1322 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1324 Fallback to GET if HEAD is not allowed (405 HTTP error)
1326 def http_error_405(self, req, fp, code, msg, headers):
1330 newheaders = dict((k,v) for k,v in req.headers.items()
1331 if k.lower() not in ("content-length", "content-type"))
1332 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1334 origin_req_host=req.get_origin_req_host(),
# Build a bespoke opener wired with the HEAD-aware handlers above.
1338 opener = compat_urllib_request.OpenerDirector()
1339 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1340 HTTPMethodFallback, HEADRedirectHandler,
1341 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1342 opener.add_handler(handler())
1344 response = opener.open(HeadRequest(url))
1345 if response is None:
1346 raise ExtractorError(u'Invalid URL protocol')
1347 new_url = response.geturl()
1352 self.report_following_redirect(new_url)
1355 def _real_extract(self, url):
# Last-resort extraction: resolve redirects, then probe the page with a
# cascade of regexes for an embedded media URL.
1356 new_url = self._test_redirect(url)
1357 if new_url: return [self.url_result(new_url)]
1359 video_id = url.split('/')[-1]
1361 webpage = self._download_webpage(url, video_id)
1362 except ValueError as err:
1363 # since this is the last-resort InfoExtractor, if
1364 # this error is thrown, it'll be thrown here
1365 raise ExtractorError(u'Invalid URL: %s' % url)
1367 self.report_extraction(video_id)
1368 # Start with something easy: JW Player in SWFObject
1369 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1371 # Broaden the search a little bit
1372 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1374 # Broaden the search a little bit: JWPlayer JS loader
1375 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1377 # Try to find twitter cards info
1378 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1380 raise ExtractorError(u'Invalid URL: %s' % url)
1382 # It's possible that one of the regexes
1383 # matched, but returned an empty group:
1384 if mobj.group(1) is None:
1385 raise ExtractorError(u'Invalid URL: %s' % url)
1387 video_url = compat_urllib_parse.unquote(mobj.group(1))
1388 video_id = os.path.basename(video_url)
1390 # here's a fun little line of code for you:
1391 video_extension = os.path.splitext(video_id)[1][1:]
1392 video_id = os.path.splitext(video_id)[0]
1394 # it's tempting to parse this further, but you would
1395 # have to take into account all the variations like
1396 # Video Title - Site Name
1397 # Site Name | Video Title
1398 # Video Title - Tagline | Site Name
1399 # and so on and so forth; it's just not practical
1400 mobj = re.search(r'<title>(.*)</title>', webpage)
1402 raise ExtractorError(u'Unable to extract title')
1403 video_title = mobj.group(1)
1405 # video uploader is domain name
1406 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1408 raise ExtractorError(u'Unable to extract title')
1409 video_uploader = mobj.group(1)
1414 'uploader': video_uploader,
1415 'upload_date': None,
1416 'title': video_title,
1417 'ext': video_extension,
1421 class YoutubeSearchIE(SearchInfoExtractor):
1422 """Information Extractor for YouTube search queries."""
# NOTE(review): elided capture — the initialisation of video_ids/pagenum/
# limit before the while loop is missing from view.
1423 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1425 IE_NAME = u'youtube:search'
1426 _SEARCH_KEY = 'ytsearch'
1428 def report_download_page(self, query, pagenum):
1429 """Report attempt to download search page with given number."""
1430 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1432 def _get_n_results(self, query, n):
1433 """Get a specified number of results for a query"""
# Page through the GData API (50 results per page) until `limit` ids are
# collected; limit is clamped to the API-reported totalItems below.
1439 while (50 * pagenum) < limit:
1440 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the GData API, hence the +1.
1441 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1442 request = compat_urllib_request.Request(result_url)
1444 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1445 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1446 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1447 api_response = json.loads(data)['data']
1449 if not 'items' in api_response:
1450 raise ExtractorError(u'[youtube] No video results')
1452 new_ids = list(video['id'] for video in api_response['items'])
1453 video_ids += new_ids
1455 limit = min(n, api_response['totalItems'])
# Trim any overshoot from the final page before building results.
1458 if len(video_ids) > n:
1459 video_ids = video_ids[:n]
1460 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1461 return self.playlist_result(videos, query)
1464 class GoogleSearchIE(SearchInfoExtractor):
1465 """Information Extractor for Google Video search queries."""
# NOTE(review): elided capture — the full initialisation of the `res`
# playlist dict (id/title/entries) is only partially visible.
1466 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1468 IE_NAME = u'video.google:search'
1469 _SEARCH_KEY = 'gvsearch'
1471 def _get_n_results(self, query, n):
1472 """Get a specified number of results for a query"""
1475 '_type': 'playlist',
# Scrape result pages (10 hits each) until n results are collected or the
# "next page" marker disappears.
1480 for pagenum in itertools.count(1):
1481 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1482 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1483 note='Downloading result page ' + str(pagenum))
1485 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1488 'url': mobj.group(1)
1490 res['entries'].append(e)
1492 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1495 class YahooSearchIE(SearchInfoExtractor):
1496 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): elided capture — `res` initialisation is partial and the
# variable `m` used in the loop-exit test is presumably bound from the
# JSON response (e.g. info[u'm']) in a line missing from this view.
1499 IE_NAME = u'screen.yahoo:search'
1500 _SEARCH_KEY = 'yvsearch'
1502 def _get_n_results(self, query, n):
1503 """Get a specified number of results for a query"""
1506 '_type': 'playlist',
# Yahoo paginates 30 results per page; `b` is the 0-based result offset.
1510 for pagenum in itertools.count(0):
1511 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1512 webpage = self._download_webpage(result_url, query,
1513 note='Downloading results page '+str(pagenum+1))
1514 info = json.loads(webpage)
1516 results = info[u'results']
1518 for (i, r) in enumerate(results):
# Stop mid-page once n results have been gathered.
1519 if (pagenum * 30) +i >= n:
1521 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1522 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1523 res['entries'].append(e)
1524 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1530 class YoutubePlaylistIE(InfoExtractor):
1531 """Information Extractor for YouTube playlists."""
# NOTE(review): elided capture — parts of the verbose _VALID_URL literal,
# the videos/page_num loop setup, and the "if mobj is None:" guard are
# missing from view.
1533 _VALID_URL = r"""(?:
1538 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1539 \? (?:.*?&)*? (?:p|a|list)=
1542 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1545 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1547 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1549 IE_NAME = u'youtube:playlist'
1552 def suitable(cls, url):
1553 """Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is written with re.VERBOSE, so the base-class suitable()
# (which matches without flags) is overridden here.
1554 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1556 def _real_extract(self, url):
1557 # Extract playlist id
1558 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1560 raise ExtractorError(u'Invalid URL: %s' % url)
1562 # Download playlist videos from API
# Either alternative in _VALID_URL may have captured the playlist id.
1563 playlist_id = mobj.group(1) or mobj.group(2)
1568 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1569 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1572 response = json.loads(page)
1573 except ValueError as err:
1574 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1576 if 'feed' not in response:
1577 raise ExtractorError(u'Got a malformed response from YouTube API')
1578 playlist_title = response['feed']['title']['$t']
1579 if 'entry' not in response['feed']:
1580 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, video URL) pairs so the playlist order can be
# restored by the sort below.
1583 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1584 for entry in response['feed']['entry']
1585 if 'content' in entry ]
1587 if len(response['feed']['entry']) < self._MAX_RESULTS:
1591 videos = [v[1] for v in sorted(videos)]
1593 url_results = [self.url_result(url, 'Youtube') for url in videos]
1594 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1597 class YoutubeChannelIE(InfoExtractor):
1598 """Information Extractor for YouTube channels."""
# NOTE(review): elided capture — ids_in_page initialisation, the
# video_ids/pagenum loop setup, the "if mobj is None:" guard and some
# loop/return lines are missing from view.
1600 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1601 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1602 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1603 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1604 IE_NAME = u'youtube:channel'
1606 def extract_videos_from_page(self, page):
# Scrape watch?v= ids from raw channel HTML, de-duplicating while
# preserving first-seen order.
1608 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1609 if mobj.group(1) not in ids_in_page:
1610 ids_in_page.append(mobj.group(1))
1613 def _real_extract(self, url):
1614 # Extract channel id
1615 mobj = re.match(self._VALID_URL, url)
1617 raise ExtractorError(u'Invalid URL: %s' % url)
1619 # Download channel page
1620 channel_id = mobj.group(1)
1624 url = self._TEMPLATE_URL % (channel_id, pagenum)
1625 page = self._download_webpage(url, channel_id,
1626 u'Downloading page #%s' % pagenum)
1628 # Extract video identifiers
1629 ids_in_page = self.extract_videos_from_page(page)
1630 video_ids.extend(ids_in_page)
1632 # Download any subsequent channel pages using the json-based channel_ajax query
1633 if self._MORE_PAGES_INDICATOR in page:
1635 pagenum = pagenum + 1
1637 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1638 page = self._download_webpage(url, channel_id,
1639 u'Downloading page #%s' % pagenum)
# Ajax pages return JSON whose 'content_html' holds the video list HTML.
1641 page = json.loads(page)
1643 ids_in_page = self.extract_videos_from_page(page['content_html'])
1644 video_ids.extend(ids_in_page)
1646 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1649 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1651 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1652 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1653 return [self.playlist_result(url_entries, channel_id)]
1656 class YoutubeUserIE(InfoExtractor):
1657 """Information Extractor for YouTube users."""
# NOTE(review): elided capture — "if mobj is None:" guard, the
# video_ids/pagenum loop setup, ids_in_page initialisation and the loop's
# break/increment lines are missing from view.
1659 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1660 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1661 _GDATA_PAGE_SIZE = 50
1662 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1663 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1664 IE_NAME = u'youtube:user'
1666 def _real_extract(self, url):
1668 mobj = re.match(self._VALID_URL, url)
1670 raise ExtractorError(u'Invalid URL: %s' % url)
1672 username = mobj.group(1)
1674 # Download video ids using YouTube Data API. Result size per
1675 # query is limited (currently to 50 videos) so we need to query
1676 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1683 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1685 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1686 page = self._download_webpage(gdata_url, username,
1687 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1689 # Extract video identifiers
# De-duplicate ids within the page while preserving order.
1692 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1693 if mobj.group(1) not in ids_in_page:
1694 ids_in_page.append(mobj.group(1))
1696 video_ids.extend(ids_in_page)
1698 # A little optimization - if current page is not
1699 # "full", ie. does not contain PAGE_SIZE video ids then
1700 # we can assume that this page is the last one - there
1701 # are no more ids on further pages - no need to query
1704 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1709 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1710 url_results = [self.url_result(url, 'Youtube') for url in urls]
1711 return [self.playlist_result(url_results, playlist_title = username)]
1714 class BlipTVUserIE(InfoExtractor):
1715 """Information Extractor for blip.tv users."""
# NOTE(review): elided capture — the _PAGE_SIZE class attribute referenced
# below, "if mobj is None:" guards, and the pagination loop setup are
# missing from view.
1717 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1719 IE_NAME = u'blip.tv:user'
1721 def _real_extract(self, url):
1723 mobj = re.match(self._VALID_URL, url)
1725 raise ExtractorError(u'Invalid URL: %s' % url)
1727 username = mobj.group(1)
1729 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# The numeric users_id needed by the Ajax endpoint is scraped from the
# user's mobile page.
1731 page = self._download_webpage(url, username, u'Downloading user page')
1732 mobj = re.search(r'data-users-id="([^"]+)"', page)
1733 page_base = page_base % mobj.group(1)
1736 # Download video ids using BlipTV Ajax calls. Result size per
1737 # query is limited (currently to 12 videos) so we need to query
1738 # page by page until there are no video ids - it means we got
1745 url = page_base + "&page=" + str(pagenum)
1746 page = self._download_webpage(url, username,
1747 u'Downloading video ids from page %d' % pagenum)
1749 # Extract video identifiers
# De-duplicate hrefs within the page while preserving order.
1752 for mobj in re.finditer(r'href="/([^"]+)"', page):
1753 if mobj.group(1) not in ids_in_page:
1754 ids_in_page.append(unescapeHTML(mobj.group(1)))
1756 video_ids.extend(ids_in_page)
1758 # A little optimization - if current page is not
1759 # "full", ie. does not contain PAGE_SIZE video ids then
1760 # we can assume that this page is the last one - there
1761 # are no more ids on further pages - no need to query
1764 if len(ids_in_page) < self._PAGE_SIZE:
1769 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1770 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1771 return [self.playlist_result(url_entries, playlist_title = username)]
1774 class DepositFilesIE(InfoExtractor):
1775 """Information extractor for depositfiles.com"""
# NOTE(review): elided capture — "try:" opener, "if mobj is None:" guard
# and the "return [{ ... }]" opener are missing from view.
1777 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1779 def _real_extract(self, url):
1780 file_id = url.split('/')[-1]
1781 # Rebuild url in english locale
1782 url = 'http://depositfiles.com/en/files/' + file_id
1784 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1785 free_download_indication = { 'gateway_result' : '1' }
1786 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1788 self.report_download_webpage(file_id)
1789 webpage = compat_urllib_request.urlopen(request).read()
1790 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1791 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1793 # Search for the real file URL
1794 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1795 if (mobj is None) or (mobj.group(1) is None):
1796 # Try to figure out reason of the error.
# Surface the site's own restriction message (e.g. rate limiting) when
# the download form is absent.
1797 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1798 if (mobj is not None) and (mobj.group(1) is not None):
1799 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1800 raise ExtractorError(u'%s' % restriction_message)
1802 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1804 file_url = mobj.group(1)
1805 file_extension = os.path.splitext(file_url)[1][1:]
1807 # Search for file title
1808 mobj = re.search(r'<b title="(.*?)">', webpage)
1810 raise ExtractorError(u'Unable to extract title')
# .decode('utf-8') implies Python 2 bytestrings; would break on Python 3
# str — TODO confirm target interpreter.
1811 file_title = mobj.group(1).decode('utf-8')
1814 'id': file_id.decode('utf-8'),
1815 'url': file_url.decode('utf-8'),
1817 'upload_date': None,
1818 'title': file_title,
1819 'ext': file_extension.decode('utf-8'),
1823 class FacebookIE(InfoExtractor):
1824 """Information Extractor for Facebook"""
# NOTE(review): elided capture — the useremail/password defaults, the
# netrc unpacking, the login_form construction, several guards and the
# "return [{ ... }]" opener are missing from view.
1826 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1827 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1828 _NETRC_MACHINE = 'facebook'
1829 IE_NAME = u'facebook'
1831 def report_login(self):
1832 """Report attempt to log in."""
1833 self.to_screen(u'Logging in')
1835 def _real_initialize(self):
# Optional login step: credentials come from --username/--password or,
# with --netrc, from the 'facebook' machine entry. All failures are
# warnings — extraction proceeds unauthenticated.
1836 if self._downloader is None:
1841 downloader_params = self._downloader.params
1843 # Attempt to use provided username and password or .netrc data
1844 if downloader_params.get('username', None) is not None:
1845 useremail = downloader_params['username']
1846 password = downloader_params['password']
1847 elif downloader_params.get('usenetrc', False):
1849 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1850 if info is not None:
1854 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1855 except (IOError, netrc.NetrcParseError) as err:
1856 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1859 if useremail is None:
1868 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1871 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> in the response means authentication did not succeed.
1872 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1873 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1875 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1876 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1879 def _real_extract(self, url):
1880 mobj = re.match(self._VALID_URL, url)
1882 raise ExtractorError(u'Invalid URL: %s' % url)
1883 video_id = mobj.group('ID')
1885 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1886 webpage = self._download_webpage(url, video_id)
# The player parameters sit between these two JS snippets; everything in
# between is a JSON array of (name, value) pairs.
1888 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1889 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1890 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1892 raise ExtractorError(u'Cannot parse data')
1893 data = dict(json.loads(m.group(1)))
1894 params_raw = compat_urllib_parse.unquote(data['params'])
1895 params = json.loads(params_raw)
1896 video_data = params['video_data'][0]
# Prefer the HD source and fall back to SD.
1897 video_url = video_data.get('hd_src')
1899 video_url = video_data['sd_src']
1901 raise ExtractorError(u'Cannot find video URL')
1902 video_duration = int(video_data['video_duration'])
1903 thumbnail = video_data['thumbnail_src']
1905 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
1907 raise ExtractorError(u'Cannot find title in webpage')
1908 video_title = unescapeHTML(m.group(1))
1912 'title': video_title,
1915 'duration': video_duration,
1916 'thumbnail': thumbnail,
1921 class BlipTVIE(InfoExtractor):
1922 """Information extractor for blip.tv"""
# NOTE(review): elided capture — the cchar (?/& query separator) choice,
# "info = None" initialisation, "try:" openers, "if ... is None:" guards
# and the "return [info]" lines are missing from view.
1924 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1925 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1926 IE_NAME = u'blip.tv'
1928 def report_direct_download(self, title):
1929 """Report information extraction."""
1930 self.to_screen(u'%s: Direct download detected' % title)
1932 def _real_extract(self, url):
1933 mobj = re.match(self._VALID_URL, url)
1935 raise ExtractorError(u'Invalid URL: %s' % url)
1937 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf#id URLs are first rewritten to /play/ URLs...
1938 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1939 if api_mobj is not None:
1940 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1941 urlp = compat_urllib_parse_urlparse(url)
1942 if urlp.path.startswith('/play/'):
# ...and /play/ URLs are resolved by following the redirect, whose
# fragment carries the file id; then recurse on the canonical /a/a- URL.
1943 request = compat_urllib_request.Request(url)
1944 response = compat_urllib_request.urlopen(request)
1945 redirecturl = response.geturl()
1946 rurlp = compat_urllib_parse_urlparse(redirecturl)
1947 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1948 url = 'http://blip.tv/a/a-' + file_id
1949 return self._real_extract(url)
# Ask blip.tv for the JSON description of the video; the iTunes
# User-Agent is required for this endpoint to respond.
1956 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1957 request = compat_urllib_request.Request(json_url)
1958 request.add_header('User-Agent', 'iTunes/10.6.1')
1959 self.report_extraction(mobj.group(1))
1962 urlh = compat_urllib_request.urlopen(request)
1963 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1964 basename = url.split('/')[-1]
1965 title,ext = os.path.splitext(basename)
1966 title = title.decode('UTF-8')
1967 ext = ext.replace('.', '')
1968 self.report_direct_download(title)
1973 'upload_date': None,
1978 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1979 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
1980 if info is None: # Regular URL
1982 json_code_bytes = urlh.read()
1983 json_code = json_code_bytes.decode('utf-8')
1984 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1985 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
1988 json_data = json.loads(json_code)
1989 if 'Post' in json_data:
1990 data = json_data['Post']
# Blip timestamps look like "MM-DD-YY HH:MMam/pm"; normalised to YYYYMMDD.
1994 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1995 video_url = data['media']['url']
1996 umobj = re.match(self._URL_EXT, video_url)
1998 raise ValueError('Can not determine filename extension')
1999 ext = umobj.group(1)
2002 'id': data['item_id'],
2004 'uploader': data['display_name'],
2005 'upload_date': upload_date,
2006 'title': data['title'],
2008 'format': data['media']['mimeType'],
2009 'thumbnail': data['thumbnailUrl'],
2010 'description': data['description'],
2011 'player_url': data['embedUrl'],
# The same UA must be used for the media download itself.
2012 'user_agent': 'iTunes/10.6.1',
2014 except (ValueError,KeyError) as err:
2015 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2020 class MyVideoIE(InfoExtractor):
2021 """Information Extractor for myvideo.de."""
# NOTE(review): original line numbers (left column) jump, so several guards,
# loop headers and dict openers are elided from this listing. Comments only
# describe what is visible; elided pieces are flagged as assumptions.
2023 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2024 IE_NAME = u'myvideo'
2026 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2027 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2028 # https://github.com/rg3/youtube-dl/pull/842
2029 def __rc4crypt(self,data, key):
# Plain RC4: key-scheduling pass over a 256-entry box, then the PRGA
# XOR stream over `data`. (Init of x/y/out is on elided lines.)
2031 box = list(range(256))
2032 for i in list(range(256)):
2033 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2034 box[i], box[x] = box[x], box[i]
2040 y = (y + box[x]) % 256
2041 box[x], box[y] = box[y], box[x]
2042 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# Tail of an elided md5 helper: hex digest of `s`, returned as bytes.
2046 return hashlib.md5(s).hexdigest().encode()
2048 def _real_extract(self,url):
2049 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2051 raise ExtractorError(u'invalid URL: %s' % url)
2053 video_id = mobj.group(1)
# Doubly-base64-encoded site key used to build the RC4 key below.
2056 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2057 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2058 b'TnpsbA0KTVRkbU1tSTRNdz09'
2062 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2063 webpage = self._download_webpage(webpage_url, video_id)
# Easy path: a plain <source src=...> tag means a direct FLV is available.
2065 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2066 if mobj is not None:
2067 self.report_extraction(video_id)
2068 video_url = mobj.group(1) + '.flv'
2070 mobj = re.search('<title>([^<]+)</title>', webpage)
2072 raise ExtractorError(u'Unable to extract title')
2073 video_title = mobj.group(1)
2075 mobj = re.search('[.](.+?)$', video_url)
2077 raise ExtractorError(u'Unable to extract extention')
2078 video_ext = mobj.group(1)
2084 'upload_date': None,
2085 'title': video_title,
# Hard path: parse the flashvars blob for the encrypted player XML.
2090 mobj = re.search('var flashvars={(.+?)}', webpage)
2092 raise ExtractorError(u'Unable to extract video')
# Collect flashvars into `params`, keeping '_encxml' (the encrypted XML
# endpoint) aside. (Loop/dict setup partially elided.)
2097 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2098 if not a == '_encxml':
2101 encxml = compat_urllib_parse.unquote(b)
2102 if not params.get('domain'):
2103 params['domain'] = 'www.myvideo.de'
2104 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2105 if 'flash_playertype=MTV' in xmldata_url:
# The MTV player variant is not handled; fall back to player type D.
2106 self._downloader.report_warning(u'avoiding MTV player')
2108 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2109 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response is 'something=<hex>'; keep the hex payload and RC4-decrypt it
# with a key derived from GK and the video id (key derivation elided).
2113 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2114 enc_data_b = binascii.unhexlify(enc_data)
2116 base64.b64decode(base64.b64decode(GK)) +
2118 str(video_id).encode('utf-8')
2121 dec_data = self.__rc4crypt(enc_data_b, sk)
2124 self.report_extraction(video_id)
# RTMP branch: pull connectionurl out of the decrypted XML.
2126 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2128 raise ExtractorError(u'unable to extract rtmpurl')
2129 video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
2130 if 'myvideo2flash' in video_rtmpurl:
# Known-bad rtmpe endpoint; downgrade to RTMPT so rtmpdump can cope.
2131 self._downloader.report_warning(u'forcing RTMPT ...')
2132 video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
2134 # extract non rtmp videos
2135 if (video_rtmpurl is None) or (video_rtmpurl == ''):
2136 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2138 raise ExtractorError(u'unable to extract url')
2139 video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2141 mobj = re.search('source=\'(.*?)\'', dec_data)
2143 raise ExtractorError(u'unable to extract swfobj')
2144 video_file = compat_urllib_parse.unquote(mobj.group(1))
2146 if not video_file.endswith('f4m'):
# Non-f4m files: build the rtmpdump play path as '<ext>:<path>'.
2147 ppath, prefix = video_file.split('.')
2148 video_playpath = '%s:%s' % (prefix, ppath)
2149 video_hls_playlist = ''
# f4m manifests map to an HLS .m3u8 playlist instead.
2152 video_hls_playlist = (
2153 video_filepath + video_file
2154 ).replace('.f4m', '.m3u8')
2156 mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
2158 raise ExtractorError(u'unable to extract swfobj')
2159 video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
2161 mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
2163 raise ExtractorError(u'unable to extract title')
2164 video_title = mobj.group(1)
2168 'url': video_rtmpurl,
2169 'tc_url': video_rtmpurl,
2171 'upload_date': None,
2172 'title': video_title,
2174 'play_path': video_playpath,
2175 'video_file': video_file,
2176 'video_hls_playlist': video_hls_playlist,
2177 'player_url': video_swfobj,
2180 class ComedyCentralIE(InfoExtractor):
2181 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): original line numbers jump, so guards, dict literals and
# loop bodies are partially elided from this listing.
2183 # urls can be abbreviations like :thedailyshow or :colbert
2184 # urls for episodes like:
2185 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2186 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2187 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2188 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2189 |(https?://)?(www\.)?
2190 (?P<showname>thedailyshow|colbertnation)\.com/
2191 (full-episodes/(?P<episode>.*)|
2193 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2194 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, best last (picked by turls[-1] below).
2197 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2199 _video_extensions = {
2207 _video_dimensions = {
2217 def suitable(cls, url):
2218 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode pattern.
2219 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2221 def _print_formats(self, formats):
2222 print('Available formats:')
# Prints bitrate, extension and dimensions for each format id.
2224 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2227 def _real_extract(self, url):
2228 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2230 raise ExtractorError(u'Invalid URL: %s' % url)
# Expand ':tds'-style shortcuts to the shows' full-episodes index URL.
2232 if mobj.group('shortname'):
2233 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2234 url = u'http://www.thedailyshow.com/full-episodes/'
2236 url = u'http://www.colbertnation.com/full-episodes/'
2237 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2238 assert mobj is not None
2240 if mobj.group('clip'):
2241 if mobj.group('showname') == 'thedailyshow':
2242 epTitle = mobj.group('tdstitle')
2244 epTitle = mobj.group('cntitle')
2247 dlNewest = not mobj.group('episode')
2249 epTitle = mobj.group('showname')
2251 epTitle = mobj.group('episode')
2253 self.report_extraction(epTitle)
2254 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Index pages redirect to the newest episode; re-match the final URL.
2256 url = htmlHandle.geturl()
2257 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2259 raise ExtractorError(u'Invalid redirected URL: ' + url)
2260 if mobj.group('episode') == '':
2261 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2262 epTitle = mobj.group('episode')
# Find mtvnservices media URIs embedded in the page.
2264 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2266 if len(mMovieParams) == 0:
2267 # The Colbert Report embeds the information in a without
2268 # a URL prefix; so extract the alternate reference
2269 # and then add the URL prefix manually.
2271 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2272 if len(altMovieParams) == 0:
2273 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2275 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2277 uri = mMovieParams[0][1]
2278 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2279 indexXml = self._download_webpage(indexUrl, epTitle,
2280 u'Downloading show index',
2281 u'unable to download episode index')
# Episodes are split into parts; one <item> per part in the MRSS feed.
2285 idoc = xml.etree.ElementTree.fromstring(indexXml)
2286 itemEls = idoc.findall('.//item')
2287 for partNum,itemEl in enumerate(itemEls):
2288 mediaId = itemEl.findall('./guid')[0].text
2289 shortMediaId = mediaId.split(':')[-1]
2290 showId = mediaId.split(':')[-2].replace('.com', '')
2291 officialTitle = itemEl.findall('./title')[0].text
2292 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2294 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2295 compat_urllib_parse.urlencode({'uri': mediaId}))
2296 configXml = self._download_webpage(configUrl, epTitle,
2297 u'Downloading configuration for %s' % shortMediaId)
2299 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) pairs for every rendition.
2301 for rendition in cdoc.findall('.//rendition'):
2302 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2306 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2309 if self._downloader.params.get('listformats', None):
2310 self._print_formats([i[0] for i in turls])
2313 # For now, just pick the highest bitrate
2314 format,rtmp_video_url = turls[-1]
2316 # Get the format arg from the arg stream
2317 req_format = self._downloader.params.get('format', None)
2319 # Select format if we can find one
2322 format, rtmp_video_url = f, v
# Rewrite the RTMP URL into the equivalent HTTP origin URL.
2325 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2327 raise ExtractorError(u'Cannot transform RTMP url')
2328 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2329 video_url = base + m.group('finalid')
2331 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2336 'upload_date': officialDate,
2341 'description': officialTitle,
2343 results.append(info)
2348 class EscapistIE(InfoExtractor):
2349 """Information extractor for The Escapist """
# NOTE(review): original line numbers jump, so some statements (guards,
# the info-dict opener, the return) are elided from this listing.
2351 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2352 IE_NAME = u'escapist'
2354 def _real_extract(self, url):
2355 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2357 raise ExtractorError(u'Invalid URL: %s' % url)
2358 showName = mobj.group('showname')
2359 videoId = mobj.group('episode')
2361 self.report_extraction(showName)
2362 webPage = self._download_webpage(url, showName)
# Scrape description, thumbnail and player URL from meta tags.
# NOTE(review): the .group(1) calls assume every regex matched; a page
# without these meta tags would raise AttributeError here.
2364 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2365 description = unescapeHTML(descMatch.group(1))
2366 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2367 imgUrl = unescapeHTML(imgMatch.group(1))
2368 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2369 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a 'config=' query pointing at a JSON-ish config.
2370 configUrlMatch = re.search('config=(.*)$', playerUrl)
2371 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2373 configJSON = self._download_webpage(configUrl, showName,
2374 u'Downloading configuration',
2375 u'unable to download configuration')
2377 # Technically, it's JavaScript, not JSON
# Crude single->double quote fixup so json.loads accepts the JS object.
2378 configJSON = configJSON.replace("'", '"')
2381 config = json.loads(configJSON)
2382 except (ValueError,) as err:
2383 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2385 playlist = config['playlist']
# The media URL lives in the second playlist entry.
2386 videoUrl = playlist[1]['url']
2391 'uploader': showName,
2392 'upload_date': None,
2395 'thumbnail': imgUrl,
2396 'description': description,
2397 'player_url': playerUrl,
2402 class CollegeHumorIE(InfoExtractor):
2403 """Information extractor for collegehumor.com"""
# NOTE(review): original line numbers jump; try-openers, guards and the
# final return are elided from this listing.
2406 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2407 IE_NAME = u'collegehumor'
2409 def report_manifest(self, video_id):
2410 """Report information extraction."""
# Status line printed before fetching the Adobe f4m manifest.
2411 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2413 def _real_extract(self, url):
2414 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2416 raise ExtractorError(u'Invalid URL: %s' % url)
2417 video_id = mobj.group('videoid')
2422 'upload_date': None,
2425 self.report_extraction(video_id)
# Step 1: fetch the moogaloop metadata XML for this video id.
2426 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2428 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2429 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2430 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2432 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2434 videoNode = mdoc.findall('./video')[0]
2435 info['description'] = videoNode.findall('./description')[0].text
2436 info['title'] = videoNode.findall('./caption')[0].text
2437 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2438 manifest_url = videoNode.findall('./file')[0].text
2440 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: fetch the f4m manifest (hdcore param required by the CDN).
2442 manifest_url += '?hdcore=2.10.3'
2443 self.report_manifest(video_id)
2445 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2446 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2447 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2449 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Parse the f4m namespace to get the media node id and manifest id.
2451 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2452 node_id = media_node.attrib['url']
2453 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2454 except IndexError as err:
2455 raise ExtractorError(u'Invalid manifest file')
# Build the first-fragment URL from the manifest's own host and ids.
2457 url_pr = compat_urllib_parse_urlparse(manifest_url)
2458 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2465 class XVideosIE(InfoExtractor):
2466 """Information extractor for xvideos.com"""
# NOTE(review): original line numbers jump; `if mobj is None:` guards and
# the info-dict opener/return are elided from this listing.
2468 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2469 IE_NAME = u'xvideos'
2471 def _real_extract(self, url):
2472 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2474 raise ExtractorError(u'Invalid URL: %s' % url)
2475 video_id = mobj.group(1)
2477 webpage = self._download_webpage(url, video_id)
2479 self.report_extraction(video_id)
# The direct FLV URL is URL-encoded in the page's flashvars.
2483 mobj = re.search(r'flv_url=(.+?)&', webpage)
2485 raise ExtractorError(u'Unable to extract video url')
2486 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is taken from the <title> tag, minus the ' - XVID...' suffix.
2490 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2492 raise ExtractorError(u'Unable to extract video title')
2493 video_title = mobj.group(1)
2496 # Extract video thumbnail
# group(0): the whole matched thumbnail URL is used, not a capture group.
2497 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2499 raise ExtractorError(u'Unable to extract video thumbnail')
2500 video_thumbnail = mobj.group(0)
2506 'upload_date': None,
2507 'title': video_title,
2509 'thumbnail': video_thumbnail,
2510 'description': None,
2516 class SoundcloudIE(InfoExtractor):
2517 """Information extractor for soundcloud.com
2518 To access the media, the uid of the song and a stream token
2519 must be extracted from the page source and the script must make
2520 a request to media.soundcloud.com/crossdomain.xml. Then
2521 the media can be grabbed by requesting from an url composed
2522 of the stream token and uid
# NOTE(review): original line numbers jump; the docstring terminator, the
# `if mobj is None:` guard and the info-dict opener/return are elided.
2525 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2526 IE_NAME = u'soundcloud'
2528 def report_resolve(self, video_id):
2529 """Report information extraction."""
# Status line for the resolve.json id-lookup step.
2530 self.to_screen(u'%s: Resolving id' % video_id)
2532 def _real_extract(self, url):
2533 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2535 raise ExtractorError(u'Invalid URL: %s' % url)
2537 # extract uploader (which is in the url)
2538 uploader = mobj.group(1)
2539 # extract simple title (uploader + slug of song title)
2540 slug_title = mobj.group(2)
2541 simple_title = uploader + u'-' + slug_title
2542 full_title = '%s/%s' % (uploader, slug_title)
2544 self.report_resolve(full_title)
# Step 1: resolve the permalink to a track id via the public API.
# NOTE(review): client_id is a hard-coded API key baked into the URL.
2546 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2547 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2548 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2550 info = json.loads(info_json)
2551 video_id = info['id']
2552 self.report_extraction(full_title)
# Step 2: fetch stream URLs for the resolved track id.
2554 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2555 stream_json = self._download_webpage(streams_url, full_title,
2556 u'Downloading stream definitions',
2557 u'unable to download stream definitions')
2559 streams = json.loads(stream_json)
# Always picks the 128kbps MP3 HTTP stream.
2560 mediaURL = streams['http_mp3_128_url']
2561 upload_date = unified_strdate(info['created_at'])
2566 'uploader': info['user']['username'],
2567 'upload_date': upload_date,
2568 'title': info['title'],
2570 'description': info['description'],
2573 class SoundcloudSetIE(InfoExtractor):
2574 """Information extractor for soundcloud.com sets
2575 To access the media, the uid of the song and a stream token
2576 must be extracted from the page source and the script must make
2577 a request to media.soundcloud.com/crossdomain.xml. Then
2578 the media can be grabbed by requesting from an url composed
2579 of the stream token and uid
# NOTE(review): same structure as SoundcloudIE but iterates over every track
# of a set. Original line numbers jump; guards, the per-track info-dict
# opener and the return of the collected results are elided.
2582 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2583 IE_NAME = u'soundcloud:set'
2585 def report_resolve(self, video_id):
2586 """Report information extraction."""
2587 self.to_screen(u'%s: Resolving id' % video_id)
2589 def _real_extract(self, url):
2590 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2592 raise ExtractorError(u'Invalid URL: %s' % url)
2594 # extract uploader (which is in the url)
2595 uploader = mobj.group(1)
2596 # extract simple title (uploader + slug of song title)
2597 slug_title = mobj.group(2)
2598 simple_title = uploader + u'-' + slug_title
2599 full_title = '%s/sets/%s' % (uploader, slug_title)
2601 self.report_resolve(full_title)
# Resolve the set permalink to its track listing via the public API.
2603 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2604 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2605 info_json = self._download_webpage(resolv_url, full_title)
2608 info = json.loads(info_json)
2609 if 'errors' in info:
# Report every API error; (elided: presumably returns/aborts afterwards).
2610 for err in info['errors']:
2611 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2614 self.report_extraction(full_title)
2615 for track in info['tracks']:
2616 video_id = track['id']
# Per-track stream lookup, same endpoint/key as SoundcloudIE.
2618 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2619 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2621 self.report_extraction(video_id)
2622 streams = json.loads(stream_json)
2623 mediaURL = streams['http_mp3_128_url']
2628 'uploader': track['user']['username'],
2629 'upload_date': unified_strdate(track['created_at']),
2630 'title': track['title'],
2632 'description': track['description'],
2637 class InfoQIE(InfoExtractor):
2638 """Information extractor for infoq.com"""
# NOTE(review): original line numbers jump; guards and the info-dict
# opener/return are elided from this listing.
2639 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2641 def _real_extract(self, url):
2642 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2644 raise ExtractorError(u'Invalid URL: %s' % url)
2646 webpage = self._download_webpage(url, video_id=url)
2647 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref variable.
2650 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2652 raise ExtractorError(u'Unable to extract video url')
2653 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2654 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2657 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2659 raise ExtractorError(u'Unable to extract video title')
2660 video_title = mobj.group(1)
2662 # Extract description
# Description is optional; fall back to a placeholder string.
2663 video_description = u'No description available.'
2664 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2665 if mobj is not None:
2666 video_description = mobj.group(1)
# Derive id and extension from the media filename at the end of the URL.
2668 video_filename = video_url.split('/')[-1]
2669 video_id, extension = video_filename.split('.')
2675 'upload_date': None,
2676 'title': video_title,
2677 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2679 'description': video_description,
2684 class MixcloudIE(InfoExtractor):
2685 """Information extractor for www.mixcloud.com"""
# NOTE(review): marked broken (_WORKING = False). Original line numbers
# jump, so guards, try-openers and several returns are elided. The .decode()
# calls on str results suggest this path was written for Python 2 — confirm.
2687 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2688 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2689 IE_NAME = u'mixcloud'
2691 def report_download_json(self, file_id):
2692 """Report JSON download."""
2693 self.to_screen(u'Downloading json')
2695 def get_urls(self, jsonData, fmt, bitrate='best'):
2696 """Get urls from 'audio_formats' section in json"""
# Formats may map bitrate -> url list, or directly to a url list; the
# TypeError branch handles the no-bitrate shape.
2699 bitrate_list = jsonData[fmt]
2700 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2701 bitrate = max(bitrate_list) # select highest
2703 url_list = jsonData[fmt][bitrate]
2704 except TypeError: # we have no bitrate info.
2705 url_list = jsonData[fmt]
2708 def check_urls(self, url_list):
2709 """Returns 1st active url from list"""
# Probes each candidate with a HEAD-less GET; network errors mean "try
# the next one". (The success-return line is elided.)
2710 for url in url_list:
2712 compat_urllib_request.urlopen(url)
2714 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2719 def _print_formats(self, formats):
2720 print('Available formats:')
2721 for fmt in formats.keys():
2722 for b in formats[fmt]:
2724 ext = formats[fmt][b][0]
2725 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2726 except TypeError: # we have no bitrate info
2727 ext = formats[fmt][0]
2728 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2731 def _real_extract(self, url):
2732 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2734 raise ExtractorError(u'Invalid URL: %s' % url)
2735 # extract uploader & filename from url
2736 uploader = mobj.group(1).decode('utf-8')
2737 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2739 # construct API request
2740 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2741 # retrieve .json file with links to files
2742 request = compat_urllib_request.Request(file_url)
2744 self.report_download_json(file_url)
2745 jsonData = compat_urllib_request.urlopen(request).read()
2746 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2747 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2750 json_data = json.loads(jsonData)
2751 player_url = json_data['player_swf_url']
2752 formats = dict(json_data['audio_formats'])
2754 req_format = self._downloader.params.get('format', None)
2757 if self._downloader.params.get('listformats', None):
2758 self._print_formats(formats)
2761 if req_format is None or req_format == 'best':
# Best-effort: take the first format whose URL list yields a live URL.
2762 for format_param in formats.keys():
2763 url_list = self.get_urls(formats, format_param)
2765 file_url = self.check_urls(url_list)
2766 if file_url is not None:
2769 if req_format not in formats:
2770 raise ExtractorError(u'Format is not available')
2772 url_list = self.get_urls(formats, req_format)
2773 file_url = self.check_urls(url_list)
2774 format_param = req_format
2777 'id': file_id.decode('utf-8'),
2778 'url': file_url.decode('utf-8'),
2779 'uploader': uploader.decode('utf-8'),
2780 'upload_date': None,
2781 'title': json_data['name'],
2782 'ext': file_url.split('.')[-1].decode('utf-8'),
2783 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2784 'thumbnail': json_data['thumbnail_url'],
2785 'description': json_data['description'],
2786 'player_url': player_url.decode('utf-8'),
2789 class StanfordOpenClassroomIE(InfoExtractor):
2790 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): three-way dispatch — a single video page, a course page, or
# the site root — based on which URL query groups matched. Original line
# numbers jump; dict openers, guards and returns are elided.
2792 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2793 IE_NAME = u'stanfordoc'
2795 def _real_extract(self, url):
2796 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2798 raise ExtractorError(u'Invalid URL: %s' % url)
2800 if mobj.group('course') and mobj.group('video'): # A specific video
2801 course = mobj.group('course')
2802 video = mobj.group('video')
2804 'id': course + '_' + video,
2806 'upload_date': None,
2809 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the course's videos.
2810 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2811 xmlUrl = baseUrl + video + '.xml'
2813 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2814 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2815 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2816 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2818 info['title'] = mdoc.findall('./title')[0].text
2819 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2821 raise ExtractorError(u'Invalid metadata XML file')
2822 info['ext'] = info['url'].rpartition('.')[2]
2824 elif mobj.group('course'): # A course page
2825 course = mobj.group('course')
2830 'upload_date': None,
2833 coursepage = self._download_webpage(url, info['id'],
2834 note='Downloading course info page',
2835 errnote='Unable to download course info page')
2837 m = re.search('<h1>([^<]+)</h1>', coursepage)
2839 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page has no <h1> title.
2841 info['title'] = info['id']
2843 m = re.search('<description>([^<]+)</description>', coursepage)
2845 info['description'] = unescapeHTML(m.group(1))
# Collect each VideoPage link as a reference entry, then recursively
# extract every one of them.
2847 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2850 'type': 'reference',
2851 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2855 for entry in info['list']:
2856 assert entry['type'] == 'reference'
2857 results += self.extract(entry['url'])
# Root branch: enumerate every CoursePage link on the home page and
# recursively extract each course.
2861 'id': 'Stanford OpenClassroom',
2864 'upload_date': None,
2867 self.report_download_webpage(info['id'])
2868 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2870 rootpage = compat_urllib_request.urlopen(rootURL).read()
2871 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2872 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2874 info['title'] = info['id']
2876 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2879 'type': 'reference',
2880 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2885 for entry in info['list']:
2886 assert entry['type'] == 'reference'
2887 results += self.extract(entry['url'])
2890 class MTVIE(InfoExtractor):
2891 """Information extractor for MTV.com"""
# NOTE(review): original line numbers jump; guards, the try-opener and the
# info-dict opener/return are elided from this listing. The
# .decode('iso-8859-1') calls on regex groups look Python-2-era — confirm.
2893 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2896 def _real_extract(self, url):
2897 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2899 raise ExtractorError(u'Invalid URL: %s' % url)
# Normalize scheme-less URLs before downloading.
2900 if not mobj.group('proto'):
2901 url = 'http://' + url
2902 video_id = mobj.group('videoid')
2904 webpage = self._download_webpage(url, video_id)
# Song name, performer, mtvn uri and playlist id all come from meta
# tags / inline JS on the video page.
2906 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2908 raise ExtractorError(u'Unable to extract song name')
2909 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2910 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2912 raise ExtractorError(u'Unable to extract performer')
2913 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2914 video_title = performer + ' - ' + song_name
2916 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2918 raise ExtractorError(u'Unable to mtvn_uri')
2919 mtvn_uri = mobj.group(1)
2921 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2923 raise ExtractorError(u'Unable to extract content id')
2924 content_id = mobj.group(1)
# mediaGen returns an XML document listing available renditions.
2926 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2927 self.report_extraction(video_id)
2928 request = compat_urllib_request.Request(videogen_url)
2930 metadataXml = compat_urllib_request.urlopen(request).read()
2931 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2932 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2934 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2935 renditions = mdoc.findall('.//rendition')
2937 # For now, always pick the highest quality.
2938 rendition = renditions[-1]
# Format string: '<ext>-<width>x<height>_<bitrate>'.
2941 _,_,ext = rendition.attrib['type'].partition('/')
2942 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2943 video_url = rendition.find('./src').text
2945 raise ExtractorError('Invalid rendition field.')
2950 'uploader': performer,
2951 'upload_date': None,
2952 'title': video_title,
2960 class YoukuIE(InfoExtractor):
# NOTE(review): extractor for v.youku.com. Original line numbers jump, so
# method headers (_gen_sid's def), guards, format-selection branches and
# the final return are elided from this listing.
2961 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# (elided: presumably `def _gen_sid(self):` — builds a pseudo-unique
# session id from the current time plus two random numbers.)
2964 nowTime = int(time.time() * 1000)
2965 random1 = random.randint(1000,1998)
2966 random2 = random.randint(1000,9999)
2968 return "%d%d%d" %(nowTime,random1,random2)
2970 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the character source driven by a linear
# congruential sequence on `seed`; yields Youku's scrambling alphabet.
2972 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2974 for i in range(len(source)):
2975 seed = (seed * 211 + 30031 ) % 65536
2976 index = math.floor(seed / 65536 * len(source) )
2977 mixed.append(source[int(index)])
2978 source.remove(source[int(index)])
2979 #return ''.join(mixed)
2982 def _get_file_id(self, fileId, seed):
# De-scramble: each '*'-separated token in fileId indexes into the
# seed-derived mixed alphabet.
2983 mixed = self._get_file_ID_mix_string(seed)
2984 ids = fileId.split('*')
2988 realId.append(mixed[int(ch)])
2989 return ''.join(realId)
2991 def _real_extract(self, url):
2992 mobj = re.match(self._VALID_URL, url)
# (elided guard: presumably `if mobj is None:` precedes this raise — confirm)
2994 raise ExtractorError(u'Invalid URL: %s' % url)
2995 video_id = mobj.group('ID')
2997 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2999 jsondata = self._download_webpage(info_url, video_id)
3001 self.report_extraction(video_id)
3003 config = json.loads(jsondata)
3005 video_title = config['data'][0]['title']
3006 seed = config['data'][0]['seed']
# Pick a format: explicit --format wins, else best/worst heuristics over
# the advertised streamfileids keys (branch bodies partially elided).
3008 format = self._downloader.params.get('format', None)
3009 supported_format = list(config['data'][0]['streamfileids'].keys())
3011 if format is None or format == 'best':
3012 if 'hd2' in supported_format:
3017 elif format == 'worst':
3025 fileid = config['data'][0]['streamfileids'][format]
3026 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3027 except (UnicodeDecodeError, ValueError, KeyError):
3028 raise ExtractorError(u'Unable to extract info section')
3031 sid = self._gen_sid()
3032 fileid = self._get_file_id(fileid, seed)
3034 #column 8,9 of fileid represent the segment number
3035 #fileid[7:9] should be changed
# One download URL per segment; the segment index is spliced into the
# fileid as two hex digits and signed with the per-segment key.
3036 for index, key in enumerate(keys):
3038 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3039 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3042 'id': '%s_part%02d' % (video_id, index),
3043 'url': download_url,
3045 'upload_date': None,
3046 'title': video_title,
3049 files_info.append(info)
# NOTE(review): fragmentary numbered listing -- guard lines (`if result is
# None:` etc.) and indentation are elided; comments only.
3054 class XNXXIE(InfoExtractor):
3055 """Information extractor for xnxx.com"""
3057 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping regexes: flv URL, page title, and big-thumbnail URL.
3059 VIDEO_URL_RE = r'flv_url=(.*?)&'
3060 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3061 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3063 def _real_extract(self, url):
3064 mobj = re.match(self._VALID_URL, url)
# (URL-mismatch guard elided in this excerpt)
3066 raise ExtractorError(u'Invalid URL: %s' % url)
3067 video_id = mobj.group(1)
3069 # Get webpage content
3070 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded in the page, hence the unquote.
3072 result = re.search(self.VIDEO_URL_RE, webpage)
3074 raise ExtractorError(u'Unable to extract video url')
3075 video_url = compat_urllib_parse.unquote(result.group(1))
3077 result = re.search(self.VIDEO_TITLE_RE, webpage)
3079 raise ExtractorError(u'Unable to extract video title')
3080 video_title = result.group(1)
3082 result = re.search(self.VIDEO_THUMB_RE, webpage)
3084 raise ExtractorError(u'Unable to extract video thumbnail')
3085 video_thumbnail = result.group(1)
# Result info dict (opening lines elided in this excerpt).
3091 'upload_date': None,
3092 'title': video_title,
3094 'thumbnail': video_thumbnail,
3095 'description': None,
# NOTE(review): fragmentary numbered listing -- several guard/assignment
# lines are elided per the numbering gaps; comments only.
3099 class GooglePlusIE(InfoExtractor):
3100 """Information extractor for plus.google.com."""
3102 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3103 IE_NAME = u'plus.google'
# The report_* helpers below just echo progress to the screen.
3105 def report_extract_entry(self, url):
3106 """Report downloading extry"""
3107 self.to_screen(u'Downloading entry: %s' % url)
3109 def report_date(self, upload_date):
3110 """Report downloading extry"""
3111 self.to_screen(u'Entry date: %s' % upload_date)
3113 def report_uploader(self, uploader):
3114 """Report downloading extry"""
3115 self.to_screen(u'Uploader: %s' % uploader)
3117 def report_title(self, video_title):
3118 """Report downloading extry"""
3119 self.to_screen(u'Title: %s' % video_title)
3121 def report_extract_vid_page(self, video_page):
3122 """Report information extraction."""
3123 self.to_screen(u'Extracting video page: %s' % video_page)
3125 def _real_extract(self, url):
3126 # Extract id from URL
3127 mobj = re.match(self._VALID_URL, url)
# (URL-mismatch guard elided in this excerpt)
3129 raise ExtractorError(u'Invalid URL: %s' % url)
3131 post_url = mobj.group(0)
3132 video_id = mobj.group(1)
3134 video_extension = 'flv'
3136 # Step 1, Retrieve post webpage to extract further information
3137 self.report_extract_entry(post_url)
3138 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3140 # Extract update date
3142 pattern = 'title="Timestamp">(.*?)</a>'
3143 mobj = re.search(pattern, webpage)
3145 upload_date = mobj.group(1)
3146 # Convert timestring to a format suitable for filename
3147 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3148 upload_date = upload_date.strftime('%Y%m%d')
3149 self.report_date(upload_date)
# Uploader is scraped from the rel="author" anchor.
3153 pattern = r'rel\="author".*?>(.*?)</a>'
3154 mobj = re.search(pattern, webpage)
3156 uploader = mobj.group(1)
3157 self.report_uploader(uploader)
3160 # Get the first line for title
3162 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3163 mobj = re.search(pattern, webpage)
3165 video_title = mobj.group(1)
3166 self.report_title(video_title)
3168 # Step 2, Stimulate clicking the image box to launch video
3169 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3170 mobj = re.search(pattern, webpage)
3172 raise ExtractorError(u'Unable to extract video page URL')
3174 video_page = mobj.group(1)
3175 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3176 self.report_extract_vid_page(video_page)
3179 # Extract video links on video page
3180 """Extract video links of all sizes"""
3181 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3182 mobj = re.findall(pattern, webpage)
3184 raise ExtractorError(u'Unable to extract video links')
3186 # Sort in resolution
3187 links = sorted(mobj)
3189 # Choose the lowest of the sort, i.e. highest resolution
3190 video_url = links[-1]
3191 # Only get the url. The resolution part in the tuple has no use anymore
3192 video_url = video_url[-1]
3193 # Treat escaped \u0026 style hex
# Python 2 str has .decode; Python 3 str does not, hence the fallback.
3195 video_url = video_url.decode("unicode_escape")
3196 except AttributeError: # Python 3
3197 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Result info dict (opening lines elided in this excerpt).
3203 'uploader': uploader,
3204 'upload_date': upload_date,
3205 'title': video_title,
3206 'ext': video_extension,
# NOTE(review): fragmentary numbered listing -- guard lines elided per the
# numbering gaps; comments only.
3209 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages.
3210 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3213 def _real_extract(self, url):
3214 mobj = re.match(self._VALID_URL, url)
3216 raise ExtractorError(u'Invalid URL: %s' % url)
3218 video_id = mobj.group(1)
# Strip a trailing /index.html so the CDN path is correct.
3219 if video_id.endswith('/index.html'):
3220 video_id = video_id[:-len('/index.html')]
3222 webpage = self._download_webpage(url, video_id)
# The mp4 URL is built directly from the path, not scraped.
3224 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: search the page, return the unescaped first group or `default`.
3225 def _findProp(rexp, default=None):
3226 m = re.search(rexp, webpage)
3228 return unescapeHTML(m.group(1))
3232 shortened_video_id = video_id.rpartition('/')[2]
3233 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
# Result info dict (opening line elided in this excerpt). Note the key is
# spelled 'uploader_date' here -- presumably meant 'upload_date'; verify
# against the rest of the file before relying on it.
3235 'id': shortened_video_id,
3239 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3240 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): fragmentary numbered listing -- several control-flow lines
# are elided per the numbering gaps; comments only.
3244 class JustinTVIE(InfoExtractor):
3245 """Information extractor for justin.tv and twitch.tv"""
3246 # TODO: One broadcast may be split into multiple videos. The key
3247 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3248 # starts at 1 and increases. Can we treat all parts as one video?
# The verbose regex matches channel pages, /b/ videos, and /c/ chapters.
3250 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3252 (?P<channelid>[^/]+)|
3253 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3254 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# API pagination size for channel archives.
3258 _JUSTIN_PAGE_LIMIT = 100
3259 IE_NAME = u'justin.tv'
3261 def report_download_page(self, channel, offset):
3262 """Report attempt to download a single page of videos."""
3263 self.to_screen(u'%s: Downloading video information from %d to %d' %
3264 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3266 # Return count of items, list of *valid* items
3267 def _parse_page(self, url, video_id):
3268 webpage = self._download_webpage(url, video_id,
3269 u'Downloading video info JSON',
3270 u'unable to download video info JSON')
3272 response = json.loads(webpage)
# A non-list response is an API error payload.
3273 if type(response) != list:
3274 error_text = response.get('error', 'unknown error')
3275 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3277 for clip in response:
3278 video_url = clip['video_file_url']
3280 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from the date part -> YYYYMMDD.
3281 video_date = re.sub('-', '', clip['start_time'][:10])
3282 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3283 video_id = clip['id']
3284 video_title = clip.get('title', video_id)
# Per-clip info dict (opening lines elided in this excerpt).
3288 'title': video_title,
3289 'uploader': clip.get('channel_name', video_uploader_id),
3290 'uploader_id': video_uploader_id,
3291 'upload_date': video_date,
3292 'ext': video_extension,
3294 return (len(response), info)
3296 def _real_extract(self, url):
3297 mobj = re.match(self._VALID_URL, url)
3299 raise ExtractorError(u'invalid URL: %s' % url)
3301 api_base = 'http://api.justin.tv'
# Three URL shapes: channel archive, chapter, or single broadcast.
3303 if mobj.group('channelid'):
3305 video_id = mobj.group('channelid')
3306 api = api_base + '/channel/archives/%s.json' % video_id
3307 elif mobj.group('chapterid'):
3308 chapter_id = mobj.group('chapterid')
3310 webpage = self._download_webpage(url, chapter_id)
3311 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3313 raise ExtractorError(u'Cannot find archive of a chapter')
3314 archive_id = m.group(1)
3316 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3317 chapter_info_xml = self._download_webpage(api, chapter_id,
3318 note=u'Downloading chapter information',
3319 errnote=u'Chapter information download failed')
3320 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the archive entry matching the chapter's archive id.
3321 for a in doc.findall('.//archive'):
3322 if archive_id == a.find('./id').text:
3325 raise ExtractorError(u'Could not find chapter in chapter information')
3327 video_url = a.find('./video_file_url').text
3328 video_ext = video_url.rpartition('.')[2] or u'flv'
3330 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3331 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3332 note='Downloading chapter metadata',
3333 errnote='Download of chapter metadata failed')
3334 chapter_info = json.loads(chapter_info_json)
3336 bracket_start = int(doc.find('.//bracket_start').text)
3337 bracket_end = int(doc.find('.//bracket_end').text)
3339 # TODO determine start (and probably fix up file)
3340 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3341 #video_url += u'?start=' + TODO:start_timestamp
3342 # bracket_start is 13290, but we want 51670615
3343 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3344 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
# Chapter info dict (opening lines elided in this excerpt).
3347 'id': u'c' + chapter_id,
3350 'title': chapter_info['title'],
3351 'thumbnail': chapter_info['preview'],
3352 'description': chapter_info['description'],
3353 'uploader': chapter_info['channel']['display_name'],
3354 'uploader_id': chapter_info['channel']['name'],
# Fallback branch: single broadcast by video id.
3358 video_id = mobj.group('videoid')
3359 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3361 self.report_extraction(video_id)
# Paged fetch loop; stops when a page comes back short of the limit.
3365 limit = self._JUSTIN_PAGE_LIMIT
3368 self.report_download_page(video_id, offset)
3369 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3370 page_count, page_info = self._parse_page(page_url, video_id)
3371 info.extend(page_info)
3372 if not paged or page_count != limit:
# NOTE(review): fragmentary numbered listing -- guard lines elided; comments
# only.
3377 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com video pages.
3378 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3380 def _real_extract(self, url):
3381 mobj = re.match(self._VALID_URL, url)
3383 raise ExtractorError(u'invalid URL: %s' % url)
3385 video_id = mobj.group('id')
3386 webpage = self._download_webpage(url, video_id)
# The direct video URL lives in the second <source> of the <video> tag.
3388 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3390 raise ExtractorError(u'Unable to find video information')
3391 video_url = unescapeHTML(m.group('url'))
# Title: prefer the player h1, fall back to the page <title>.
3393 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3395 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3397 raise ExtractorError(u'Cannot find video title')
3398 title = clean_html(m.group('title'))
3400 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3402 desc = unescapeHTML(m.group('desc'))
# Result info dict (opening lines elided in this excerpt).
3411 'description': desc,
# NOTE(review): fragmentary numbered listing -- some regex-body and guard
# lines elided; comments only.
3415 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com video/app pages.
3416 _VALID_URL = r"""http://store\.steampowered\.com/
3418 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3420 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
3424 def suitable(cls, url):
3425 """Receives a URL and returns True if suitable for this IE."""
3426 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3428 def _real_extract(self, url):
3429 m = re.match(self._VALID_URL, url, re.VERBOSE)
3430 gameID = m.group('gameID')
# The agecheck URL with a fixed 1970 birthdate bypasses the age gate.
3431 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3432 self.report_age_confirmation()
3433 webpage = self._download_webpage(videourl, gameID)
3434 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
# Scrape movie entries, their display titles, and their thumbnails,
# then pair them up positionally with zip().
3436 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3437 mweb = re.finditer(urlRE, webpage)
3438 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3439 titles = re.finditer(namesRE, webpage)
3440 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3441 thumbs = re.finditer(thumbsRE, webpage)
3443 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3444 video_id = vid.group('videoID')
3445 title = vtitle.group('videoName')
3446 video_url = vid.group('videoURL')
3447 video_thumb = thumb.group('thumbnail')
3449 raise ExtractorError(u'Cannot find video url for %s' % video_id)
# Per-video info dict (opening lines elided in this excerpt).
3454 'title': unescapeHTML(title),
3455 'thumbnail': video_thumb
3458 return [self.playlist_result(videos, gameID, game_title)]
# NOTE(review): fragmentary numbered listing -- the `try:` opener for the
# scraping section is elided; comments only.
3460 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos.
3461 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3462 IE_NAME = u'ustream'
3464 def _real_extract(self, url):
3465 m = re.match(self._VALID_URL, url)
3466 video_id = m.group('videoID')
# Direct CDN URL, constructed from the id rather than scraped.
3467 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3468 webpage = self._download_webpage(url, video_id)
3469 self.report_extraction(video_id)
# Title, uploader, and thumbnail are scraped; a failed match raises
# AttributeError on .group(), converted to ExtractorError below.
3471 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3472 title = m.group('title')
3473 m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3475 uploader = unescapeHTML(m.group('uploader').strip())
3476 m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
3477 thumb = m.group('thumb')
3478 except AttributeError:
3479 raise ExtractorError(u'Unable to extract info')
# Result info dict (opening lines elided in this excerpt).
3485 'uploader': uploader,
# NOTE(review): fragmentary numbered listing -- ext-selection and result
# lines elided; comments only.
3490 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com.
3491 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3492 IE_NAME = u'WorldStarHipHop'
3494 def _real_extract(self, url):
# Regex for the flash player's file variable.
3495 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3497 m = re.match(self._VALID_URL, url)
3498 video_id = m.group('id')
3500 webpage_src = self._download_webpage(url, video_id)
3502 mobj = re.search(_src_url, webpage_src)
3504 if mobj is not None:
3505 video_url = mobj.group(1)
# Extension choice based on the URL (mp4 branch shown; else elided).
3506 if 'mp4' in video_url:
3511 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3513 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3516 raise ExtractorError(u'Cannot determine title')
3517 title = mobj.group(1)
3519 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3520 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3521 if mobj is not None:
3522 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span instead.
3524 _title = r"""candytitles.*>(.*)</span>"""
3525 mobj = re.search(_title, webpage_src)
3526 if mobj is not None:
3527 title = mobj.group(1)
# Result info dict (opening lines elided in this excerpt).
3534 'thumbnail' : thumbnail,
# NOTE(review): fragmentary numbered listing -- guard and `try:` lines
# elided; comments only.
3539 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows.
3540 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3542 def _real_extract(self, url):
3543 m = re.match(self._VALID_URL, url)
3544 video_id = m.group('videoID')
3546 webpage = self._download_webpage(url, video_id)
# Show metadata is embedded as JSON in an inline <script>.
3547 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3549 raise ExtractorError(u'Cannot find metadata')
3550 json_data = m.group(1)
3553 data = json.loads(json_data)
3554 except ValueError as e:
3555 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Append a bitrate parameter to the akamai stream URL; derive ext from path.
3557 video_url = data['akamai_url'] + '&cbr=256'
3558 url_parts = compat_urllib_parse_urlparse(video_url)
3559 video_ext = url_parts.path.rpartition('.')[2]
# Result info dict (opening lines elided); optional fields use .get so
# missing JSON keys become None rather than raising.
3564 'title': data['title'],
3565 'description': data.get('teaser_text'),
3566 'location': data.get('country_of_origin'),
3567 'uploader': data.get('host', {}).get('name'),
3568 'uploader_id': data.get('host', {}).get('slug'),
3569 'thumbnail': data.get('image', {}).get('large_url_2x'),
3570 'duration': data.get('duration'),
# NOTE(review): fragmentary numbered listing -- loop headers, guards, and
# several result lines elided; comments only.
3575 class YouPornIE(InfoExtractor):
3576 """Information extractor for youporn.com."""
3577 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3579 def _print_formats(self, formats):
3580 """Print all available formats"""
3581 print(u'Available formats:')
3582 print(u'ext\t\tformat')
3583 print(u'---------------------------------')
3584 for format in formats:
3585 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the entry from `formats` whose 'format' equals req_format
# (surrounding loop/return lines elided in this excerpt).
3587 def _specific(self, req_format, formats):
3589 if(x["format"]==req_format):
3593 def _real_extract(self, url):
3594 mobj = re.match(self._VALID_URL, url)
3596 raise ExtractorError(u'Invalid URL: %s' % url)
3598 video_id = mobj.group('videoid')
# Pre-set the age-verification cookie to skip the age gate.
3600 req = compat_urllib_request.Request(url)
3601 req.add_header('Cookie', 'age_verified=1')
3602 webpage = self._download_webpage(req, video_id)
3604 # Get the video title
3605 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3607 raise ExtractorError(u'Unable to extract video title')
3608 video_title = result.group('title').strip()
3610 # Get the video date
3611 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3613 self._downloader.report_warning(u'unable to extract video date')
3616 upload_date = unified_strdate(result.group('date').strip())
3618 # Get the video uploader
3619 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3621 self._downloader.report_warning(u'unable to extract uploader')
3622 video_uploader = None
3624 video_uploader = result.group('uploader').strip()
3625 video_uploader = clean_html( video_uploader )
3627 # Get all of the formats available
3628 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3629 result = re.search(DOWNLOAD_LIST_RE, webpage)
3631 raise ExtractorError(u'Unable to extract download list')
3632 download_list_html = result.group('download_list').strip()
3634 # Get all of the links from the page
3635 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3636 links = re.findall(LINK_RE, download_list_html)
3637 if(len(links) == 0):
3638 raise ExtractorError(u'ERROR: no known formats available for video')
3640 self.to_screen(u'Links found: %d' % len(links))
# Per-link loop (header elided): parse resolution/bitrate out of the path.
3645 # A link looks like this:
3646 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3647 # A path looks like this:
3648 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3649 video_url = unescapeHTML( link )
3650 path = compat_urllib_parse_urlparse( video_url ).path
3651 extension = os.path.splitext( path )[1][1:]
3652 format = path.split('/')[4].split('_')[:2]
3655 format = "-".join( format )
3656 title = u'%s-%s-%s' % (video_title, size, bitrate)
# Per-format info dict (opening lines elided in this excerpt).
3661 'uploader': video_uploader,
3662 'upload_date': upload_date,
3667 'description': None,
# Format selection: list, best/worst, all, or a specific format.
3671 if self._downloader.params.get('listformats', None):
3672 self._print_formats(formats)
3675 req_format = self._downloader.params.get('format', None)
3676 self.to_screen(u'Format: %s' % req_format)
3678 if req_format is None or req_format == 'best':
3680 elif req_format == 'worst':
3681 return [formats[-1]]
3682 elif req_format in ('-1', 'all'):
3685 format = self._specific( req_format, formats )
3687 raise ExtractorError(u'Requested format not available')
# NOTE(review): fragmentary numbered listing -- guard lines elided; comments
# only.
3692 class PornotubeIE(InfoExtractor):
3693 """Information extractor for pornotube.com."""
3694 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3696 def _real_extract(self, url):
3697 mobj = re.match(self._VALID_URL, url)
3699 raise ExtractorError(u'Invalid URL: %s' % url)
# Title comes straight from the URL, not from the page.
3701 video_id = mobj.group('videoid')
3702 video_title = mobj.group('title')
3704 # Get webpage content
3705 webpage = self._download_webpage(url, video_id)
# Scrape the flv URL from the player config.
3708 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3709 result = re.search(VIDEO_URL_RE, webpage)
3711 raise ExtractorError(u'Unable to extract video url')
3712 video_url = compat_urllib_parse.unquote(result.group('url'))
3714 #Get the uploaded date
3715 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3716 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error message says 'title' but this step extracts the date.
3718 raise ExtractorError(u'Unable to extract video title')
3719 upload_date = unified_strdate(result.group('date'))
3721 info = {'id': video_id,
3724 'upload_date': upload_date,
3725 'title': video_title,
# NOTE(review): fragmentary numbered listing -- guard lines elided; comments
# only.
3731 class YouJizzIE(InfoExtractor):
3732 """Information extractor for youjizz.com."""
3733 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3735 def _real_extract(self, url):
3736 mobj = re.match(self._VALID_URL, url)
3738 raise ExtractorError(u'Invalid URL: %s' % url)
3740 video_id = mobj.group('videoid')
3742 # Get webpage content
3743 webpage = self._download_webpage(url, video_id)
3745 # Get the video title
3746 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3748 raise ExtractorError(u'ERROR: unable to extract video title')
3749 video_title = result.group('title').strip()
3751 # Get the embed page
3752 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3754 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is re-assigned from the embed URL's numeric id.
3756 embed_page_url = result.group(0).strip()
3757 video_id = result.group('videoid')
3759 webpage = self._download_webpage(embed_page_url, video_id)
# The actual media URL is set via the flash player on the embed page.
3762 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3764 raise ExtractorError(u'ERROR: unable to extract video url')
3765 video_url = result.group('source')
3767 info = {'id': video_id,
3769 'title': video_title,
3772 'player_url': embed_page_url}
# NOTE(review): fragmentary numbered listing -- mix_id assignment and loop
# plumbing elided; comments only.
3776 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes (playlists of songs).
3778 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3780 def _real_extract(self, url):
3781 mobj = re.match(self._VALID_URL, url)
3783 raise ExtractorError(u'Invalid URL: %s' % url)
3784 playlist_id = mobj.group('id')
3786 webpage = self._download_webpage(url, playlist_id)
# The mix metadata is embedded as a JS assignment to PAGE.mix.
3788 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3790 raise ExtractorError(u'Cannot find trax information')
3791 json_like = m.group(1)
3792 data = json.loads(json_like)
# Random per-run session id for the play API.
3794 session = str(random.randint(0, 1000000000))
3796 track_count = data['tracks_count']
3797 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3798 next_url = first_url
# Walk the mix track-by-track until the API reports the last track.
3800 for i in itertools.count():
3801 api_json = self._download_webpage(next_url, playlist_id,
3802 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3803 errnote=u'Failed to download song information')
3804 api_data = json.loads(api_json)
3805 track_data = api_data[u'set']['track']
# Per-track info dict (opening line elided in this excerpt).
3807 'id': track_data['id'],
3808 'url': track_data['track_file_stream_url'],
3809 'title': track_data['performer'] + u' - ' + track_data['name'],
3810 'raw_title': track_data['name'],
3811 'uploader_id': data['user']['login'],
3815 if api_data['set']['at_last_track']:
3817 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# NOTE(review): fragmentary numbered listing; comments only.
3820 class KeekIE(InfoExtractor):
# Information extractor for keek.com short videos.
3821 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3824 def _real_extract(self, url):
3825 m = re.match(self._VALID_URL, url)
3826 video_id = m.group('videoID')
# Media and thumbnail URLs are built directly from the id.
3827 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3828 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3829 webpage = self._download_webpage(url, video_id)
3830 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3831 title = unescapeHTML(m.group('title'))
3832 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3833 uploader = clean_html(m.group('uploader'))
# Result info dict (opening lines elided in this excerpt).
3839 'thumbnail': thumbnail,
3840 'uploader': uploader
# NOTE(review): fragmentary numbered listing -- regex bodies and several
# lines elided; comments only.
3844 class TEDIE(InfoExtractor):
# Information extractor for ted.com talks and playlists.
3845 _VALID_URL=r'''http://www\.ted\.com/
3847 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3849 ((?P<type_talk>talks)) # We have a simple talk
3851 (/lang/(.*?))? # The url may contain the language
3852 /(?P<name>\w+) # Here goes the name and then ".html"
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
3856 def suitable(cls, url):
3857 """Receives a URL and returns True if suitable for this IE."""
3858 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3860 def _real_extract(self, url):
# Dispatch: single talk vs. playlist of talks.
3861 m=re.match(self._VALID_URL, url, re.VERBOSE)
3862 if m.group('type_talk'):
3863 return [self._talk_info(url)]
3865 playlist_id=m.group('playlist_id')
3866 name=m.group('name')
3867 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3868 return [self._playlist_videos_info(url,name,playlist_id)]
3870 def _talk_video_link(self,mediaSlug):
3871 '''Returns the video link for that mediaSlug'''
3872 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3874 def _playlist_videos_info(self,url,name,playlist_id=0):
3875 '''Returns the videos of the playlist'''
# Verbose regex over the playlist page; matched per-talk blocks.
3877 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3878 ([.\s]*?)data-playlist_item_id="(\d+)"
3879 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3881 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3882 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3883 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3884 m_names=re.finditer(video_name_RE,webpage)
3886 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3887 m_playlist = re.search(playlist_RE, webpage)
3888 playlist_title = m_playlist.group('playlist_title')
# Each talk becomes a url_result delegated back to this same extractor.
3890 playlist_entries = []
3891 for m_video, m_name in zip(m_videos,m_names):
3892 video_id=m_video.group('video_id')
3893 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3894 playlist_entries.append(self.url_result(talk_url, 'TED'))
3895 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3897 def _talk_info(self, url, video_id=0):
3898 """Return the video for the talk in the url"""
3899 m=re.match(self._VALID_URL, url,re.VERBOSE)
3900 videoName=m.group('name')
3901 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3902 # If the url includes the language we get the title translated
3903 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3904 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and the media slug.
3905 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3906 "id":(?P<videoID>[\d]+).*?
3907 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3908 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3909 thumb_match=re.search(thumb_RE,webpage)
3910 info_match=re.search(info_RE,webpage,re.VERBOSE)
3911 video_id=info_match.group('videoID')
3912 mediaSlug=info_match.group('mediaSlug')
3913 video_url=self._talk_video_link(mediaSlug)
# Result info dict (opening lines elided in this excerpt).
3919 'thumbnail': thumb_match.group('thumbnail')
# NOTE(review): fragmentary numbered listing -- some conditional/else lines
# elided; comments only.
3923 class MySpassIE(InfoExtractor):
# Information extractor for myspass.de.
3924 _VALID_URL = r'http://www.myspass.de/.*'
3926 def _real_extract(self, url):
3927 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3929 # video id is the last path element of the URL
3930 # usually there is a trailing slash, so also try the second but last
3931 url_path = compat_urllib_parse_urlparse(url).path
3932 url_parent_path, video_id = os.path.split(url_path)
# (condition elided) fall back to the parent path component.
3934 _, video_id = os.path.split(url_parent_path)
# Fetch and parse the site's XML metadata document for this id.
3937 metadata_url = META_DATA_URL_TEMPLATE % video_id
3938 metadata_text = self._download_webpage(metadata_url, video_id)
3939 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3941 # extract values from metadata
3942 url_flv_el = metadata.find('url_flv')
3943 if url_flv_el is None:
3944 raise ExtractorError(u'Unable to extract download url')
3945 video_url = url_flv_el.text
3946 extension = os.path.splitext(video_url)[1][1:]
3947 title_el = metadata.find('title')
3948 if title_el is None:
3949 raise ExtractorError(u'Unable to extract title')
3950 title = title_el.text
# format/description/thumbnail are optional -- each has an elided
# fallback branch in the original.
3951 format_id_el = metadata.find('format_id')
3952 if format_id_el is None:
3955 format = format_id_el.text
3956 description_el = metadata.find('description')
3957 if description_el is not None:
3958 description = description_el.text
3961 imagePreview_el = metadata.find('imagePreview')
3962 if imagePreview_el is not None:
3963 thumbnail = imagePreview_el.text
# Result info dict (opening lines elided in this excerpt).
3972 'thumbnail': thumbnail,
3973 'description': description
# NOTE(review): fragmentary numbered listing -- guard lines elided; comments
# only.
3977 class SpiegelIE(InfoExtractor):
# Information extractor for spiegel.de videos.
3978 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3980 def _real_extract(self, url):
3981 m = re.match(self._VALID_URL, url)
3982 video_id = m.group('videoID')
3984 webpage = self._download_webpage(url, video_id)
3985 m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
3987 raise ExtractorError(u'Cannot find title')
3988 video_title = unescapeHTML(m.group(1))
# Per-video XML document lists the available renditions.
3990 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3991 xml_code = self._download_webpage(xml_url, video_id,
3992 note=u'Downloading XML', errnote=u'Failed to download XML')
3994 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element is taken as the preferred rendition.
3995 last_type = idoc[-1]
3996 filename = last_type.findall('./filename')[0].text
3997 duration = float(last_type.findall('./duration')[0].text)
3999 video_url = 'http://video2.spiegel.de/flash/' + filename
4000 video_ext = filename.rpartition('.')[2]
# Result info dict (opening lines elided in this excerpt).
4005 'title': video_title,
4006 'duration': duration,
# NOTE(review): fragmentary numbered listing -- guard and fallback lines
# elided; comments only.
4010 class LiveLeakIE(InfoExtractor):
# Information extractor for liveleak.com view pages.
4012 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4013 IE_NAME = u'liveleak'
4015 def _real_extract(self, url):
4016 mobj = re.match(self._VALID_URL, url)
4018 raise ExtractorError(u'Invalid URL: %s' % url)
4020 video_id = mobj.group('video_id')
4022 webpage = self._download_webpage(url, video_id)
# Media URL comes from the player's `file:` config entry.
4024 m = re.search(r'file: "(.*?)",', webpage)
4026 raise ExtractorError(u'Unable to find video url')
4027 video_url = m.group(1)
# Title is og:title with the site prefix removed.
4029 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4031 raise ExtractorError(u'Cannot find video title')
4032 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4034 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4036 desc = unescapeHTML(m.group('desc'))
4040 m = re.search(r'By:.*?(\w+)</a>', webpage)
4042 uploader = clean_html(m.group(1))
# Result info dict (opening lines elided in this excerpt).
4051 'description': desc,
4052 'uploader': uploader
# NOTE(review): fragmentary numbered listing -- some branch headers elided;
# comments only.
4057 class ARDIE(InfoExtractor):
# Information extractor for the ARD Mediathek / daserste.de Mediathek.
4058 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4059 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4060 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4062 def _real_extract(self, url):
4063 # determine video id from url
4064 m = re.match(self._VALID_URL, url)
# Prefer the numeric documentId query parameter when present.
4066 numid = re.search(r'documentId=([0-9]+)', url)
4068 video_id = numid.group(1)
4070 video_id = m.group('video_id')
4072 # determine title and media streams from webpage
4073 html = self._download_webpage(url, video_id)
4074 title = re.search(self._TITLE, html).group('title')
4075 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# (empty-streams guard elided) no streams + "fsk" marker => age-restricted.
4077 assert '"fsk"' in html
4078 raise ExtractorError(u'This video is only available after 8:00 pm')
4080 # choose default media type and highest quality for now
4081 stream = max([s for s in streams if int(s["media_type"]) == 0],
4082 key=lambda s: int(s["quality"]))
4084 # there's two possibilities: RTMP stream or HTTP download
4085 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4086 if stream['rtmp_url']:
4087 self.to_screen(u'RTMP download detected')
4088 assert stream['video_url'].startswith('mp4:')
4089 info["url"] = stream["rtmp_url"]
4090 info["play_path"] = stream['video_url']
# (else branch header elided) plain HTTP download case.
4092 assert stream["video_url"].endswith('.mp4')
4093 info["url"] = stream["video_url"]
# ZDFIE: extractor for the German ZDF Mediathek.
# Finds streaming links on the page, prefers the 'wstreaming' (Windows
# Media / mms) variant at quality '300' then 'veryhigh', downloads the
# stream-descriptor page and pulls the final mms:// or rtsp:// URL.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and the guard/loop-break lines are missing.
4096 class ZDFIE(InfoExtractor):
4097 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4098 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4099 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4100 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4101 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4103 def _real_extract(self, url):
4104 mobj = re.match(self._VALID_URL, url)
4106 raise ExtractorError(u'Invalid URL: %s' % url)
4107 video_id = mobj.group('video_id')
4109 html = self._download_webpage(url, video_id)
4110 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4112 raise ExtractorError(u'No media url found.')
4114 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4115 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4116 # choose first/default media type and highest quality for now
4117 for s in streams: #find 300 - dsl1000mbit
4118 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
# (missing: assignment of stream_ and break)
4121 for s in streams: #find veryhigh - dsl2000mbit
4122 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4126 raise ExtractorError(u'No stream found.')
# The <a href> target is itself a descriptor page, not the media.
4128 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4130 self.report_extraction(video_id)
4131 mobj = re.search(self._TITLE, html)
4133 raise ExtractorError(u'Cannot extract title')
4134 title = unescapeHTML(mobj.group('title'))
4136 mobj = re.search(self._MMS_STREAM, media_link)
# Fall back to rtsp:// when no mms:// link is present.
4138 mobj = re.search(self._RTSP_STREAM, media_link)
4140 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4141 mms_url = mobj.group('video_url')
# Derive the file extension from the final stream URL.
4143 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
# NOTE(review): 'extention' is a typo for 'extension' in this
# user-visible message (runtime string; not altered here).
4145 raise ExtractorError(u'Cannot extract extention')
4146 ext = mobj.group('ext')
4148 return [{'id': video_id,
# TumblrIE: extractor for Tumblr video posts.
# Rebuilds the canonical post URL from blog name + numeric id, then
# scrapes the escaped (\x22-quoted) player markup for the video file,
# its container extension, the first poster image, and the page title.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and some lines (guards, parts of the return dict) are gone.
4154 class TumblrIE(InfoExtractor):
4155 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4157 def _real_extract(self, url):
4158 m_url = re.match(self._VALID_URL, url)
4159 video_id = m_url.group('id')
4160 blog = m_url.group('blog_name')
4162 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4163 webpage = self._download_webpage(url, video_id)
# The player URL is embedded with \x22 escapes instead of quotes.
4165 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4166 video = re.search(re_video, webpage)
# (presumably under a missing `if video is None:` guard)
4168 self.to_screen("No video found")
4170 video_url = video.group('video_url')
4171 ext = video.group('ext')
4173 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4174 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4176 # The only place where you can get a title, it's not complete,
4177 # but searching in other places doesn't work for all videos
4178 re_title = r'<title>(?P<title>.*?)</title>'
4179 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4181 return [{'id': video_id,
# BandcampIE: extractor for free Bandcamp tracks.
# Follows the `freeDownloadPage` link, parses the javascript `items`
# dictionary on the download page, picks the mp3-320 variant and
# rebuilds a working `/statdownload/track` URL (the embedded one is
# already expired), finally reading the real link from "retry_url".
# NOTE(review): excerpt is lossy -- original line numbers are fused
# into each line and a few lines are missing.
4188 class BandcampIE(InfoExtractor):
4189 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4191 def _real_extract(self, url):
4192 mobj = re.match(self._VALID_URL, url)
4193 title = mobj.group('title')
4194 webpage = self._download_webpage(url, title)
4195 # We get the link to the free download page
4196 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4197 if m_download is None:
# NOTE(review): 'founded' is a typo for 'found' in this runtime
# string (not altered here).
4198 raise ExtractorError(u'No free songs founded')
4200 download_link = m_download.group(1)
# NOTE(review): local `id` shadows the builtin of the same name.
4201 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4202 webpage, re.MULTILINE|re.DOTALL).group('id')
4204 download_webpage = self._download_webpage(download_link, id,
4205 'Downloading free downloads page')
4206 # We get the dictionary of the track from some javascrip code
4207 info = re.search(r'items: (.*?),$',
4208 download_webpage, re.MULTILINE).group(1)
4209 info = json.loads(info)[0]
4210 # We pick mp3-320 for now, until format selection can be easily implemented.
4211 mp3_info = info[u'downloads'][u'mp3-320']
4212 # If we try to use this url it says the link has expired
4213 initial_url = mp3_info[u'url']
4214 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4215 m_url = re.match(re_url, initial_url)
4216 #We build the url we will use to get the final track url
4217 # This url is build in Bandcamp in the script download_bunde_*.js
4218 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4219 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4220 # If we could correctly generate the .rand field the url would be
4221 #in the "download_url" key
4222 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4224 track_info = {'id':id,
4225 'title' : info[u'title'],
# (missing: ext/url/duration entries of the original dict)
4228 'thumbnail' : info[u'thumb_url'],
4229 'uploader' : info[u'artist']
# RedTubeIE: extracts the mp4 <source> URL and the page title from a
# redtube.com video page identified by its numeric id.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and the `if mobj is None:` guards are missing before the
# raise statements.
4234 class RedTubeIE(InfoExtractor):
4235 """Information Extractor for redtube"""
4236 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4238 def _real_extract(self,url):
4239 mobj = re.match(self._VALID_URL, url)
4241 raise ExtractorError(u'Invalid URL: %s' % url)
4243 video_id = mobj.group('id')
4244 video_extension = 'mp4'
4245 webpage = self._download_webpage(url, video_id)
4246 self.report_extraction(video_id)
4247 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4250 raise ExtractorError(u'Unable to extract media URL')
4252 video_url = mobj.group(1)
4253 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4255 raise ExtractorError(u'Unable to extract title')
4256 video_title = mobj.group(1)
# (missing: start of the returned info dict)
4261 'ext': video_extension,
4262 'title': video_title,
# InaIE: extractor for ina.fr videos.
# Fetches the per-video MRSS feed from player.ina.fr and reads the mp4
# URL from <media:player> and the title from the CDATA <title>.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and the guard lines before the raises are missing.
4265 class InaIE(InfoExtractor):
4266 """Information Extractor for Ina.fr"""
4267 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4269 def _real_extract(self,url):
4270 mobj = re.match(self._VALID_URL, url)
4272 video_id = mobj.group('id')
4273 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4274 video_extension = 'mp4'
4275 webpage = self._download_webpage(mrss_url, video_id)
4277 mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4279 raise ExtractorError(u'Unable to extract media URL')
4280 video_url = mobj.group(1)
4282 mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4284 raise ExtractorError(u'Unable to extract title')
4285 video_title = mobj.group(1)
# (missing: start of the returned info dict)
4290 'ext': video_extension,
4291 'title': video_title,
# HowcastIE: extractor for howcast.com videos.
# Reads the mobile mp4 URL from the player config, the title from the
# og:title meta tag, plus optional description and thumbnail.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and the `if mobj is None:` guards are missing.
4294 class HowcastIE(InfoExtractor):
4295 """Information Extractor for Howcast.com"""
4296 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4298 def _real_extract(self, url):
4299 mobj = re.match(self._VALID_URL, url)
4301 video_id = mobj.group('id')
4302 webpage_url = 'http://www.howcast.com/videos/' + video_id
4303 webpage = self._download_webpage(webpage_url, video_id)
4305 self.report_extraction(video_id)
4307 mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
4309 raise ExtractorError(u'Unable to extract video URL')
4310 video_url = mobj.group(1)
# Title may be quoted with either " or ' -- two alternative groups.
4312 mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
4314 raise ExtractorError(u'Unable to extract title')
4315 video_title = mobj.group(1) or mobj.group(2)
# Description is optional: warn instead of failing.
4317 mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
4319 self._downloader.report_warning(u'unable to extract description')
4320 video_description = None
4322 video_description = mobj.group(1) or mobj.group(2)
4324 mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
4326 raise ExtractorError(u'Unable to extract thumbnail')
4327 thumbnail = mobj.group(1)
# (missing: start of the returned info dict)
4333 'title': video_title,
4334 'description': video_description,
4335 'thumbnail': thumbnail,
# VineIE: extractor for vine.co clips.
# Pulls the stream URL from the twitter:player:stream meta tag, the
# title/thumbnail from og: tags, and the uploader from the user <div>.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and the `if mobj is None:` guards are missing.
4338 class VineIE(InfoExtractor):
4339 """Information Extractor for Vine.co"""
4340 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4342 def _real_extract(self, url):
4344 mobj = re.match(self._VALID_URL, url)
4346 video_id = mobj.group('id')
4347 webpage_url = 'https://vine.co/v/' + video_id
4348 webpage = self._download_webpage(webpage_url, video_id)
4350 self.report_extraction(video_id)
4352 mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
4354 raise ExtractorError(u'Unable to extract video URL')
4355 video_url = mobj.group(1)
4357 mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
4359 raise ExtractorError(u'Unable to extract title')
4360 video_title = mobj.group(1)
# Strip any query string from the thumbnail URL via the second group.
4362 mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
4364 raise ExtractorError(u'Unable to extract thumbnail')
4365 thumbnail = mobj.group(1)
4367 mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
4369 raise ExtractorError(u'Unable to extract uploader')
4370 uploader = mobj.group(1)
# (missing: start of the returned info dict)
4376 'title': video_title,
4377 'thumbnail': thumbnail,
4378 'uploader': uploader,
# FlickrIE: extractor for Flickr-hosted videos.
# Three-step flow: (1) scrape the photo page for `photo_secret`,
# (2) fetch video_mtl_xml.gne to get the node_id, (3) fetch the
# video_playlist.gne XML and join APP + FULLPATH into the final URL.
# Metadata (title/description/thumbnail) comes from og: meta tags.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and the `if mobj is None:` guards are missing.
4381 class FlickrIE(InfoExtractor):
4382 """Information Extractor for Flickr videos"""
4383 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4385 def _real_extract(self, url):
4386 mobj = re.match(self._VALID_URL, url)
4388 video_id = mobj.group('id')
4389 video_uploader_id = mobj.group('uploader_id')
4390 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4391 webpage = self._download_webpage(webpage_url, video_id)
4393 mobj = re.search(r"photo_secret: '(\w+)'", webpage)
4395 raise ExtractorError(u'Unable to extract video secret')
4396 secret = mobj.group(1)
4398 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4399 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4401 mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
4403 raise ExtractorError(u'Unable to extract node_id')
4404 node_id = mobj.group(1)
4406 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4407 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4409 self.report_extraction(video_id)
4411 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4413 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH.
4414 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4416 mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4418 raise ExtractorError(u'Unable to extract title')
4419 video_title = mobj.group(1) or mobj.group(2)
# Description is optional: warn instead of failing.
4421 mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4423 self._downloader.report_warning(u'unable to extract description')
4424 video_description = None
4426 video_description = mobj.group(1) or mobj.group(2)
4428 mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4430 raise ExtractorError(u'Unable to extract thumbnail')
4431 thumbnail = mobj.group(1) or mobj.group(2)
# (missing: start of the returned info dict)
4437 'title': video_title,
4438 'description': video_description,
4439 'thumbnail': thumbnail,
4440 'uploader_id': video_uploader_id,
# TeamcocoIE: extractor for teamcoco.com videos.
# Resolves the numeric video id from the <article data-id> attribute,
# reads title/thumbnail/description from og: meta tags, then fetches
# the CVP XML feed to get the high-quality file URL.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and most `if mobj is None:` guards are missing.
4443 class TeamcocoIE(InfoExtractor):
4444 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4446 def _real_extract(self, url):
4447 mobj = re.match(self._VALID_URL, url)
4449 raise ExtractorError(u'Invalid URL: %s' % url)
4450 url_title = mobj.group('url_title')
4451 webpage = self._download_webpage(url, url_title)
4453 mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
4454 video_id = mobj.group(1)
4456 self.report_extraction(video_id)
4458 mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
4460 raise ExtractorError(u'Unable to extract title')
4461 video_title = mobj.group(1)
4463 mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
4465 raise ExtractorError(u'Unable to extract thumbnail')
4466 thumbnail = mobj.group(1)
4468 mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
4470 raise ExtractorError(u'Unable to extract description')
4471 description = mobj.group(1)
4473 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4474 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4475 mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
4477 raise ExtractorError(u'Unable to extract video url')
4478 video_url = mobj.group(1)
# (missing: start of the returned info dict)
4484 'title': video_title,
4485 'thumbnail': thumbnail,
4486 'description': description,
# XHamsterIE: extractor for xhamster.com movies.
# Builds the media URL from the player's 'srv'/'file' config values
# (either a pre-unquoted absolute URL or server + '/key=' + file), and
# scrapes title, description, upload date, uploader id and thumbnail.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and the guard/else lines are missing.
4489 class XHamsterIE(InfoExtractor):
4490 """Information Extractor for xHamster"""
4491 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4493 def _real_extract(self,url):
4494 mobj = re.match(self._VALID_URL, url)
4496 video_id = mobj.group('id')
4497 mrss_url='http://xhamster.com/movies/%s/.html' % video_id
4498 webpage = self._download_webpage(mrss_url, video_id)
4499 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4501 raise ExtractorError(u'Unable to extract media URL')
# Empty 'srv' means 'file' already holds a complete (quoted) URL.
4502 if len(mobj.group('server')) == 0:
4503 video_url = compat_urllib_parse.unquote(mobj.group('file'))
4505 video_url = mobj.group('server')+'/key='+mobj.group('file')
4506 video_extension = video_url.split('.')[-1]
4508 mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
4510 raise ExtractorError(u'Unable to extract title')
4511 video_title = unescapeHTML(mobj.group('title'))
# Description is optional: default to the empty string.
4513 mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
4515 video_description = u''
4517 video_description = unescapeHTML(mobj.group('description'))
4519 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4521 raise ExtractorError(u'Unable to extract upload date')
# Concatenate into the YYYYMMDD format expected by upload_date.
4522 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4524 mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
4526 video_uploader_id = u'anonymous'
4528 video_uploader_id = mobj.group('uploader_id')
4530 mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
4532 raise ExtractorError(u'Unable to extract thumbnail URL')
4533 video_thumbnail = mobj.group('thumbnail')
# (missing: start of the returned info dict)
4538 'ext': video_extension,
4539 'title': video_title,
4540 'description': video_description,
4541 'upload_date': video_upload_date,
4542 'uploader_id': video_uploader_id,
4543 'thumbnail': video_thumbnail
# HypemIE: extractor for hypem.com (Hype Machine) tracks.
# Requests the page with an `ax`/`ts` query, keeps the Set-Cookie
# header, parses the embedded displayList-data JSON for the first
# track, then calls the /serve/source endpoint (with the cookie) to
# obtain the final audio URL.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line; note in particular that the assignment of `key` (used in
# serve_url below) is missing from the visible text.
4546 class HypemIE(InfoExtractor):
4547 """Information Extractor for hypem"""
4548 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4550 def _real_extract(self, url):
4551 mobj = re.match(self._VALID_URL, url)
4553 raise ExtractorError(u'Invalid URL: %s' % url)
4554 track_id = mobj.group(1)
4556 data = { 'ax': 1, 'ts': time.time() }
4557 data_encoded = compat_urllib_parse.urlencode(data)
4558 complete_url = url + "?" + data_encoded
4559 request = compat_urllib_request.Request(complete_url)
4560 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The serve endpoint later requires this session cookie.
4561 cookie = urlh.headers.get('Set-Cookie', '')
4563 self.report_extraction(track_id)
4564 mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
# NOTE(review): 'extrack' is a typo for 'extract' in this runtime
# string (not altered here).
4566 raise ExtractorError(u'Unable to extrack tracks')
4567 html_tracks = mobj.group(1).strip()
4569 track_list = json.loads(html_tracks)
4570 track = track_list[u'tracks'][0]
4572 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4575 track_id = track[u"id"]
4576 artist = track[u"artist"]
4577 title = track[u"song"]
4579 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4580 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4581 request.add_header('cookie', cookie)
4582 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4584 song_data = json.loads(song_data_json)
4586 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4587 final_url = song_data[u"url"]
# Vbox7IE: extractor for vbox7.com play pages.
# Follows the JavaScript `window.location` redirect on the landing
# page, takes the title from <title>, then POSTs to /play/magare.do to
# obtain a '&'-separated response holding the media and thumbnail URLs.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and parts of the returned info dict are missing.
4597 class Vbox7IE(InfoExtractor):
4598 """Information Extractor for Vbox7"""
4599 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4601 def _real_extract(self,url):
4602 mobj = re.match(self._VALID_URL, url)
4604 raise ExtractorError(u'Invalid URL: %s' % url)
4605 video_id = mobj.group(1)
4607 redirect_page, urlh = self._download_webpage_handle(url, video_id)
# The redirect target in the page is relative to the fetched URL.
4608 redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
4609 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4611 title = re.search(r'<title>(.*)</title>', webpage)
4612 title = (title.group(1)).split('/')[0].strip()
4615 info_url = "http://vbox7.com/play/magare.do"
4616 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4617 info_request = compat_urllib_request.Request(info_url, data)
4618 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4619 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4620 if info_response is None:
4621 raise ExtractorError(u'Unable to extract the media url')
# Response looks like 'url=...&thumb=...': split on '&', keep values.
4622 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# (missing: start of the returned info dict)
4629 'thumbnail': thumbnail_url,
# GametrailersIE: extractor for gametrailers.com.
# Locates the MTV-style mgid on the page (attribute differs for
# full episodes vs other video types), then queries the mrss feed for
# metadata and the mediagen feed for the stream URLs, keeping the last
# (best-quality) <src> entry.
# NOTE(review): excerpt is lossy -- original line numbers are fused into
# each line and several guard lines are missing.
4632 class GametrailersIE(InfoExtractor):
4633 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4635 def _real_extract(self, url):
4636 mobj = re.match(self._VALID_URL, url)
4638 raise ExtractorError(u'Invalid URL: %s' % url)
4639 video_id = mobj.group('id')
4640 video_type = mobj.group('type')
4641 webpage = self._download_webpage(url, video_id)
# Full episodes embed the mgid in data-video; others in data-contentId.
4642 if video_type == 'full-episodes':
4643 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4645 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4646 m_mgid = re.search(mgid_re, webpage)
4648 raise ExtractorError(u'Unable to extract mgid')
4649 mgid = m_mgid.group(1)
4650 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4652 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4653 video_id, u'Downloading video info')
4654 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4655 video_id, u'Downloading video urls info')
4657 self.report_extraction(video_id)
4658 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4659 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4661 <url>(?P<thumb>.*?)</url>.*
4664 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4666 raise ExtractorError(u'Unable to extract video info')
4667 video_title = m_info.group('title')
4668 video_description = m_info.group('description')
4669 video_thumb = m_info.group('thumb')
4671 m_urls = re.finditer(r'<src>(?P<url>.*)</src>', links_webpage)
# NOTE(review): BUG -- 'ExtractError' is a typo for 'ExtractorError';
# reaching this line would raise NameError instead of the intended
# error. 'extrat' is also a typo for 'extract' in the message.
4673 raise ExtractError(u'Unable to extrat video url')
4674 # They are sorted from worst to best quality
4675 video_url = list(m_urls)[-1].group('url')
4677 return {'url': video_url,
4679 'title': video_title,
4680 # Videos are actually flv not mp4
4682 'thumbnail': video_thumb,
4683 'description': video_description,
# gen_extractors: builds the ordered list of extractor instances used
# to match URLs; the first extractor whose _VALID_URL matches wins, so
# order matters (most specific first, generic fallback last).
# NOTE(review): excerpt is lossy -- almost all entries of the returned
# list are missing from the visible text.
4686 def gen_extractors():
4687     """ Return a list of an instance of every supported extractor.
4688     The order does matter; the first extractor matched is the one handling the URL.
4691         YoutubePlaylistIE(),
4716         StanfordOpenClassroomIE(),
4726         WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``.

    Raises KeyError if no such class exists at module level.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]