2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.

    NOTE(review): this file appears to be a lossy extract; some original
    lines are elided throughout (non-contiguous source numbering).
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): at least one initialization statement appears to be
        # elided from this extract before the downloader is stored.
        self.set_downloader(downloader)
    # NOTE(review): takes ``cls`` — presumably decorated @classmethod
    # upstream; the decorator line is not visible in this extract.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Stored for use by to_screen() and the report_* helpers.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): fragment of the IE_NAME property (its ``def`` line is
        # elided); drops the trailing "IE" suffix from the class name.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): several control-flow lines (the note-is-None branch,
        # the ``try:``, and the errnote default check) are elided below.
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = u'Unable to download webpage'
            # Re-raise as the project's own error type, keeping the traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Detect the declared charset from the Content-Type header,
        # e.g. "text/html; charset=utf-8".
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
        # NOTE(review): the fallback branch for a missing charset match is
        # elided in this extract.
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump printable regardless of the page encoding.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids raising on bytes invalid in the detected encoding.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')
    # Methods for following #608.
    # They set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): the ``return video_info`` line appears elided here.

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): the remaining dict entries and the return are elided.

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): dict continuation, the ``if`` guards around the two
        # optional assignments below, and the return are elided.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
    # NOTE(review): both methods below take ``cls`` — presumably decorated
    # @classmethod upstream; the decorator lines are not visible here.
    def _make_valid_url(cls):
        # Prefix is empty (default), an explicit 1-based count, or "all".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        # NOTE(review): the ``if mobj is None:`` guard line is elided here.
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix -> first result only; "all" -> up to _MAX_RESULTS;
        # otherwise an explicit count (some branch lines elided below).
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
229 def _get_n_results(self, query, n):
230 """Get a specified number of results for a query"""
231 raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): the ``_VALID_URL = r'''(?x)...`` opening line is elided
    # in this extract; the lines below are the body of that verbose
    # (re.VERBOSE) regular expression.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension map; NOTE(review): most entries elided here.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> resolution label map; NOTE(review): entries elided here.
    _video_dimensions = {
    # NOTE(review): presumably @classmethod upstream; decorator elided.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match the video regex; defer them to
        # YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for a language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
324 def report_video_subtitles_available(self, video_id, sub_lang_list):
325 """Report available subtitles."""
326 sub_lang = ",".join(list(sub_lang_list.keys()))
327 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the ``try:`` line is elided in this extract.
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Errors are returned as a (message, None) tuple, not raised.
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Map lang_code -> human-readable language name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): the success-path return of sub_lang_list is elided.
354 def _list_available_subtitles(self, video_id):
355 sub_lang_list = self._get_available_subtitles(video_id)
356 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        # NOTE(review): the docstring delimiters are elided; the method
        # returns a tuple of the form:
        (error_message, sub_lang, sub)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        # NOTE(review): the query-parameter dict entries are elided here.
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            # Empty response: the language/format pair yielded nothing.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives inside the player's JSON config blob.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
        # NOTE(review): the ``if mobj is None:`` guard is elided here.
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
        # NOTE(review): the ``try:`` line is elided here.
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
            # NOTE(review): the parameter dict entries are elided here.
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
            # NOTE(review): the except clause line is elided here.
            return [(err_msg, None, None)]
    def _extract_subtitle(self, video_id):
        # NOTE(review): docstring delimiters elided; the method returns:
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # NOTE(review): the 'en' assignment and ``else:`` lines are elided.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # NOTE(review): the trailing return of [subtitle] is elided.
    def _extract_all_subtitles(self, video_id):
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): the ``subtitles = []`` initialization is elided here.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # NOTE(review): the trailing ``return subtitles`` is elided.
    def _print_formats(self, formats):
        """Print each itag with its extension and resolution label."""
        print('Available formats:')
        # NOTE(review): the ``for x in formats:`` line is elided here.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set language, then log in and confirm age if credentials exist.

        NOTE(review): numerous lines (``try:``/``if``/``return`` guards and
        several dict entries) are elided from this extract.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force English so later regexes match the page text reliably.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the GALX/dsh hidden form tokens required by the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # NOTE(review): the ``login_form_strs = {`` opener and several
        # entries of the form are elided below.
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were bad.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age; NOTE(review): the ``age_form = {`` opener is elided.
                'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Return the video ID matched by group 2 of _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the ``if mobj is None:`` guard line is elided here.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        # NOTE(review): the trailing ``return video_id`` is elided.
    def _real_extract(self, url):
        """Extract video info dict(s) for a YouTube URL.

        NOTE(review): numerous control-flow lines (``try:``/``if``/``else``/
        ``return``) and a few statements are elided from this extract.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize separators to spaces before parsing the date.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                # We try with the automatic captions
                video_subtitles = self._request_automatic_caption(video_id, video_webpage)
                (sub_error_auto, sub_lang, sub) = video_subtitles[0]
                # We report the original error
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # NOTE(review): the ``url_map = {}`` initialization is elided.
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the ``results.append({`` opener is elided here.
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): the ``try:`` line is elided here.
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age; NOTE(review): the ``disclaimer_form = {`` opener and
        # some entries are elided here.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): the ``try:`` line is elided here.
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract the video info dict for a metacafe.com URL.

        NOTE(review): several guard/``else`` lines and the ``return [{``
        opener are elided from this extract.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed IDs to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: parse the flashvars blob for the media URL and key.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def _real_extract(self, url):
        """Extract the video info dict for a Dailymotion URL.

        NOTE(review): several guard/``else`` lines and the ``return [{``
        opener are elided from this extract.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Strip slug and query string from the ID segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # The family-filter cookie must be off to reach all videos.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'
    def _real_extract(self, url):
        """Extract the video info dict for a photobucket.com URL.

        NOTE(review): several guard lines and both ``return [{`` openers are
        elided from this extract.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
            info = json.loads(mobj.group('json'))
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Return the information dictionary for a screen.yahoo.com URL."""
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # Some pages embed an alternative content id in a YUI snippet;
        # its presence selects which extraction path is taken below.
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Verbose regex pulling title/description/date/thumbnail out of the MRSS feed
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # pubStart date arrives as m/d/Y; normalise to YYYYMMDD
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')
            raise ExtractorError(u'Unable to extract video url')
        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The YQL response arrives wrapped in a JSONP callback; unwrap it
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        'play_path': video_path,
        'title':video_title,
        'description': video_description,
        'thumbnail': video_thumb,
        'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        """Return the information dictionary for a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            # Canonicalise player/pro links to the plain video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
        raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer hd, then sd, then whatever else was found
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in index-<n>.html; used to pick the extraction path
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body, wrapping
        network failures in ExtractorError."""
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* to it and return a dict built from
        *matchTuples*, each a (group index, key, error message) triple."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)

        raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
                info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Extract streaming parameters for an arte.tv live page."""
        # language code sits four path segments from the end of the URL
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
                (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract metadata for an arte.tv "+7" catch-up video by
        following the chain of referenced XML documents."""
        # language code sits three path segments from the end here
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
                (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
                (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)

        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': unified_strdate(info.get('date')),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Only warn outside the test suite; the fallback is expected there
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass whose method is HEAD, so no body is fetched
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD has no payload
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener wired with the HEAD/redirect handlers above
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        self.report_following_redirect(new_url)

    def _real_extract(self, url):
        """Last-resort extraction: follow redirects, then scrape the
        page for common flash-player file= patterns."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # jsonc gdata endpoint: %s = query, %i = 1-based start index
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # The API serves 50 results per page; keep paging until limit
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more results than the API reports to exist
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Presence of the "next" pager link means more result pages exist
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        '_type': 'playlist',

        # 10 results per page; page until n collected or no next link
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                'url': mobj.group(1)
                res['entries'].append(e)

            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        '_type': 'playlist',

        # The endpoint returns JSON pages of 30 results each
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when n results collected or the feed says it is done
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex: accepts playlist/course/artist/watch URLs and raw
    # PL/EC/UU playlist ids
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so pass the flag here
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Resolve the playlist id and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
        # Keep (position, url) pairs so the playlist order survives paging
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        if len(response['feed']['entry']) < self._MAX_RESULTS:

        # Sort by position, then drop the position component
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker in the HTML that signals more pages are available
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect the unique video ids from *page*'s /watch links."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        """Return a playlist result with every video of the channel."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            # The ajax endpoint answers with JSON wrapping the HTML
            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return a playlist result with all uploads of the user."""
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1

        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist result with all episodes of the user."""
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric users_id for the API lives in a data attribute
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Return the information dictionary for a depositfiles URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the whitespace in the site's restriction notice
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials.

        Login failures only produce warnings; extraction proceeds anyway.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Return the information dictionary for a Facebook video URL."""
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two javascript markers
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source when present, otherwise fall back to SD
        video_url = video_data.get('hd_src')
        video_url = video_data['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Pulls the filename extension off a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Return the information dictionary for a blip.tv URL."""
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect; the real file id is carried in the
            # fragment of the redirect target. Re-enter with the plain URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Ask blip.tv for the JSON description of the video
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']

            # blip.tv datestamps look like '08-15-12 10:30AM'
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    # NOTE(review): this excerpt is elided — loop headers, `try:` wrappers,
    # several `if ... is None:` guards and assignment targets (e.g. `GK = (`,
    # `sk = ...`, `info = {`) around the orphaned lines below are missing.

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # Standard RC4: key-scheduling pass over a 256-entry box, then the
        # keystream pass XORing each input byte (second loop header elided).
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])

        # NOTE(review): the `def __md5(self, s):` header for this return is
        # missing from the excerpt.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded secret; decoded twice below to help derive
        # the RC4 key for the encrypted player XML (`GK = (` line elided).
        b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
        b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
        b'TnpsbA0KTVRkbU1tSTRNdz09'

        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Fast path: the page embeds a plain <source> tag -> direct .flv URL.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            mobj = re.search('<title>([^<]+)</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
            video_title = mobj.group(1)

            mobj = re.search('[.](.+?)$', video_url)
            raise ExtractorError(u'Unable to extract extention')
            video_ext = mobj.group(1)

            # NOTE(review): stray entries from an elided return-dict literal.
                'upload_date': None,
                'title': video_title,

        # Slow path (RTMP): collect flashvars, fetch encrypted XML, decrypt.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        raise ExtractorError(u'Unable to extract video')

        # Split flashvars into key:value pairs; '_encxml' holds the XML URL.
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            # Replacement player-XML endpoint (assignment target elided).
            'http://www.myvideo.de/dynamic/get_player_video_xml.php'
            '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'

        # Response is 'key=<hex>'; take the hex payload and decrypt with RC4.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key material: double-decoded GK plus the video id (the `sk = `
        # assignment wrapping these lines is elided).
        base64.b64decode(base64.b64decode(GK)) +
        str(video_id).encode('utf-8')

        dec_data = self.__rc4crypt(enc_data_b, sk)

        self.report_extraction(video_id)

        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        raise ExtractorError(u'unable to extract rtmpurl')
        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_rtmpurl:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        if (video_rtmpurl is None) or (video_rtmpurl == ''):
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            raise ExtractorError(u'unable to extract url')
            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        mobj = re.search('source=\'(.*?)\'', dec_data)
        raise ExtractorError(u'unable to extract swfobj')
        video_file = compat_urllib_parse.unquote(mobj.group(1))

        # f4m manifests are mapped to their .m3u8 HLS equivalent; other files
        # become an RTMP play path of the form '<ext>:<path>'.
        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
        raise ExtractorError(u'unable to extract swfobj')
        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
        raise ExtractorError(u'unable to extract title')
        video_title = mobj.group(1)

        # NOTE(review): stray entries from an elided return-dict literal.
            'url': video_rtmpurl,
            'tc_url': video_rtmpurl,
            'upload_date': None,
            'title': video_title,
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # NOTE(review): this excerpt is elided — the closing `$"""` of _VALID_URL,
    # the bodies of _video_extensions/_video_dimensions, `if ... is None:`
    # guards and the `info = {...}` literal near the end are missing.

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # Verbose regex (compiled with re.VERBOSE everywhere below).
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, lowest to highest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Shortname forms (':tds', ':colbert', ...) map to the show's
        # full-episodes page, then are re-matched against _VALID_URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            # No episode in the URL -> download the newest one (redirect).
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        # When fetching the newest episode, follow the redirect and re-match
        # so we know which concrete episode we ended up on.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            raise ExtractorError(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS feed lists the episode's parts (acts) as <item> elements.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part mediaGen config XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) tuples (the `turls` init is elided).
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                    format, rtmp_video_url = f, v

            # Rewrite the RTMP URL into a plain HTTP download URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # NOTE(review): stray entries from an elided `info = {...}` literal.
                'upload_date': officialDate,
                'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # NOTE(review): elided excerpt — `if mobj is None:`, `try:` and the
    # return-dict literal around the stray entries below are missing.

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Page metadata: description, thumbnail, and the flash player URL
        # whose `config=` query parameter points at the playlist config.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        # Single->double quotes so json.loads accepts it.
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Index 1 holds the actual video entry (index 0 is presumably an
        # intro/ad — TODO confirm against a live config).
        videoUrl = playlist[1]['url']

        # NOTE(review): stray entries from an elided return-dict literal.
            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): elided excerpt — `try:` wrappers, the `info = {` literal
    # and some except/guard lines around the orphaned lines below are missing.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): stray entry from an elided `info = {...}` literal.
            'upload_date': None,

        self.report_extraction(video_id)
        # First request: moogaloop metadata XML (title, thumbnail, manifest URL).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
            raise ExtractorError(u'Invalid metadata XML file')

        # Second request: the Adobe HDS (f4m) manifest for the actual stream.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Build the direct segment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # NOTE(review): elided excerpt — the `if mobj is None:` guards and the
    # return-dict literal around the stray entries below are missing.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded inside the flashvars).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the page <title>, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        raise ExtractorError(u'Unable to extract video thumbnail')
        # group(0): the whole matched thumbnail URL, not just the filename.
        video_thumbnail = mobj.group(0)

        # NOTE(review): stray entries from an elided return-dict literal.
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # NOTE(review): elided excerpt — the `if mobj is None:` guard and the
    # return-dict wrapper around the trailing entries are missing.

    # Matches http://soundcloud.com/<uploader>/<slug>.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL to the track's numeric API id.
        # NOTE(review): client_id is hard-coded; also duplicated in
        # SoundcloudSetIE below.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch stream definitions; 'http_mp3_128_url' is the direct MP3.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        # NOTE(review): stray entries from an elided return-dict literal.
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # NOTE(review): elided excerpt — the `if mobj is None:` guard and the
    # per-track dict literal around the trailing entries are missing.

    # Matches http://soundcloud.com/<uploader>/sets/<slug>.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set URL to its JSON description (tracks included).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            # API-level errors are reported individually; extraction aborts.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        # One result entry per track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # NOTE(review): stray entries from an elided per-track dict literal.
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    # NOTE(review): elided excerpt — `if mobj is None:` guards and the
    # return-dict literal around the trailing entries are missing.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page carries a base64-encoded RTMP path.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title from the page's JS variable.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id/extension from the media filename at the end of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # NOTE(review): stray entries from an elided return-dict literal.
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # NOTE(review): elided excerpt — `try:` wrappers, several returns and
    # the `info = {` literal near the end are missing from the visible text.
    # The many `.decode(...)` calls on str below are py2-era artifacts and
    # would raise AttributeError on py3; the IE is already disabled.

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
            # Formats with bitrate info map bitrate -> url list; pick the
            # requested (or highest) bitrate.
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a HEAD-like GET; first success wins
        # (the `try:`/`return` lines are elided here).
        for url in url_list:
                compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Try every format, keeping the first with a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # NOTE(review): elided excerpt — `try:` wrappers, the `info = {` literals
    # and the list-comprehension wrappers around the stray dict entries below
    # are missing from the visible text.

    # Three URL shapes: a specific video, a course page, or the site root.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): stray entries from an elided `info = {...}` literal.
                'id': course + '_' + video,
                'upload_date': None,

            self.report_extraction(info['id'])
            # Per-video XML metadata lives next to the media files.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE(review): stray entry from an elided `info = {...}` literal.
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
                info['title'] = unescapeHTML(m.group(1))
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
                info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a 'reference' entry, recursively
            # extracted below via self.extract().
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # Root page: enumerate every course and recurse into each.
            # NOTE(review): stray entries from an elided `info = {...}` literal.
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    # NOTE(review): elided excerpt — `if mobj is None:` / `try:` wrappers and
    # the return-dict literal around the trailing entries are missing. The
    # `.decode('iso-8859-1')` calls on str are py2-era artifacts.

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        # Scheme-less URLs are accepted; default to http for the request.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name and performer come from MTV's meta tags.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        raise ExtractorError(u'Unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen endpoint returns the rendition list as XML.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # Format label built from mime subtype, dimensions and bitrate.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
            raise ExtractorError('Invalid rendition field.')

        # NOTE(review): stray entries from an elided return-dict literal.
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    # Information extractor for v.youku.com.
    # NOTE(review): elided excerpt — the `def _gen_sid(self):` header, loop
    # headers, format-selection branches and `info = {` literal are missing
    # from the visible text.

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    # NOTE(review): body of the elided `_gen_sid` — builds a pseudo-unique
    # session id from the current time in ms plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffles the alphabet below using Youku's LCG
        # (seed * 211 + 30031 mod 65536); used as a substitution table.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # Maps each '*'-separated index through the seeded mix string to
        # recover the real file id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)

            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Pick a stream format: explicit --format, else best/worst
            # among the formats the server says are available.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
            elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Each segment gets its own URL keyed by segment index + token.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # NOTE(review): stray entries from an elided `info = {...}` literal.
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # Regexes applied to the watch page below.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        # NOTE(review): the `if ... is None:` guards preceding each raise
        # below are not visible in this copy of the source.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv url is percent-encoded inside a flashvars-style parameter.
        result = re.search(self.VIDEO_URL_RE, webpage)
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # Partial info dict (the surrounding literal is not visible here).
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report that the post entry page is being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the upload date found for the entry."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader found for the entry."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title found for the entry."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # NOTE(review): several `if mobj is None:` guards, a `try:` opener
        # and the final return are not visible in this copy of the source.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract the uploader (post author).
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # NOTE(review): the `try:` opener for the except below is not visible.
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # Partial info dict (the surrounding literal is not visible here).
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        # NOTE(review): `if ... is None:` guards and the final return are not
        # visible in this copy of the source.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN url is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Search the page; unescape group 1 on match, else return default.
            m = re.search(rexp, webpage)
                return unescapeHTML(m.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')

            # NOTE(review): 'uploader_date' looks like a typo for
            # 'upload_date' -- confirm against the info-dict contract.
            'id': shortened_video_id,
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # NOTE(review): the tail of this verbose regex literal (closing quotes)
    # and several statements of the methods below are not visible in this
    # copy of the source -- verify against VCS.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # An error response is a dict carrying an 'error' key.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
                # NOTE(review): the guard on video_url and the `info = []`
                # accumulator are not visible in this copy.
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                'title': video_title,
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': video_date,
                'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            # Whole-channel download: page through the archives API.
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the archive entry matching the chapter's archive id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Chapter metadata (title, preview, channel) comes from kraken.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            'id': u'c' + chapter_id,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
            # Single-broadcast branch (its `else:` opener is not visible).
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API until a short (non-full) page is returned.
        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        # NOTE(review): `if ... is None:` guards and the final return are not
        # visible in this copy of the source.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player headline; fall back to the <title> tag.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        # Description is optional on the page.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
            desc = unescapeHTML(m.group('desc'))

        'description': desc,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com videos."""
    # Verbose regex, matched with re.VERBOSE in suitable()/_real_extract().
    # NOTE(review): the `gameID` group referenced below and the closing
    # quotes of this literal are not visible in this copy of the source.
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # The agecheck url bypasses Steam's age gate for mature titles.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # Pair up urls, titles and thumbnails positionally.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            'title': unescapeHTML(title),
            'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Direct CDN url derived from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        # Any failing .group() below lands in the AttributeError handler.
        # NOTE(review): the `try:` opener is not visible in this copy.
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
        uploader = unescapeHTML(m.group('uploader').strip())
        m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
        thumb = m.group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        'uploader': uploader,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # The player passes the file url to flash via so.addVariable.
        _src_url = r'so\.addVariable\("file","(.*?)"\)'

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        mobj = re.search(_src_url, webpage_src)

        if mobj is not None:
            video_url = mobj.group(1)
            if 'mp4' in video_url:
            # NOTE(review): the branch bodies and the `else:` opener for
            # this raise are not visible in this copy of the source.
            raise ExtractorError(u'Cannot find video url for %s' % video_id)

        mobj = re.search(r"<title>(.*)</title>", webpage_src)
            raise ExtractorError(u'Cannot determine title')
        title = mobj.group(1)

        mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
            # Candy pages carry the real title in a dedicated span.
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)

        'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as JSON inside an inline <script>.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        # NOTE(review): the `try:` opener for this except is not visible.
        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'  # request the 256 kbit/s stream
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # Partial info dict (the surrounding literal is not visible here).
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the entry whose 'format' equals req_format.
        # NOTE(review): the enclosing loop and returns are not visible here.
            if(x["format"]==req_format):

    def _real_extract(self, url):
        # NOTE(review): many guard/else lines of this method are not visible
        # in this copy of the source -- verify control flow against VCS.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Age-gate bypass cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # Format is encoded in the path as "<size>_<bitrate>".
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        title = u'%s-%s-%s' % (video_title, size, bitrate)

        # Partial per-format info dict (surrounding literal not visible).
        'uploader': video_uploader,
        'upload_date': upload_date,
        'description': None,

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # 'best' -> first entry; 'worst' -> last; '-1'/'all' -> every format.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
                raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        # NOTE(review): the `if ... is None:` guards preceding each raise
        # below are not visible in this copy of the source.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Title comes straight from the URL; no page parsing needed for it.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
            # NOTE(review): message says "title" but this is the upload-date
            # lookup -- likely a copy-paste error in the error string.
            raise ExtractorError(u'Unable to extract video title')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        # NOTE(review): the `if ... is None:` guards preceding each raise
        # below are not visible in this copy of the source.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')  # numeric id from the embed url

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv url is handed to the flash player via encodeURIComponent.
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): `if mobj is None:` guard not visible here.
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # Random session id for the play API.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): the `mix_id` assignment used below is not visible in
        # this copy of the source (presumably mix_id = data['id']).
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # Walk the play API track by track until at_last_track is reported.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Video and thumbnail urls derive directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader'))
        # Partial info dict (the surrounding literal is not visible here).
        'thumbnail': thumbnail,
        'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex, matched with re.VERBOSE below.  NOTE(review): parts of
    # this literal (alternation/closing quotes) are not visible in this copy.
    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
            # Playlist branch (its `else:` opener is not visible here).
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the opening of this verbose-regex literal
        # (video_RE = r'''...) is not visible in this copy of the source.
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Delegate each talk to the TED extractor via a url result.
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails script block carries the numeric id and media slug.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # Partial info dict (the surrounding literal is not visible here).
        'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
            # NOTE(review): the guard for this fallback (empty last element)
            # is not visible in this copy of the source.
            _, video_id = os.path.split(url_parent_path)

        # Fetch the XML metadata document for this id.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): intervening fallback lines are not visible here;
            # as shown this assignment would sit in the None branch.
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # Partial info dict (the surrounding literal is not visible here).
        'thumbnail': thumbnail,
        'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
            # NOTE(review): `if m is None:` guard not visible here.
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # A per-video XML document lists the available files.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]  # take the last entry in the document
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # Partial info dict (the surrounding literal is not visible here).
        'title': video_title,
        'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        # NOTE(review): the `if ... is None:` guards preceding each raise
        # below are not visible in this copy of the source.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The player config carries the direct file url.
        m = re.search(r'file: "(.*?)",', webpage)
            raise ExtractorError(u'Unable to find video url')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional on the page.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
            desc = unescapeHTML(m.group('desc'))

        m = re.search(r'By:.*?(\w+)</a>', webpage)
            uploader = clean_html(m.group(1))

        # Partial info dict (the surrounding literal is not visible here).
        'description': desc,
        'uploader': uploader
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        # NOTE(review): the if/else openers around the two assignments below
        # are not visible in this copy of the source.
        numid = re.search(r'documentId=([0-9]+)', url)
            video_id = numid.group(1)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            # No streams found: FSK-restricted videos are only served at night.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
            # HTTP branch (its `else:` opener is not visible in this copy).
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek.

    The page links several stream variants; we pick a wstreaming (Windows
    Media / mms) variant, download its ASX-style playlist, and pull the
    final mms:// (or rtsp://) URL out of it.
    """
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # BUGFIX: a list comprehension is never None; test emptiness instead.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # BUGFIX: initialize stream_ so the "no match" check below cannot
        # raise NameError when neither loop finds a candidate.
        stream_ = None
        for s in streams:  # find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:  # find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL is embedded with \x22-escaped quotes in inline JS.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Not all posts matched by _VALID_URL contain a video.
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for Bandcamp free track downloads."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs founded')

        download_link = m_download.group(1)
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Direct media URL from the HTML5 <source> element.
        mobj = re.search(r'<source src="' + '(.+)' + '" type="video/mp4">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Metadata (including the mp4 URL) lives in a per-video MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Re-canonicalize the URL from the extracted id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        # og:title content may be single- or double-quoted.
        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        # Description is optional: warn and continue without it.
        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if mobj is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The raw stream URL is exposed via the twitter:player:stream meta tag.
        mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # Strip any query string from the thumbnail URL.
        mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = mobj.group(1)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo secret is needed for the two-step XML playlist lookup below.
        mobj = re.search(r"photo_secret: '(\w+)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = mobj.group(1)

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract node_id')
        node_id = mobj.group(1)

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # Final URL = APP prefix + HTML-unescaped FULLPATH.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        # Description is optional: warn and continue without it.
        mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1) or mobj.group(2)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id is only available inside the page markup.
        mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
        video_id = mobj.group(1)

        self.report_extraction(video_id)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract description')
        description = mobj.group(1)

        # The media URL itself comes from a separate per-video XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server part: 'file' is a complete (percent-encoded) URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Description is optional; default to the empty string.
        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
        if mobj is None:
            video_description = u''
        else:
            video_description = unescapeHTML(mobj.group('description'))

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract upload date')
        # Compose the YYYYMMDD form expected by the downloader.
        video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')

        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
        if mobj is None:
            video_uploader_id = u'anonymous'
        else:
            video_uploader_id = mobj.group('uploader_id')

        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail URL')
        video_thumbnail = mobj.group('thumbnail')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped request is needed to obtain a valid session cookie.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE | re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Unable to extrack tracks')
        html_tracks = mobj.group(1).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The serve endpoint returns JSON with the final media URL; it
        # requires the session cookie obtained above.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    "mp3",
            'title':  title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page issues a JS redirect; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = re.search(r'<title>(.*)</title>', webpage)
        title = (title.group(1)).split('/')[0].strip()

        ext = "flv"
        # The media URL comes from a form-encoded POST to the player backend.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like "...=<final_url>&...=<thumbnail_url>".
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the body of this function (the full list of extractor
    # instances) is elided in this view; only three of the entries are
    # visible below. Order is significant, per the docstring above.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named <ie_name>IE in this module."""
    class_name = '%sIE' % ie_name
    return globals()[class_name]