2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this excerpt is missing interleaved lines (the original
    # line numbering jumps); "[elided: ...]" comments below mark those gaps.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # [elided: one line, presumably initialising a "ready" flag -- TODO confirm]
        self.set_downloader(downloader)

    # NOTE(review): matches `url` against cls._VALID_URL, so this reads as a
    # classmethod; the @classmethod decorator is not visible in this excerpt.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    # [elided: `working()` definition line -- only its docstring is visible]
        """Getter method for _WORKING."""

    # [elided: `initialize()` definition line -- only parts of its body are visible]
        """Initializes an instance (authentication, etc)."""
        # [elided: presumably a run-once guard -- TODO confirm]
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # [elided: one line, presumably `self.initialize()` -- TODO confirm]
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # [elided: IE_NAME property definition -- only its body is visible.
    #  Strips the trailing "IE" from the class name.]
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # [elided: `if note is None:`]
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        # [elided: `try:`]
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # [elided: `if errnote is None:` guard]
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pick the charset parameter out of the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # [elided: `if m:` and an `else:` branch choosing a fallback encoding]
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # [elided: `try:`]
                url = url_or_request.get_full_url()
            except AttributeError:
                # [elided: fallback, presumably `url = video_id` -- TODO confirm]
            self.to_screen(u'Dumping request to ' + url)
            # base64-encode so binary responses survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # [elided: presumably `return video_info` -- TODO confirm]

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # [elided: remaining dict entries (url / ie key) and the return]

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # [elided: 'entries' entry, closing brace, and an `if playlist_id:` guard]
            video_info['id'] = playlist_id
        # [elided: `if playlist_title:` guard]
            video_info['title'] = playlist_title
        # [elided: return]

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        # [elided: `else:` branch looping over `p` in the pattern list]
            mobj = re.search(p, string, flags)

        # Colourise the field name on capable terminals (not Windows).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        # [elided: `else: _name = name`]

        # [elided: `if mobj:`]
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            # [elided: `return default`]
        # [elided: presumably `elif fatal:`]
            raise ExtractorError(u'Unable to extract %s' % _name)
        # [elided: `else:` -- non-fatal failure only warns]
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        # [elided: `if res:` guard and its `else` branch]
            return clean_html(res).strip()
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    # NOTE(review): reads as a classmethod (takes `cls`); the @classmethod
    # decorator is not visible in this excerpt.
    def _make_valid_url(cls):
        # prefix is empty (default: one result), a positive integer, or "all".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    # NOTE(review): same as above -- @classmethod decorator not visible here.
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # [elided: `if prefix == '':` -- empty prefix means a single result]
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        # [elided: `else:` branch converting the prefix to int `n`, and a
        #  `if n <= 0:` guard]
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): "sublclasses" is a typo for "subclasses" in the message
        # below (runtime string, left unchanged in this documentation pass).
        raise NotImplementedError("This method must be implemented by sublclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # [elided: the `_VALID_URL = r"""^` opening and closing lines of the
    #  verbose-mode regex -- only its interior is visible below]
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                             )?                                               # optional -> youtube.com/xxxx is OK
                         )?                                                   # all until now is optional -> you can pass the naked ID
                         ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                         (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; only one entry of the mapping is visible here.
    _video_extensions = {
        # [elided: most itag -> extension entries]
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # [elided: remaining entries and closing brace]
    # itag -> "WxH" display string; entries elided in this excerpt.
    _video_dimensions = {
    # [elided: itag -> dimension entries and closing brace]

    # NOTE(review): @classmethod decorator not visible in this excerpt.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report that we are checking which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    # NOTE(review): takes no `self`, so this reads as a @staticmethod; the
    # decorator is not visible in this excerpt.
    def _decrypt_signature(s):
        """Decrypt the key the two subkeys must have a length of 43"""
        # [elided: splitting `s` into subkeys `a` and `b` -- TODO confirm]
        if len(a) != 43 or len(b) != 43:
            raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid')
        # Rebuild b by splicing fixed positions from a and b, truncated to 40.
        b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
        s_dec = '.'.join((a,b))[::-1]
        # [elided: final return of the decrypted signature -- TODO confirm]

    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle-language list; on error returns an (errmsg, None) tuple."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # [elided: `try:`]
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # lang_code -> track name
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # [elided: `return sub_lang_list` -- TODO confirm]

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for this video."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download a single subtitle track; returns a tuple
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        # [elided: query parameters (lang/name/v/fmt) and closing brace]
        url = 'http://www.youtube.com/api/timedtext?' + params
        # [elided: `try:`]
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # [elided: `if not sub:` guard]
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives inside the inline ytplayer.config JSON blob.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
        # [elided: `if mobj is None:` guard]
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
        # [elided: `try:` around the key accesses below]
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
            # [elided: caption query parameters and closing brace]
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
        # [elided: `except KeyError:` -- TODO confirm the exception type]
            return [(err_msg, None, None)]

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # [elided: `sub_lang = 'en'` and the final `else:` branch]
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # [elided: presumably `return [subtitle]` -- TODO confirm]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # [elided: `subtitles = []` initialisation]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # [elided: `return subtitles`]

    def _print_formats(self, formats):
        """Print one line per itag: "itag : ext [WxH]"."""
        print('Available formats:')
        # [elided: `for x in formats:` loop header]
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:
            # [elided: `return`]

        # [elided: username/password default initialisation]
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [elided: `try:`]
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # [elided: `if info is not None:` branch using info, `else:`]
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                # [elided: `return`]

        # Set language (best-effort: only warns on failure)
        request = compat_urllib_request.Request(self._LANG_URL)
        # [elided: `try:` and `self.report_lang()`]
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # [elided: `if username is None: return` guard -- TODO confirm]

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # [elided: `try:`]
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            # [elided: `return`]

        # Scrape the anti-forgery tokens out of the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        # [elided: `if match:` guard]
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # [elided: dsh capture and the opening of the login_form_strs dict,
        #  including the Email/Passwd/GALX/dsh entries]
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # [elided: remaining form fields and closing brace]
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # [elided: the rest of this explanatory comment]
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # [elided: `try:` and `self.report_login()`]
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                # [elided: `return`]
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # [elided: `return`]

        # Confirm age
        # [elided: opening of the age_form dict (`next_url` entry)]
            'action_confirm': 'Confirm',
        # [elided: closing brace]
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # [elided: `try:`]
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the YouTube video id captured by _VALID_URL, or raise."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        # [elided: `return video_id`]

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # [elided: `if mobj:` guard]
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page.
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # [elided: `try:`]
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # [elided: `if mobj is not None:` -- un-escape the JSON-escaped URL]
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        # [elided: `else: player_url = None` -- TODO confirm]

        # Fetch get_video_info, trying several `el` variants until one
        # yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    # [elided: presumably `note=False,` -- TODO confirm]
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # [elided: `break`]
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            # [elided: `else:`]
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (non-fatal)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # [elided: `if mobj is not None:` guard]
            video_uploader_id = mobj.group(1)
        # [elided: `else:`]
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail (non-fatal)
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            # [elided: fallback assignment to video_thumbnail -- TODO confirm]
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the page, normalised to YYYYMMDD.
        # [elided: `upload_date = None` initialisation -- TODO confirm]
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # [elided: `if mobj is not None:` guard]
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description: element first, <meta> tag as fallback.
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # [elided: `else:` branch]
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            # [elided: `if fd_mobj:` guard]
                video_description = unescapeHTML(fd_mobj.group(1))
            # [elided: `else:`]
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # [elided: `if video_subtitles:` guard]
                (sub_error, sub_lang, sub) = video_subtitles[0]
                # [elided: `if sub_error:` guard]
                    # We try with the automatic captions
                    video_subtitles = self._request_automatic_caption(video_id, video_webpage)
                    (sub_error_auto, sub_lang, sub) = video_subtitles[0]
                    # [elided: check on whether the automatic caption succeeded]
                        # We report the original error
                        self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                # [elided: `if sub_error:` guard]
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            # [elided: `return` -- listing ends extraction]

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            # [elided: fallback assignment to video_duration]
        # [elided: `else:`]
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        # [elided: `try:` around the ytplayer.config parsing below]
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
            info = json.loads(mobj.group(1))
            # [elided: presumably `args = info['args']` -- TODO confirm]
            # NOTE(review): the condition below is always True -- `or 'dashmpd'`
            # tests the truthiness of the string literal, not membership.
            # Presumably it was meant to be `or 'dashmpd' in args` (or similar);
            # as written, every video with a parseable config takes this branch.
            # TODO confirm against upstream before relying on this behaviour.
            if args.get('ptk','') == 'vevo' or 'dashmpd':
                # Vevo videos with encrypted signatures
                self.to_screen(u'Vevo video detected.')
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        # [elided: `except` clause swallowing parse failures -- TODO confirm]

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> download URL map.
            # [elided: `url_map = {}` initialisation]
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    # [elided: `elif 's' in url_data:` -- encrypted signature]
                        signature = self._decrypt_signature(url_data['s'][0])
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # [elided: `else:`]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # [elided: `return` -- listing ends extraction]
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [elided: `else:` branch]
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # [elided: `if rf in url_map:` guard]
                        video_url_list = [(rf, url_map[rf])]
                        # [elided: `break`]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        # [elided: `else:`]
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # [elided: `results = []` initialisation -- TODO confirm]
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # [elided: `results.append({` opener and the 'id' entry]
                'url':          video_real_url,
                'uploader':     video_uploader,
                'uploader_id':  video_uploader_id,
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension,
                'format':       video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            # [elided: closing `})` and the final `return results`]
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST an over-18 confirmation."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # [elided: `try:`]
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # [elided: opening of the disclaimer_form dict (e.g. a 'filters' entry)]
            'submit': "Continue - I'm over 18",
        # [elided: closing brace]
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # [elided: `try:`]
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # [elided: `if mobj is not None:` -- direct mediaURL branch]
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # [elided: `if mobj is None:` fallback and `else:`]
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # [elided: `else:` -- flashvars fallback branch]
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # [elided: `if mobj is None:` guard]
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            # [elided: `if mobj is None:` guard]
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode('utf-8') on these values is a Python-2 idiom;
        # on Python 3 `str` has no .decode -- TODO confirm target interpreter.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # [elided: `return [{` opener]
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            # [elided: 'upload_date' entry]
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        # [elided: closing `}]`]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the part before any "_<slug>" suffix or query string.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality present, scanning from highest to lowest.
        # [elided: initialisation/selection of `max_quality`]
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # [elided: `if key in flashvars:` guard assigning max_quality]
                self.to_screen(u'Using %s' % key)
                # [elided: `break`]
        # [elided: `else:` -- no quality key found]
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # [elided: `if mobj is None:` guard]
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # NOTE(review): this None initialisation is immediately overwritten.
        video_uploader = None
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # [elided: `if mobj is not None:` guard]
            # DD-MM-YYYY scraped from the page, reassembled as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # [elided: `return [{` opener with 'id' and 'url' entries]
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        # [elided: closing `}]`]
991 class PhotobucketIE(InfoExtractor):
992 """Information extractor for photobucket.com."""
994 # TODO: the original _VALID_URL was:
995 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
996 # Check if it's necessary to keep the old extracion process
997 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
998 IE_NAME = u'photobucket'
1000 def _real_extract(self, url):
1001 # Extract id from URL
1002 mobj = re.match(self._VALID_URL, url)
1004 raise ExtractorError(u'Invalid URL: %s' % url)
1006 video_id = mobj.group('id')
1008 video_extension = mobj.group('ext')
1010 # Retrieve video webpage to extract further information
1011 webpage = self._download_webpage(url, video_id)
1013 # Extract URL, uploader, and title from webpage
1014 self.report_extraction(video_id)
1015 # We try first by looking the javascript code:
1016 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
1017 if mobj is not None:
1018 info = json.loads(mobj.group('json'))
1021 'url': info[u'downloadUrl'],
1022 'uploader': info[u'username'],
1023 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1024 'title': info[u'title'],
1025 'ext': video_extension,
1026 'thumbnail': info[u'thumbUrl'],
1029 # We try looking in other parts of the webpage
1030 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1031 webpage, u'video URL')
1033 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1035 raise ExtractorError(u'Unable to extract title')
1036 video_title = mobj.group(1).decode('utf-8')
1037 video_uploader = mobj.group(2).decode('utf-8')
1040 'id': video_id.decode('utf-8'),
1041 'url': video_url.decode('utf-8'),
1042 'uploader': video_uploader,
1043 'upload_date': None,
1044 'title': video_title,
1045 'ext': video_extension.decode('utf-8'),
1049 class YahooIE(InfoExtractor):
1050 """Information extractor for screen.yahoo.com."""
1051 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1053 def _real_extract(self, url):
1054 mobj = re.match(self._VALID_URL, url)
1056 raise ExtractorError(u'Invalid URL: %s' % url)
1057 video_id = mobj.group('id')
1058 webpage = self._download_webpage(url, video_id)
1059 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1062 # TODO: Check which url parameters are required
1063 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1064 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1065 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1066 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1067 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1068 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1070 self.report_extraction(video_id)
1071 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1073 raise ExtractorError(u'Unable to extract video info')
1074 video_title = m_info.group('title')
1075 video_description = m_info.group('description')
1076 video_thumb = m_info.group('thumb')
1077 video_date = m_info.group('date')
1078 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1080 # TODO: Find a way to get mp4 videos
1081 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1082 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1083 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1084 video_url = m_rest.group('url')
1085 video_path = m_rest.group('path')
1087 raise ExtractorError(u'Unable to extract video url')
1089 else: # We have to use a different method if another id is defined
1090 long_id = m_id.group('new_id')
1091 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1092 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1093 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1094 info = json.loads(json_str)
1095 res = info[u'query'][u'results'][u'mediaObj'][0]
1096 stream = res[u'streams'][0]
1097 video_path = stream[u'path']
1098 video_url = stream[u'host']
1100 video_title = meta[u'title']
1101 video_description = meta[u'description']
1102 video_thumb = meta[u'thumbnail']
1103 video_date = None # I can't find it
1108 'play_path': video_path,
1109 'title':video_title,
1110 'description': video_description,
1111 'thumbnail': video_thumb,
1112 'upload_date': video_date,
1117 class VimeoIE(InfoExtractor):
1118 """Information extractor for vimeo.com."""
1120 # _VALID_URL matches Vimeo URLs
1121 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1124 def _real_extract(self, url, new_video=True):
1125 # Extract ID from URL
1126 mobj = re.match(self._VALID_URL, url)
1128 raise ExtractorError(u'Invalid URL: %s' % url)
1130 video_id = mobj.group('id')
1131 if not mobj.group('proto'):
1132 url = 'https://' + url
1133 if mobj.group('direct_link') or mobj.group('pro'):
1134 url = 'https://vimeo.com/' + video_id
1136 # Retrieve video webpage to extract further information
1137 request = compat_urllib_request.Request(url, None, std_headers)
1138 webpage = self._download_webpage(request, video_id)
1140 # Now we begin extracting as much information as we can from what we
1141 # retrieved. First we extract the information common to all extractors,
1142 # and latter we extract those that are Vimeo specific.
1143 self.report_extraction(video_id)
1145 # Extract the config JSON
1147 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1148 config = json.loads(config)
1150 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1151 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1153 raise ExtractorError(u'Unable to extract info section')
1156 video_title = config["video"]["title"]
1158 # Extract uploader and uploader_id
1159 video_uploader = config["video"]["owner"]["name"]
1160 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1162 # Extract video thumbnail
1163 video_thumbnail = config["video"]["thumbnail"]
1165 # Extract video description
1166 video_description = get_element_by_attribute("itemprop", "description", webpage)
1167 if video_description: video_description = clean_html(video_description)
1168 else: video_description = u''
1170 # Extract upload date
1171 video_upload_date = None
1172 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1173 if mobj is not None:
1174 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1176 # Vimeo specific: extract request signature and timestamp
1177 sig = config['request']['signature']
1178 timestamp = config['request']['timestamp']
1180 # Vimeo specific: extract video codec and quality information
1181 # First consider quality, then codecs, then take everything
1182 # TODO bind to format param
1183 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1184 files = { 'hd': [], 'sd': [], 'other': []}
1185 for codec_name, codec_extension in codecs:
1186 if codec_name in config["video"]["files"]:
1187 if 'hd' in config["video"]["files"][codec_name]:
1188 files['hd'].append((codec_name, codec_extension, 'hd'))
1189 elif 'sd' in config["video"]["files"][codec_name]:
1190 files['sd'].append((codec_name, codec_extension, 'sd'))
1192 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1194 for quality in ('hd', 'sd', 'other'):
1195 if len(files[quality]) > 0:
1196 video_quality = files[quality][0][2]
1197 video_codec = files[quality][0][0]
1198 video_extension = files[quality][0][1]
1199 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1202 raise ExtractorError(u'No known codec found')
1204 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1205 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1210 'uploader': video_uploader,
1211 'uploader_id': video_uploader_id,
1212 'upload_date': video_upload_date,
1213 'title': video_title,
1214 'ext': video_extension,
1215 'thumbnail': video_thumbnail,
1216 'description': video_description,
1220 class ArteTvIE(InfoExtractor):
1221 """arte.tv information extractor."""
1223 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1224 _LIVE_URL = r'index-[0-9]+\.html$'
1226 IE_NAME = u'arte.tv'
1228 def fetch_webpage(self, url):
1229 request = compat_urllib_request.Request(url)
1231 self.report_download_webpage(url)
1232 webpage = compat_urllib_request.urlopen(request).read()
1233 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1234 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1235 except ValueError as err:
1236 raise ExtractorError(u'Invalid URL: %s' % url)
1239 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1240 page = self.fetch_webpage(url)
1241 mobj = re.search(regex, page, regexFlags)
1245 raise ExtractorError(u'Invalid URL: %s' % url)
1247 for (i, key, err) in matchTuples:
1248 if mobj.group(i) is None:
1249 raise ExtractorError(err)
1251 info[key] = mobj.group(i)
1255 def extractLiveStream(self, url):
1256 video_lang = url.split('/')[-4]
1257 info = self.grep_webpage(
1259 r'src="(.*?/videothek_js.*?\.js)',
1262 (1, 'url', u'Invalid URL: %s' % url)
1265 http_host = url.split('/')[2]
1266 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1267 info = self.grep_webpage(
1269 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1270 '(http://.*?\.swf).*?' +
1274 (1, 'path', u'could not extract video path: %s' % url),
1275 (2, 'player', u'could not extract video player: %s' % url),
1276 (3, 'url', u'could not extract video url: %s' % url)
1279 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1281 def extractPlus7Stream(self, url):
1282 video_lang = url.split('/')[-3]
1283 info = self.grep_webpage(
1285 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1288 (1, 'url', u'Invalid URL: %s' % url)
1291 next_url = compat_urllib_parse.unquote(info.get('url'))
1292 info = self.grep_webpage(
1294 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1297 (1, 'url', u'Could not find <video> tag: %s' % url)
1300 next_url = compat_urllib_parse.unquote(info.get('url'))
1302 info = self.grep_webpage(
1304 r'<video id="(.*?)".*?>.*?' +
1305 '<name>(.*?)</name>.*?' +
1306 '<dateVideo>(.*?)</dateVideo>.*?' +
1307 '<url quality="hd">(.*?)</url>',
1310 (1, 'id', u'could not extract video id: %s' % url),
1311 (2, 'title', u'could not extract video title: %s' % url),
1312 (3, 'date', u'could not extract video date: %s' % url),
1313 (4, 'url', u'could not extract video url: %s' % url)
1318 'id': info.get('id'),
1319 'url': compat_urllib_parse.unquote(info.get('url')),
1320 'uploader': u'arte.tv',
1321 'upload_date': unified_strdate(info.get('date')),
1322 'title': info.get('title').decode('utf-8'),
1328 def _real_extract(self, url):
1329 video_id = url.split('/')[-1]
1330 self.report_extraction(video_id)
1332 if re.search(self._LIVE_URL, video_id) is not None:
1333 self.extractLiveStream(url)
1336 info = self.extractPlus7Stream(url)
1341 class GenericIE(InfoExtractor):
1342 """Generic last-resort information extractor."""
1345 IE_NAME = u'generic'
1347 def report_download_webpage(self, video_id):
1348 """Report webpage download."""
1349 if not self._downloader.params.get('test', False):
1350 self._downloader.report_warning(u'Falling back on generic information extractor.')
1351 super(GenericIE, self).report_download_webpage(video_id)
1353 def report_following_redirect(self, new_url):
1354 """Report information extraction."""
1355 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1357 def _test_redirect(self, url):
1358 """Check if it is a redirect, like url shorteners, in case return the new url."""
1359 class HeadRequest(compat_urllib_request.Request):
1360 def get_method(self):
1363 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1365 Subclass the HTTPRedirectHandler to make it use our
1366 HeadRequest also on the redirected URL
1368 def redirect_request(self, req, fp, code, msg, headers, newurl):
1369 if code in (301, 302, 303, 307):
1370 newurl = newurl.replace(' ', '%20')
1371 newheaders = dict((k,v) for k,v in req.headers.items()
1372 if k.lower() not in ("content-length", "content-type"))
1373 return HeadRequest(newurl,
1375 origin_req_host=req.get_origin_req_host(),
1378 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1380 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1382 Fallback to GET if HEAD is not allowed (405 HTTP error)
1384 def http_error_405(self, req, fp, code, msg, headers):
1388 newheaders = dict((k,v) for k,v in req.headers.items()
1389 if k.lower() not in ("content-length", "content-type"))
1390 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1392 origin_req_host=req.get_origin_req_host(),
1396 opener = compat_urllib_request.OpenerDirector()
1397 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1398 HTTPMethodFallback, HEADRedirectHandler,
1399 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1400 opener.add_handler(handler())
1402 response = opener.open(HeadRequest(url))
1403 if response is None:
1404 raise ExtractorError(u'Invalid URL protocol')
1405 new_url = response.geturl()
1410 self.report_following_redirect(new_url)
1413 def _real_extract(self, url):
1414 new_url = self._test_redirect(url)
1415 if new_url: return [self.url_result(new_url)]
1417 video_id = url.split('/')[-1]
1419 webpage = self._download_webpage(url, video_id)
1420 except ValueError as err:
1421 # since this is the last-resort InfoExtractor, if
1422 # this error is thrown, it'll be thrown here
1423 raise ExtractorError(u'Invalid URL: %s' % url)
1425 self.report_extraction(video_id)
1426 # Start with something easy: JW Player in SWFObject
1427 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1429 # Broaden the search a little bit
1430 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1432 # Broaden the search a little bit: JWPlayer JS loader
1433 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1435 # Try to find twitter cards info
1436 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1438 raise ExtractorError(u'Invalid URL: %s' % url)
1440 # It's possible that one of the regexes
1441 # matched, but returned an empty group:
1442 if mobj.group(1) is None:
1443 raise ExtractorError(u'Invalid URL: %s' % url)
1445 video_url = compat_urllib_parse.unquote(mobj.group(1))
1446 video_id = os.path.basename(video_url)
1448 # here's a fun little line of code for you:
1449 video_extension = os.path.splitext(video_id)[1][1:]
1450 video_id = os.path.splitext(video_id)[0]
1452 # it's tempting to parse this further, but you would
1453 # have to take into account all the variations like
1454 # Video Title - Site Name
1455 # Site Name | Video Title
1456 # Video Title - Tagline | Site Name
1457 # and so on and so forth; it's just not practical
1458 video_title = self._html_search_regex(r'<title>(.*)</title>',
1459 webpage, u'video title')
1461 # video uploader is domain name
1462 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1463 url, u'video uploader')
1468 'uploader': video_uploader,
1469 'upload_date': None,
1470 'title': video_title,
1471 'ext': video_extension,
1475 class YoutubeSearchIE(SearchInfoExtractor):
1476 """Information Extractor for YouTube search queries."""
1477 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1479 IE_NAME = u'youtube:search'
1480 _SEARCH_KEY = 'ytsearch'
1482 def report_download_page(self, query, pagenum):
1483 """Report attempt to download search page with given number."""
1484 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1486 def _get_n_results(self, query, n):
1487 """Get a specified number of results for a query"""
1493 while (50 * pagenum) < limit:
1494 self.report_download_page(query, pagenum+1)
1495 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1496 request = compat_urllib_request.Request(result_url)
1498 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1499 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1500 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1501 api_response = json.loads(data)['data']
1503 if not 'items' in api_response:
1504 raise ExtractorError(u'[youtube] No video results')
1506 new_ids = list(video['id'] for video in api_response['items'])
1507 video_ids += new_ids
1509 limit = min(n, api_response['totalItems'])
1512 if len(video_ids) > n:
1513 video_ids = video_ids[:n]
1514 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1515 return self.playlist_result(videos, query)
1518 class GoogleSearchIE(SearchInfoExtractor):
1519 """Information Extractor for Google Video search queries."""
1520 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1522 IE_NAME = u'video.google:search'
1523 _SEARCH_KEY = 'gvsearch'
1525 def _get_n_results(self, query, n):
1526 """Get a specified number of results for a query"""
1529 '_type': 'playlist',
1534 for pagenum in itertools.count(1):
1535 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1536 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1537 note='Downloading result page ' + str(pagenum))
1539 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1542 'url': mobj.group(1)
1544 res['entries'].append(e)
1546 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1549 class YahooSearchIE(SearchInfoExtractor):
1550 """Information Extractor for Yahoo! Video search queries."""
1553 IE_NAME = u'screen.yahoo:search'
1554 _SEARCH_KEY = 'yvsearch'
1556 def _get_n_results(self, query, n):
1557 """Get a specified number of results for a query"""
1560 '_type': 'playlist',
1564 for pagenum in itertools.count(0):
1565 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1566 webpage = self._download_webpage(result_url, query,
1567 note='Downloading results page '+str(pagenum+1))
1568 info = json.loads(webpage)
1570 results = info[u'results']
1572 for (i, r) in enumerate(results):
1573 if (pagenum * 30) +i >= n:
1575 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1576 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1577 res['entries'].append(e)
1578 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1584 class YoutubePlaylistIE(InfoExtractor):
1585 """Information Extractor for YouTube playlists."""
1587 _VALID_URL = r"""(?:
1592 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1593 \? (?:.*?&)*? (?:p|a|list)=
1596 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1599 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1601 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1603 IE_NAME = u'youtube:playlist'
1606 def suitable(cls, url):
1607 """Receives a URL and returns True if suitable for this IE."""
1608 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1610 def _real_extract(self, url):
1611 # Extract playlist id
1612 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1614 raise ExtractorError(u'Invalid URL: %s' % url)
1616 # Download playlist videos from API
1617 playlist_id = mobj.group(1) or mobj.group(2)
1622 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1623 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1626 response = json.loads(page)
1627 except ValueError as err:
1628 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1630 if 'feed' not in response:
1631 raise ExtractorError(u'Got a malformed response from YouTube API')
1632 playlist_title = response['feed']['title']['$t']
1633 if 'entry' not in response['feed']:
1634 # Number of videos is a multiple of self._MAX_RESULTS
1637 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1638 for entry in response['feed']['entry']
1639 if 'content' in entry ]
1641 if len(response['feed']['entry']) < self._MAX_RESULTS:
1645 videos = [v[1] for v in sorted(videos)]
1647 url_results = [self.url_result(url, 'Youtube') for url in videos]
1648 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1651 class YoutubeChannelIE(InfoExtractor):
1652 """Information Extractor for YouTube channels."""
1654 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1655 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1656 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1657 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1658 IE_NAME = u'youtube:channel'
1660 def extract_videos_from_page(self, page):
1662 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1663 if mobj.group(1) not in ids_in_page:
1664 ids_in_page.append(mobj.group(1))
1667 def _real_extract(self, url):
1668 # Extract channel id
1669 mobj = re.match(self._VALID_URL, url)
1671 raise ExtractorError(u'Invalid URL: %s' % url)
1673 # Download channel page
1674 channel_id = mobj.group(1)
1678 url = self._TEMPLATE_URL % (channel_id, pagenum)
1679 page = self._download_webpage(url, channel_id,
1680 u'Downloading page #%s' % pagenum)
1682 # Extract video identifiers
1683 ids_in_page = self.extract_videos_from_page(page)
1684 video_ids.extend(ids_in_page)
1686 # Download any subsequent channel pages using the json-based channel_ajax query
1687 if self._MORE_PAGES_INDICATOR in page:
1689 pagenum = pagenum + 1
1691 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1692 page = self._download_webpage(url, channel_id,
1693 u'Downloading page #%s' % pagenum)
1695 page = json.loads(page)
1697 ids_in_page = self.extract_videos_from_page(page['content_html'])
1698 video_ids.extend(ids_in_page)
1700 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1703 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1705 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1706 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1707 return [self.playlist_result(url_entries, channel_id)]
1710 class YoutubeUserIE(InfoExtractor):
1711 """Information Extractor for YouTube users."""
1713 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1714 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1715 _GDATA_PAGE_SIZE = 50
1716 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1717 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1718 IE_NAME = u'youtube:user'
1720 def _real_extract(self, url):
1722 mobj = re.match(self._VALID_URL, url)
1724 raise ExtractorError(u'Invalid URL: %s' % url)
1726 username = mobj.group(1)
1728 # Download video ids using YouTube Data API. Result size per
1729 # query is limited (currently to 50 videos) so we need to query
1730 # page by page until there are no video ids - it means we got
1737 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1739 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1740 page = self._download_webpage(gdata_url, username,
1741 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1743 # Extract video identifiers
1746 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1747 if mobj.group(1) not in ids_in_page:
1748 ids_in_page.append(mobj.group(1))
1750 video_ids.extend(ids_in_page)
1752 # A little optimization - if current page is not
1753 # "full", ie. does not contain PAGE_SIZE video ids then
1754 # we can assume that this page is the last one - there
1755 # are no more ids on further pages - no need to query
1758 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1763 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1764 url_results = [self.url_result(url, 'Youtube') for url in urls]
1765 return [self.playlist_result(url_results, playlist_title = username)]
1768 class BlipTVUserIE(InfoExtractor):
1769 """Information Extractor for blip.tv users."""
1771 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1773 IE_NAME = u'blip.tv:user'
1775 def _real_extract(self, url):
1777 mobj = re.match(self._VALID_URL, url)
1779 raise ExtractorError(u'Invalid URL: %s' % url)
1781 username = mobj.group(1)
1783 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1785 page = self._download_webpage(url, username, u'Downloading user page')
1786 mobj = re.search(r'data-users-id="([^"]+)"', page)
1787 page_base = page_base % mobj.group(1)
1790 # Download video ids using BlipTV Ajax calls. Result size per
1791 # query is limited (currently to 12 videos) so we need to query
1792 # page by page until there are no video ids - it means we got
1799 url = page_base + "&page=" + str(pagenum)
1800 page = self._download_webpage(url, username,
1801 u'Downloading video ids from page %d' % pagenum)
1803 # Extract video identifiers
1806 for mobj in re.finditer(r'href="/([^"]+)"', page):
1807 if mobj.group(1) not in ids_in_page:
1808 ids_in_page.append(unescapeHTML(mobj.group(1)))
1810 video_ids.extend(ids_in_page)
1812 # A little optimization - if current page is not
1813 # "full", ie. does not contain PAGE_SIZE video ids then
1814 # we can assume that this page is the last one - there
1815 # are no more ids on further pages - no need to query
1818 if len(ids_in_page) < self._PAGE_SIZE:
1823 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1824 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1825 return [self.playlist_result(url_entries, playlist_title = username)]
1828 class DepositFilesIE(InfoExtractor):
1829 """Information extractor for depositfiles.com"""
1831 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1833 def _real_extract(self, url):
1834 file_id = url.split('/')[-1]
1835 # Rebuild url in english locale
1836 url = 'http://depositfiles.com/en/files/' + file_id
1838 # Retrieve file webpage with 'Free download' button pressed
1839 free_download_indication = { 'gateway_result' : '1' }
1840 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1842 self.report_download_webpage(file_id)
1843 webpage = compat_urllib_request.urlopen(request).read()
1844 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1845 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1847 # Search for the real file URL
1848 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1849 if (mobj is None) or (mobj.group(1) is None):
1850 # Try to figure out reason of the error.
1851 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1852 if (mobj is not None) and (mobj.group(1) is not None):
1853 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1854 raise ExtractorError(u'%s' % restriction_message)
1856 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1858 file_url = mobj.group(1)
1859 file_extension = os.path.splitext(file_url)[1][1:]
1861 # Search for file title
1862 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1865 'id': file_id.decode('utf-8'),
1866 'url': file_url.decode('utf-8'),
1868 'upload_date': None,
1869 'title': file_title,
1870 'ext': file_extension.decode('utf-8'),
1874 class FacebookIE(InfoExtractor):
1875 """Information Extractor for Facebook"""
1877 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1878 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1879 _NETRC_MACHINE = 'facebook'
1880 IE_NAME = u'facebook'
1882 def report_login(self):
1883 """Report attempt to log in."""
1884 self.to_screen(u'Logging in')
1886 def _real_initialize(self):
1887 if self._downloader is None:
1892 downloader_params = self._downloader.params
1894 # Attempt to use provided username and password or .netrc data
1895 if downloader_params.get('username', None) is not None:
1896 useremail = downloader_params['username']
1897 password = downloader_params['password']
1898 elif downloader_params.get('usenetrc', False):
1900 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1901 if info is not None:
1905 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1906 except (IOError, netrc.NetrcParseError) as err:
1907 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1910 if useremail is None:
1919 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1922 login_results = compat_urllib_request.urlopen(request).read()
1923 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1924 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1926 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1927 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1930 def _real_extract(self, url):
1931 mobj = re.match(self._VALID_URL, url)
1933 raise ExtractorError(u'Invalid URL: %s' % url)
1934 video_id = mobj.group('ID')
1936 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1937 webpage = self._download_webpage(url, video_id)
1939 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1940 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1941 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1943 raise ExtractorError(u'Cannot parse data')
1944 data = dict(json.loads(m.group(1)))
1945 params_raw = compat_urllib_parse.unquote(data['params'])
1946 params = json.loads(params_raw)
1947 video_data = params['video_data'][0]
1948 video_url = video_data.get('hd_src')
1950 video_url = video_data['sd_src']
1952 raise ExtractorError(u'Cannot find video URL')
1953 video_duration = int(video_data['video_duration'])
1954 thumbnail = video_data['thumbnail_src']
1956 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1961 'title': video_title,
1964 'duration': video_duration,
1965 'thumbnail': thumbnail,
# NOTE(review): this excerpt is elided — several interior lines (try/else
# branches, the dict-literal openers around the 'upload_date'/'id' entries,
# and the branch computing `cchar`) are missing here. Comments below only
# describe what the visible lines demonstrate; confirm against full source.
1970 class BlipTVIE(InfoExtractor):
1971 """Information extractor for blip.tv"""
# Matches regular pages, /play/ embed URLs, and api.swf fragment URLs.
1973 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Used to pull the filename extension off the final media URL.
1974 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1975 IE_NAME = u'blip.tv'
1977 def report_direct_download(self, title):
1978 """Report information extraction."""
1979 self.to_screen(u'%s: Direct download detected' % title)
1981 def _real_extract(self, url):
1982 mobj = re.match(self._VALID_URL, url)
1984 raise ExtractorError(u'Invalid URL: %s' % url)
1986 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf#<id> URLs are rewritten to the equivalent /play/ URL first.
1987 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1988 if api_mobj is not None:
1989 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1990 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id is recovered from the redirect
# target's fragment ('file' query key), then extraction recurses on the
# canonical http://blip.tv/a/a-<file_id> URL.
1991 if urlp.path.startswith('/play/'):
1992 request = compat_urllib_request.Request(url)
1993 response = compat_urllib_request.urlopen(request)
1994 redirecturl = response.geturl()
1995 rurlp = compat_urllib_parse_urlparse(redirecturl)
1996 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1997 url = 'http://blip.tv/a/a-' + file_id
1998 return self._real_extract(url)
# Asks blip.tv for a JSON rendition of the page (iTunes UA is required
# by the site, per the header below). `cchar` ('?' or '&') is computed
# on a line elided from this excerpt.
2005 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2006 request = compat_urllib_request.Request(json_url)
2007 request.add_header('User-Agent', 'iTunes/10.6.1')
2008 self.report_extraction(mobj.group(1))
2011 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself, synthesize the info dict
# from the URL basename instead of parsing JSON.
2012 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2013 basename = url.split('/')[-1]
2014 title,ext = os.path.splitext(basename)
2015 title = title.decode('UTF-8')
2016 ext = ext.replace('.', '')
2017 self.report_direct_download(title)
2022 'upload_date': None,
2027 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2028 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2029 if info is None: # Regular URL
# Read and decode the JSON body; network errors become ExtractorError.
2031 json_code_bytes = urlh.read()
2032 json_code = json_code_bytes.decode('utf-8')
2033 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2034 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2037 json_data = json.loads(json_code)
# The payload may be wrapped in a 'Post' envelope.
2038 if 'Post' in json_data:
2039 data = json_data['Post']
# datestamp like '12-31-13 11:59PM' is normalized to YYYYMMDD.
2043 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2044 video_url = data['media']['url']
2045 umobj = re.match(self._URL_EXT, video_url)
2047 raise ValueError('Can not determine filename extension')
2048 ext = umobj.group(1)
2051 'id': data['item_id'],
2053 'uploader': data['display_name'],
2054 'upload_date': upload_date,
2055 'title': data['title'],
2057 'format': data['media']['mimeType'],
2058 'thumbnail': data['thumbnailUrl'],
2059 'description': data['description'],
2060 'player_url': data['embedUrl'],
# Downstream HTTP fetches must reuse the same UA the JSON API required.
2061 'user_agent': 'iTunes/10.6.1',
2063 except (ValueError,KeyError) as err:
2064 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): excerpt is elided — initializations of `x`, `y`, `out`, the
# GK constant's surrounding assignment, `params`, `video_filepath`, and
# several branch/else lines are missing. Comments describe only the visible
# lines; verify against the full source.
2069 class MyVideoIE(InfoExtractor):
2070 """Information Extractor for myvideo.de."""
2072 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2073 IE_NAME = u'myvideo'
2075 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2076 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2077 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher: key-scheduling loop below, then the PRGA loop that
# XORs each byte of `data`. Used to decrypt the player XML payload.
2078 def __rc4crypt(self,data, key):
2080 box = list(range(256))
2081 for i in list(range(256)):
2082 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2083 box[i], box[x] = box[x], box[i]
2089 y = (y + box[x]) % 256
2090 box[x], box[y] = box[y], box[x]
2091 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# md5 hex digest as bytes — helper for building the RC4 key (`sk` below).
2095 return hashlib.md5(s).hexdigest().encode()
2097 def _real_extract(self,url):
2098 mobj = re.match(self._VALID_URL, url)
2100 raise ExtractorError(u'invalid URL: %s' % url)
2102 video_id = mobj.group(1)
# Double-base64-encoded site key material (decoded at the `sk` build below).
2105 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2106 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2107 b'TnpsbA0KTVRkbU1tSTRNdz09'
2111 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2112 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: a plain <source src=...> tag means a directly downloadable flv.
2114 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2115 if mobj is not None:
2116 self.report_extraction(video_id)
2117 video_url = mobj.group(1) + '.flv'
2119 video_title = self._html_search_regex('<title>([^<]+)</title>',
2122 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2128 'upload_date': None,
2129 'title': video_title,
# Otherwise parse the flashvars blob to locate the encrypted player XML.
2134 mobj = re.search('var flashvars={(.+?)}', webpage)
2136 raise ExtractorError(u'Unable to extract video')
2141 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2142 if not a == '_encxml':
2145 encxml = compat_urllib_parse.unquote(b)
2146 if not params.get('domain'):
2147 params['domain'] = 'www.myvideo.de'
2148 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV flavor of the player is avoided by re-requesting the generic one.
2149 if 'flash_playertype=MTV' in xmldata_url:
2150 self._downloader.report_warning(u'avoiding MTV player')
2152 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2153 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Payload arrives hex-encoded after an '='; decrypt with RC4 using the
# md5-derived key `sk` (built from the decoded GK constant + video id).
2157 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2158 enc_data_b = binascii.unhexlify(enc_data)
2160 base64.b64decode(base64.b64decode(GK)) +
2162 str(video_id).encode('utf-8')
2165 dec_data = self.__rc4crypt(enc_data_b, sk)
2168 self.report_extraction(video_id)
# rtmp case: connectionurl present in the decrypted data.
2171 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2173 video_url = compat_urllib_parse.unquote(mobj.group(1))
2174 if 'myvideo2flash' in video_url:
2175 self._downloader.report_warning(u'forcing RTMPT ...')
2176 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2179 # extract non rtmp videos
2180 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2182 raise ExtractorError(u'unable to extract url')
2183 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2185 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2186 video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m files get an rtmp-style play path 'ext:path'; f4m manifests are
# rewritten to their .m3u8 HLS equivalent instead.
2188 if not video_file.endswith('f4m'):
2189 ppath, prefix = video_file.split('.')
2190 video_playpath = '%s:%s' % (prefix, ppath)
2191 video_hls_playlist = ''
2194 video_hls_playlist = (
2195 video_filepath + video_file
2196 ).replace('.f4m', '.m3u8')
2198 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2199 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2201 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2207 'tc_url': video_url,
2209 'upload_date': None,
2210 'title': video_title,
2212 'play_path': video_playpath,
2213 'video_file': video_file,
2214 'video_hls_playlist': video_hls_playlist,
2215 'player_url': video_swfobj,
# NOTE(review): excerpt is elided — the bodies of _video_extensions /
# _video_dimensions, several else branches, the `turls` accumulation, the
# format-selection loop over `sd_formats`, and the final info dict opener
# are missing. Comments describe only what the visible lines show.
2219 class ComedyCentralIE(InfoExtractor):
2220 """Information extractor for The Daily Show and Colbert Report """
2222 # urls can be abbreviations like :thedailyshow or :colbert
2223 # urls for episodes like:
2224 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2225 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2226 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex; matched with re.VERBOSE everywhere (see suitable()).
2227 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2228 |(https?://)?(www\.)?
2229 (?P<showname>thedailyshow|colbertnation)\.com/
2230 (full-episodes/(?P<episode>.*)|
2232 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2233 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending preference order; turls[-1] below is the highest.
2236 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2238 _video_extensions = {
2246 _video_dimensions = {
# Overrides the base class because this _VALID_URL needs re.VERBOSE.
2256 def suitable(cls, url):
2257 """Receives a URL and returns True if suitable for this IE."""
2258 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2260 def _print_formats(self, formats):
2261 print('Available formats:')
2263 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2266 def _real_extract(self, url):
2267 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2269 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname forms (':tds', ':colbert', ...) expand to the show's
# full-episodes page, then the URL is re-matched.
2271 if mobj.group('shortname'):
2272 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2273 url = u'http://www.thedailyshow.com/full-episodes/'
2275 url = u'http://www.colbertnation.com/full-episodes/'
2276 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2277 assert mobj is not None
# Clip URLs carry their title in different groups per show.
2279 if mobj.group('clip'):
2280 if mobj.group('showname') == 'thedailyshow':
2281 epTitle = mobj.group('tdstitle')
2283 epTitle = mobj.group('cntitle')
2286 dlNewest = not mobj.group('episode')
2288 epTitle = mobj.group('showname')
2290 epTitle = mobj.group('episode')
2292 self.report_extraction(epTitle)
2293 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# A full-episodes page redirects to the newest episode; follow the
# redirect target and insist it names a concrete episode.
2295 url = htmlHandle.geturl()
2296 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2298 raise ExtractorError(u'Invalid redirected URL: ' + url)
2299 if mobj.group('episode') == '':
2300 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2301 epTitle = mobj.group('episode')
# Locate the mtvnservices media URIs embedded in the page.
2303 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2305 if len(mMovieParams) == 0:
2306 # The Colbert Report embeds the information in a without
2307 # a URL prefix; so extract the alternate reference
2308 # and then add the URL prefix manually.
2310 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2311 if len(altMovieParams) == 0:
2312 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2314 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Fetch the MRSS index of the episode's parts.
2316 uri = mMovieParams[0][1]
2317 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2318 indexXml = self._download_webpage(indexUrl, epTitle,
2319 u'Downloading show index',
2320 u'unable to download episode index')
2324 idoc = xml.etree.ElementTree.fromstring(indexXml)
2325 itemEls = idoc.findall('.//item')
# One iteration per episode part; each yields its own info dict.
2326 for partNum,itemEl in enumerate(itemEls):
2327 mediaId = itemEl.findall('./guid')[0].text
2328 shortMediaId = mediaId.split(':')[-1]
2329 showId = mediaId.split(':')[-2].replace('.com', '')
2330 officialTitle = itemEl.findall('./title')[0].text
2331 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Per-part config XML lists the available renditions (bitrate, rtmp src).
2333 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2334 compat_urllib_parse.urlencode({'uri': mediaId}))
2335 configXml = self._download_webpage(configUrl, epTitle,
2336 u'Downloading configuration for %s' % shortMediaId)
2338 cdoc = xml.etree.ElementTree.fromstring(configXml)
2340 for rendition in cdoc.findall('.//rendition'):
2341 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2345 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2348 if self._downloader.params.get('listformats', None):
2349 self._print_formats([i[0] for i in turls])
2352 # For now, just pick the highest bitrate
2353 format,rtmp_video_url = turls[-1]
2355 # Get the format arg from the arg stream
2356 req_format = self._downloader.params.get('format', None)
2358 # Select format if we can find one
2361 format, rtmp_video_url = f, v
# The rtmp URL is rewritten to a plain HTTP mp4 URL on the llnwd CDN.
2364 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2366 raise ExtractorError(u'Cannot transform RTMP url')
2367 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2368 video_url = base + m.group('finalid')
2370 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2375 'upload_date': officialDate,
2380 'description': officialTitle,
2382 results.append(info)
# NOTE(review): excerpt is elided — the `if mobj is None:` guard line and
# the final info-dict opener/'id'/'url' entries are not visible here.
2387 class EscapistIE(InfoExtractor):
2388 """Information extractor for The Escapist """
2390 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2391 IE_NAME = u'escapist'
2393 def _real_extract(self, url):
2394 mobj = re.match(self._VALID_URL, url)
2396 raise ExtractorError(u'Invalid URL: %s' % url)
2397 showName = mobj.group('showname')
2398 videoId = mobj.group('episode')
2400 self.report_extraction(videoId)
2401 webpage = self._download_webpage(url, videoId)
# Metadata comes from <meta> tags; description/thumbnail are optional.
2403 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2404 webpage, u'description', fatal=False)
2406 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2407 webpage, u'thumbnail', fatal=False)
2409 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2410 webpage, u'player url')
# Page title is 'Show : Episode'; keep only the episode part.
# (The error label u'player url' here looks copy-pasted from above.)
2412 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2413 webpage, u'player url').split(' : ')[-1]
# The player URL carries a 'config=' query pointing at a JSON-ish config.
2415 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2416 configUrl = compat_urllib_parse.unquote(configUrl)
2418 configJSON = self._download_webpage(configUrl, videoId,
2419 u'Downloading configuration',
2420 u'unable to download configuration')
2422 # Technically, it's JavaScript, not JSON
# Single quotes are swapped for double quotes so json.loads accepts it.
2423 configJSON = configJSON.replace("'", '"')
2426 config = json.loads(configJSON)
2427 except (ValueError,) as err:
2428 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# The media URL lives in the second playlist entry.
2430 playlist = config['playlist']
2431 videoUrl = playlist[1]['url']
2436 'uploader': showName,
2437 'upload_date': None,
2440 'thumbnail': imgUrl,
2441 'description': videoDesc,
2442 'player_url': playerUrl,
# NOTE(review): excerpt is elided — the `info = {...}` opener, try lines,
# IndexError handler for the metadata XML, and the return of the final info
# dict are not visible here.
2448 class CollegeHumorIE(InfoExtractor):
2449 """Information extractor for collegehumor.com"""
2451 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2452 IE_NAME = u'collegehumor'
2454 def report_manifest(self, video_id):
2455 """Report information extraction."""
2456 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2458 def _real_extract(self, url):
2459 mobj = re.match(self._VALID_URL, url)
2461 raise ExtractorError(u'Invalid URL: %s' % url)
2462 video_id = mobj.group('videoid')
2467 'upload_date': None,
2470 self.report_extraction(video_id)
# Step 1: per-video metadata XML (title/description/thumbnail + f4m URL).
2471 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2473 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2475 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2477 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2479 videoNode = mdoc.findall('./video')[0]
2480 info['description'] = videoNode.findall('./description')[0].text
2481 info['title'] = videoNode.findall('./caption')[0].text
2482 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2483 manifest_url = videoNode.findall('./file')[0].text
2485 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: the Adobe HDS (f4m) manifest; hdcore param required by the CDN.
2487 manifest_url += '?hdcore=2.10.3'
2488 self.report_manifest(video_id)
2490 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2492 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2494 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m namespace-qualified lookups; media/@url + id build the fragment URL.
2496 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2497 node_id = media_node.attrib['url']
2498 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2499 except IndexError as err:
2500 raise ExtractorError(u'Invalid manifest file')
# Final URL: scheme://host/z<id minus 2 chars>/<node_id>Seg1-Frag1.
2502 url_pr = compat_urllib_parse_urlparse(manifest_url)
2503 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): excerpt is elided — the `if mobj is None:` guard and the
# info-dict opener/'id'/'url' entries are not visible here.
2511 class XVideosIE(InfoExtractor):
2512 """Information extractor for xvideos.com"""
2513 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2514 IE_NAME = u'xvideos'
2516 def _real_extract(self, url):
2517 mobj = re.match(self._VALID_URL, url)
2519 raise ExtractorError(u'Invalid URL: %s' % url)
2520 video_id = mobj.group(1)
2522 webpage = self._download_webpage(url, video_id)
2524 self.report_extraction(video_id)
# The flv URL is URL-encoded inside a flv_url= flashvar.
2527 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2528 webpage, u'video URL'))
# Page <title> has a ' - XVID...' suffix stripped by the regex.
2531 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2534 # Extract video thumbnail
2535 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2536 webpage, u'thumbnail', fatal=False)
2542 'upload_date': None,
2543 'title': video_title,
2545 'thumbnail': video_thumbnail,
2546 'description': None,
# NOTE(review): excerpt is elided — the `if mobj is None:` guard and the
# return/info-dict opener around the final entries are not visible here.
2552 class SoundcloudIE(InfoExtractor):
2553 """Information extractor for soundcloud.com
2554 To access the media, the uid of the song and a stream token
2555 must be extracted from the page source and the script must make
2556 a request to media.soundcloud.com/crossdomain.xml. Then
2557 the media can be grabbed by requesting from an url composed
2558 of the stream token and uid
# URL shape: soundcloud.com/<uploader>/<track-slug>.
2561 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2562 IE_NAME = u'soundcloud'
2564 def report_resolve(self, video_id):
2565 """Report information extraction."""
2566 self.to_screen(u'%s: Resolving id' % video_id)
2568 def _real_extract(self, url):
2569 mobj = re.match(self._VALID_URL, url)
2571 raise ExtractorError(u'Invalid URL: %s' % url)
2573 # extract uploader (which is in the url)
2574 uploader = mobj.group(1)
2575 # extract simple title (uploader + slug of song title)
2576 slug_title = mobj.group(2)
2577 simple_title = uploader + u'-' + slug_title
2578 full_title = '%s/%s' % (uploader, slug_title)
2580 self.report_resolve(full_title)
# Resolve the human-readable URL to the track's numeric API id.
2582 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2583 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2584 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2586 info = json.loads(info_json)
2587 video_id = info['id']
2588 self.report_extraction(full_title)
# The streams endpoint exposes the 128kbps MP3 stream URL.
2590 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2591 stream_json = self._download_webpage(streams_url, full_title,
2592 u'Downloading stream definitions',
2593 u'unable to download stream definitions')
2595 streams = json.loads(stream_json)
2596 mediaURL = streams['http_mp3_128_url']
2597 upload_date = unified_strdate(info['created_at'])
2602 'uploader': info['user']['username'],
2603 'upload_date': upload_date,
2604 'title': info['title'],
2606 'description': info['description'],
# NOTE(review): excerpt is elided — the `if mobj is None:` guard, the
# results-list setup, and the per-track info-dict opener are not visible.
# Structure parallels SoundcloudIE above but iterates a set's tracks.
2609 class SoundcloudSetIE(InfoExtractor):
2610 """Information extractor for soundcloud.com sets
2611 To access the media, the uid of the song and a stream token
2612 must be extracted from the page source and the script must make
2613 a request to media.soundcloud.com/crossdomain.xml. Then
2614 the media can be grabbed by requesting from an url composed
2615 of the stream token and uid
# URL shape: soundcloud.com/<uploader>/sets/<set-slug>.
2618 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2619 IE_NAME = u'soundcloud:set'
2621 def report_resolve(self, video_id):
2622 """Report information extraction."""
2623 self.to_screen(u'%s: Resolving id' % video_id)
2625 def _real_extract(self, url):
2626 mobj = re.match(self._VALID_URL, url)
2628 raise ExtractorError(u'Invalid URL: %s' % url)
2630 # extract uploader (which is in the url)
2631 uploader = mobj.group(1)
2632 # extract simple title (uploader + slug of song title)
2633 slug_title = mobj.group(2)
2634 simple_title = uploader + u'-' + slug_title
2635 full_title = '%s/sets/%s' % (uploader, slug_title)
2637 self.report_resolve(full_title)
# Resolve the set URL to its API representation (includes 'tracks').
2639 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2640 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2641 info_json = self._download_webpage(resolv_url, full_title)
2644 info = json.loads(info_json)
# API-level errors are reported per entry rather than raised.
2645 if 'errors' in info:
2646 for err in info['errors']:
2647 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2650 self.report_extraction(full_title)
# One streams request per track in the set.
2651 for track in info['tracks']:
2652 video_id = track['id']
2654 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2655 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2657 self.report_extraction(video_id)
2658 streams = json.loads(stream_json)
2659 mediaURL = streams['http_mp3_128_url']
2664 'uploader': track['user']['username'],
2665 'upload_date': unified_strdate(track['created_at']),
2666 'title': track['title'],
2668 'description': track['description'],
# NOTE(review): excerpt is elided — the `if mobj is None:` guards and the
# info-dict opener/'id'/'url' entries are not visible here.
2673 class InfoQIE(InfoExtractor):
2674 """Information extractor for infoq.com"""
2675 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2677 def _real_extract(self, url):
2678 mobj = re.match(self._VALID_URL, url)
2680 raise ExtractorError(u'Invalid URL: %s' % url)
# The full URL doubles as the display id for progress messages.
2682 webpage = self._download_webpage(url, video_id=url)
2683 self.report_extraction(url)
# The real media id is base64-encoded in a JS variable.
2686 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2688 raise ExtractorError(u'Unable to extract video url')
2689 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2690 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2693 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2696 # Extract description
2697 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2698 webpage, u'description', fatal=False)
# id/extension are derived from the media filename in the rtmp URL.
2700 video_filename = video_url.split('/')[-1]
2701 video_id, extension = video_filename.split('.')
2707 'upload_date': None,
2708 'title': video_title,
2709 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2711 'description': video_description,
# NOTE(review): excerpt is elided — try/else lines, the return statements of
# get_urls/check_urls, and the final return around the info dict are not
# visible. Marked _WORKING = False, so this extractor is disabled anyway.
2716 class MixcloudIE(InfoExtractor):
2717 """Information extractor for www.mixcloud.com"""
2719 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2720 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2721 IE_NAME = u'mixcloud'
2723 def report_download_json(self, file_id):
2724 """Report JSON download."""
2725 self.to_screen(u'Downloading json')
2727 def get_urls(self, jsonData, fmt, bitrate='best'):
2728 """Get urls from 'audio_formats' section in json"""
# 'best' (or an unknown bitrate) falls back to the highest available one.
2731 bitrate_list = jsonData[fmt]
2732 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2733 bitrate = max(bitrate_list) # select highest
2735 url_list = jsonData[fmt][bitrate]
# Some formats have no per-bitrate nesting; use the format entry directly.
2736 except TypeError: # we have no bitrate info.
2737 url_list = jsonData[fmt]
2740 def check_urls(self, url_list):
2741 """Returns 1st active url from list"""
# Probe each candidate URL; network errors mean "try the next one".
2742 for url in url_list:
2744 compat_urllib_request.urlopen(url)
2746 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2751 def _print_formats(self, formats):
2752 print('Available formats:')
2753 for fmt in formats.keys():
2754 for b in formats[fmt]:
2756 ext = formats[fmt][b][0]
2757 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2758 except TypeError: # we have no bitrate info
2759 ext = formats[fmt][0]
2760 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2763 def _real_extract(self, url):
2764 mobj = re.match(self._VALID_URL, url)
2766 raise ExtractorError(u'Invalid URL: %s' % url)
2767 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on a str is Python-2-only; this block
# predates the py3 port (consistent with _WORKING = False).
2768 uploader = mobj.group(1).decode('utf-8')
2769 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2771 # construct API request
2772 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2773 # retrieve .json file with links to files
2774 request = compat_urllib_request.Request(file_url)
2776 self.report_download_json(file_url)
2777 jsonData = compat_urllib_request.urlopen(request).read()
2778 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2779 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2782 json_data = json.loads(jsonData)
2783 player_url = json_data['player_swf_url']
2784 formats = dict(json_data['audio_formats'])
2786 req_format = self._downloader.params.get('format', None)
2789 if self._downloader.params.get('listformats', None):
2790 self._print_formats(formats)
# Default/best: take the first format whose URL list yields a live URL.
2793 if req_format is None or req_format == 'best':
2794 for format_param in formats.keys():
2795 url_list = self.get_urls(formats, format_param)
2797 file_url = self.check_urls(url_list)
2798 if file_url is not None:
# Explicit format request: must exist in the API response.
2801 if req_format not in formats:
2802 raise ExtractorError(u'Format is not available')
2804 url_list = self.get_urls(formats, req_format)
2805 file_url = self.check_urls(url_list)
2806 format_param = req_format
2809 'id': file_id.decode('utf-8'),
2810 'url': file_url.decode('utf-8'),
2811 'uploader': uploader.decode('utf-8'),
2812 'upload_date': None,
2813 'title': json_data['name'],
2814 'ext': file_url.split('.')[-1].decode('utf-8'),
2815 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2816 'thumbnail': json_data['thumbnail_url'],
2817 'description': json_data['description'],
2818 'player_url': player_url.decode('utf-8'),
# NOTE(review): excerpt is elided — several info-dict openers, try lines,
# list-comprehension bodies building info['list'], and return statements are
# not visible. Three cases: single video, course page, root page.
2822 class StanfordOpenClassroomIE(InfoExtractor):
2823 """Information extractor for Stanford's Open ClassRoom"""
2824 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2825 IE_NAME = u'stanfordoc'
2827 def _real_extract(self, url):
2828 mobj = re.match(self._VALID_URL, url)
2830 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: course + video → fetch the per-video metadata XML directly.
2832 if mobj.group('course') and mobj.group('video'): # A specific video
2833 course = mobj.group('course')
2834 video = mobj.group('video')
2836 'id': course + '_' + video,
2838 'upload_date': None,
2841 self.report_extraction(info['id'])
2842 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2843 xmlUrl = baseUrl + video + '.xml'
2845 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2846 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2847 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2848 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2850 info['title'] = mdoc.findall('./title')[0].text
2851 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2853 raise ExtractorError(u'Invalid metadata XML file')
2854 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page → collect its VideoPage links as references and
# recursively extract each one.
2856 elif mobj.group('course'): # A course page
2857 course = mobj.group('course')
2862 'upload_date': None,
2865 coursepage = self._download_webpage(url, info['id'],
2866 note='Downloading course info page',
2867 errnote='Unable to download course info page')
2869 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2871 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2872 coursepage, u'description', fatal=False)
2874 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2877 'type': 'reference',
2878 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2882 for entry in info['list']:
2883 assert entry['type'] == 'reference'
2884 results += self.extract(entry['url'])
# Case 3: the root page → collect CoursePage links and recurse per course.
2888 'id': 'Stanford OpenClassroom',
2891 'upload_date': None,
2894 self.report_download_webpage(info['id'])
2895 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2897 rootpage = compat_urllib_request.urlopen(rootURL).read()
2898 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2899 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2901 info['title'] = info['id']
2903 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2906 'type': 'reference',
2907 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2912 for entry in info['list']:
2913 assert entry['type'] == 'reference'
2914 results += self.extract(entry['url'])
# NOTE(review): excerpt is elided — the guards checking song_name/content_id,
# the assignment of `performer`, try lines, and the info-dict opener are not
# visible here.
2917 class MTVIE(InfoExtractor):
2918 """Information extractor for MTV.com"""
2920 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2923 def _real_extract(self, url):
2924 mobj = re.match(self._VALID_URL, url)
2926 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; default to http for the fetch.
2927 if not mobj.group('proto'):
2928 url = 'http://' + url
2929 video_id = mobj.group('videoid')
2931 webpage = self._download_webpage(url, video_id)
# Metadata lives in mtv_* / mtvn_* <meta> tags.
2933 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2934 webpage, u'song name', fatal=False)
2936 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2939 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2940 webpage, u'mtvn_uri', fatal=False)
2942 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2943 webpage, u'content id', fatal=False)
# mediaGen XML enumerates the available renditions for this video.
2945 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2946 self.report_extraction(video_id)
2947 request = compat_urllib_request.Request(videogen_url)
2949 metadataXml = compat_urllib_request.urlopen(request).read()
2950 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2951 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2953 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2954 renditions = mdoc.findall('.//rendition')
2956 # For now, always pick the highest quality.
2957 rendition = renditions[-1]
# Format string: '<ext>-<width>x<height>_<bitrate>' from rendition attrs.
2960 _,_,ext = rendition.attrib['type'].partition('/')
2961 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2962 video_url = rendition.find('./src').text
2964 raise ExtractorError('Invalid rendition field.')
2969 'uploader': performer,
2970 'upload_date': None,
2971 'title': video_title,
# NOTE(review): excerpt is elided — the _gen_sid def line, realId/mixed
# initializations, the format→fileid mapping branches ('hd2', flv/mp4
# selection), ext/format fields of the per-part info dict, and the final
# return are not visible here.
2979 class YoukuIE(InfoExtractor):
2980 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp plus two bounded random ints.
2983 nowTime = int(time.time() * 1000)
2984 random1 = random.randint(1000,1998)
2985 random2 = random.randint(1000,9999)
2987 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the alphabet driven by the server 'seed';
# must match Youku's player algorithm exactly.
2989 def _get_file_ID_mix_string(self, seed):
2991 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2993 for i in range(len(source)):
2994 seed = (seed * 211 + 30031 ) % 65536
2995 index = math.floor(seed / 65536 * len(source) )
2996 mixed.append(source[int(index)])
2997 source.remove(source[int(index)])
2998 #return ''.join(mixed)
# Decode the '*'-separated fileId indices through the mixed alphabet.
3001 def _get_file_id(self, fileId, seed):
3002 mixed = self._get_file_ID_mix_string(seed)
3003 ids = fileId.split('*')
3007 realId.append(mixed[int(ch)])
3008 return ''.join(realId)
3010 def _real_extract(self, url):
3011 mobj = re.match(self._VALID_URL, url)
3013 raise ExtractorError(u'Invalid URL: %s' % url)
3014 video_id = mobj.group('ID')
3016 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3018 jsondata = self._download_webpage(info_url, video_id)
3020 self.report_extraction(video_id)
3022 config = json.loads(jsondata)
3024 video_title = config['data'][0]['title']
3025 seed = config['data'][0]['seed']
3027 format = self._downloader.params.get('format', None)
3028 supported_format = list(config['data'][0]['streamfileids'].keys())
3030 if format is None or format == 'best':
3031 if 'hd2' in supported_format:
3036 elif format == 'worst':
# Per-segment decryption keys come alongside the stream file ids.
3044 fileid = config['data'][0]['streamfileids'][format]
3045 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3046 except (UnicodeDecodeError, ValueError, KeyError):
3047 raise ExtractorError(u'Unable to extract info section')
3050 sid = self._gen_sid()
3051 fileid = self._get_file_id(fileid, seed)
3053 #column 8,9 of fileid represent the segment number
3054 #fileid[7:9] should be changed
# Build one download URL (and info dict) per segment, patching the
# segment index into positions 8-9 of the fileid as hex.
3055 for index, key in enumerate(keys):
3057 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3058 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3061 'id': '%s_part%02d' % (video_id, index),
3062 'url': download_url,
3064 'upload_date': None,
3065 'title': video_title,
3068 files_info.append(info)
3073 class XNXXIE(InfoExtractor):
3074 """Information extractor for xnxx.com"""
3076 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping regexes: video URL comes percent-encoded in a query
# parameter; title and thumbnail are read straight from the HTML.
3078 VIDEO_URL_RE = r'flv_url=(.*?)&'
3079 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3080 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3082 def _real_extract(self, url):
3083 mobj = re.match(self._VALID_URL, url)
3085 raise ExtractorError(u'Invalid URL: %s' % url)
3086 video_id = mobj.group(1)
3088 # Get webpage content
3089 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded in the page, hence the unquote.
3091 video_url = self._search_regex(self.VIDEO_URL_RE,
3092 webpage, u'video URL')
3093 video_url = compat_urllib_parse.unquote(video_url)
3095 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3098 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3099 webpage, u'thumbnail', fatal=False)
3105 'upload_date': None,
3106 'title': video_title,
3108 'thumbnail': video_thumbnail,
3109 'description': None,
3113 class GooglePlusIE(InfoExtractor):
3114 """Information extractor for plus.google.com."""
3116 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3117 IE_NAME = u'plus.google'
3119 def _real_extract(self, url):
3120 # Extract id from URL
3121 mobj = re.match(self._VALID_URL, url)
3123 raise ExtractorError(u'Invalid URL: %s' % url)
3125 post_url = mobj.group(0)
3126 video_id = mobj.group(1)
3128 video_extension = 'flv'
3130 # Step 1, Retrieve post webpage to extract further information
3131 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3133 self.report_extraction(video_id)
3135 # Extract update date
3136 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3137 webpage, u'upload date', fatal=False)
3139 # Convert timestring to a format suitable for filename
# NOTE(review): strptime will raise if upload_date is None (fatal=False
# above can return None) or in an unexpected format — confirm upstream.
3140 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3141 upload_date = upload_date.strftime('%Y%m%d')
3144 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3145 webpage, u'uploader', fatal=False)
3148 # Get the first line for title
3149 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3150 webpage, 'title', default=u'NA')
3152 # Step 2, Stimulate clicking the image box to launch video
3153 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3154 webpage, u'video page URL')
3155 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3157 # Extract video links on video page
3158 """Extract video links of all sizes"""
3159 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3160 mobj = re.findall(pattern, webpage)
3162 raise ExtractorError(u'Unable to extract video links')
3164 # Sort in resolution
3165 links = sorted(mobj)
3167 # Choose the lowest of the sort, i.e. highest resolution
3168 video_url = links[-1]
3169 # Only get the url. The resolution part in the tuple has no use anymore
3170 video_url = video_url[-1]
3171 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError path
# round-trips through bytes to apply unicode-escape decoding.
3173 video_url = video_url.decode("unicode_escape")
3174 except AttributeError: # Python 3
3175 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3181 'uploader': uploader,
3182 'upload_date': upload_date,
3183 'title': video_title,
3184 'ext': video_extension,
3187 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages. The download URL is not
# scraped: it is constructed from the URL path id against Turner's CDN.
3188 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3191 def _real_extract(self, url):
3192 mobj = re.match(self._VALID_URL, url)
3194 raise ExtractorError(u'Invalid URL: %s' % url)
3196 video_id = mobj.group(1)
3198 webpage = self._download_webpage(url, video_id)
# Fixed 1280x720 mp4 rendition on the CDN, keyed by the page path.
3200 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3202 shortened_video_id = video_id.rpartition('/')[2]
3203 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3204 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3206 # It isn't there in the HTML it returns to us
3207 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3209 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3212 'id': shortened_video_id,
3216 # 'uploader_date': uploader_date,
3217 'description': description,
3221 class JustinTVIE(InfoExtractor):
3222 """Information extractor for justin.tv and twitch.tv"""
3223 # TODO: One broadcast may be split into multiple videos. The key
3224 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3225 # starts at 1 and increases. Can we treat all parts as one video?
3227 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3229 (?P<channelid>[^/]+)|
3230 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3231 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when paging through a channel's archive via the API.
3235 _JUSTIN_PAGE_LIMIT = 100
3236 IE_NAME = u'justin.tv'
3238 def report_download_page(self, channel, offset):
3239 """Report attempt to download a single page of videos."""
3240 self.to_screen(u'%s: Downloading video information from %d to %d' %
3241 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3243 # Return count of items, list of *valid* items
3244 def _parse_page(self, url, video_id):
# Download one API page (JSON). A non-list response is an API error
# object whose 'error' field is surfaced to the user.
3245 webpage = self._download_webpage(url, video_id,
3246 u'Downloading video info JSON',
3247 u'unable to download video info JSON')
3249 response = json.loads(webpage)
3250 if type(response) != list:
3251 error_text = response.get('error', 'unknown error')
3252 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3254 for clip in response:
3255 video_url = clip['video_file_url']
3257 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from YYYY-MM-DD -> YYYYMMDD.
3258 video_date = re.sub('-', '', clip['start_time'][:10])
3259 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3260 video_id = clip['id']
3261 video_title = clip.get('title', video_id)
3265 'title': video_title,
3266 'uploader': clip.get('channel_name', video_uploader_id),
3267 'uploader_id': video_uploader_id,
3268 'upload_date': video_date,
3269 'ext': video_extension,
3271 return (len(response), info)
3273 def _real_extract(self, url):
3274 mobj = re.match(self._VALID_URL, url)
3276 raise ExtractorError(u'invalid URL: %s' % url)
3278 api_base = 'http://api.justin.tv'
# Three URL shapes: whole channel archive, single broadcast (/b/),
# or a chapter of a broadcast (/c/).
3280 if mobj.group('channelid'):
3282 video_id = mobj.group('channelid')
3283 api = api_base + '/channel/archives/%s.json' % video_id
3284 elif mobj.group('chapterid'):
3285 chapter_id = mobj.group('chapterid')
3287 webpage = self._download_webpage(url, chapter_id)
3288 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3290 raise ExtractorError(u'Cannot find archive of a chapter')
3291 archive_id = m.group(1)
3293 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3294 chapter_info_xml = self._download_webpage(api, chapter_id,
3295 note=u'Downloading chapter information',
3296 errnote=u'Chapter information download failed')
3297 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3298 for a in doc.findall('.//archive'):
3299 if archive_id == a.find('./id').text:
3302 raise ExtractorError(u'Could not find chapter in chapter information')
3304 video_url = a.find('./video_file_url').text
3305 video_ext = video_url.rpartition('.')[2] or u'flv'
# Chapter title/description/channel come from the newer Twitch
# "kraken" API; the file URL itself came from the XML above.
3307 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3308 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3309 note='Downloading chapter metadata',
3310 errnote='Download of chapter metadata failed')
3311 chapter_info = json.loads(chapter_info_json)
3313 bracket_start = int(doc.find('.//bracket_start').text)
3314 bracket_end = int(doc.find('.//bracket_end').text)
3316 # TODO determine start (and probably fix up file)
3317 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3318 #video_url += u'?start=' + TODO:start_timestamp
3319 # bracket_start is 13290, but we want 51670615
3320 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3321 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3324 'id': u'c' + chapter_id,
3327 'title': chapter_info['title'],
3328 'thumbnail': chapter_info['preview'],
3329 'description': chapter_info['description'],
3330 'uploader': chapter_info['channel']['display_name'],
3331 'uploader_id': chapter_info['channel']['name'],
3335 video_id = mobj.group('videoid')
3336 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3338 self.report_extraction(video_id)
# Page through the archive; a short page (count != limit) or a
# non-paged request terminates the loop.
3342 limit = self._JUSTIN_PAGE_LIMIT
3345 self.report_download_page(video_id, offset)
3346 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3347 page_count, page_info = self._parse_page(page_url, video_id)
3348 info.extend(page_info)
3349 if not paged or page_count != limit:
3354 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com; pulls the <source> URL and
# og: metadata out of the video page HTML.
3355 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3357 def _real_extract(self, url):
3358 mobj = re.match(self._VALID_URL, url)
3360 raise ExtractorError(u'invalid URL: %s' % url)
3362 video_id = mobj.group('id')
3363 webpage = self._download_webpage(url, video_id)
3365 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3366 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are tried in order: player heading, then <title>.
3368 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3369 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3371 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3372 webpage, u'description', fatal=False, flags=re.DOTALL)
3379 'description': video_description,
3383 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com video/app pages.
# Returns a playlist of all movies found on the page.
3384 _VALID_URL = r"""http://store\.steampowered\.com/
3386 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3388 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3390 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Pre-filled birth date query string to pass Steam's age gate.
3391 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3394 def suitable(cls, url):
3395 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with (?x)-style comments
# and must be matched with re.VERBOSE.
3396 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3398 def _real_extract(self, url):
3399 m = re.match(self._VALID_URL, url, re.VERBOSE)
3400 gameID = m.group('gameID')
3402 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3403 webpage = self._download_webpage(videourl, gameID)
3405 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3406 videourl = self._AGECHECK_TEMPLATE % gameID
3407 self.report_age_confirmation()
3408 webpage = self._download_webpage(videourl, gameID)
3410 self.report_extraction(gameID)
3411 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3412 webpage, 'game title')
3414 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3415 mweb = re.finditer(urlRE, webpage)
3416 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3417 titles = re.finditer(namesRE, webpage)
3418 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3419 thumbs = re.finditer(thumbsRE, webpage)
# The three finditer streams are assumed to line up one-to-one per
# movie; zip pairs url/title/thumbnail positionally.
3421 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3422 video_id = vid.group('videoID')
3423 title = vtitle.group('videoName')
3424 video_url = vid.group('videoURL')
3425 video_thumb = thumb.group('thumbnail')
3427 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3432 'title': unescapeHTML(title),
3433 'thumbnail': video_thumb
3436 return [self.playlist_result(videos, gameID, game_title)]
3438 class UstreamIE(InfoExtractor):
# Information extractor for recorded ustream.tv videos; the flv URL is
# derived directly from the numeric video id on their CDN.
3439 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3440 IE_NAME = u'ustream'
3442 def _real_extract(self, url):
3443 m = re.match(self._VALID_URL, url)
3444 video_id = m.group('videoID')
3446 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3447 webpage = self._download_webpage(url, video_id)
3449 self.report_extraction(video_id)
3451 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3454 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3455 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3457 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3458 webpage, u'thumbnail', fatal=False)
3464 'title': video_title,
3465 'uploader': uploader,
3466 'thumbnail': thumbnail,
3470 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com.
3471 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3472 IE_NAME = u'WorldStarHipHop'
3474 def _real_extract(self, url):
3475 m = re.match(self._VALID_URL, url)
3476 video_id = m.group('id')
3478 webpage_src = self._download_webpage(url, video_id)
# The player sets the file via so.addVariable("file", ...).
3480 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3481 webpage_src, u'video URL')
3483 if 'mp4' in video_url:
3488 video_title = self._html_search_regex(r"<title>(.*)</title>",
3489 webpage_src, u'title')
3491 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3492 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3493 webpage_src, u'thumbnail', fatal=False)
# Candy pages carry the real title in a candytitles span instead.
3496 _title = r"""candytitles.*>(.*)</span>"""
3497 mobj = re.search(_title, webpage_src)
3498 if mobj is not None:
3499 video_title = mobj.group(1)
3504 'title' : video_title,
3505 'thumbnail' : thumbnail,
3510 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows; all metadata comes from
# an inline JSON blob (window.gon.show) on the show page.
3511 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3513 def _real_extract(self, url):
3514 m = re.match(self._VALID_URL, url)
3515 video_id = m.group('videoID')
3517 webpage = self._download_webpage(url, video_id)
3519 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3520 webpage, u'json data', flags=re.MULTILINE)
3523 data = json.loads(json_data)
3524 except ValueError as e:
3525 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Request the 256 kbit/s rendition from the Akamai URL.
3527 video_url = data['akamai_url'] + '&cbr=256'
3528 url_parts = compat_urllib_parse_urlparse(video_url)
3529 video_ext = url_parts.path.rpartition('.')[2]
3534 'title': data['title'],
3535 'description': data.get('teaser_text'),
3536 'location': data.get('country_of_origin'),
3537 'uploader': data.get('host', {}).get('name'),
3538 'uploader_id': data.get('host', {}).get('slug'),
3539 'thumbnail': data.get('image', {}).get('large_url_2x'),
3540 'duration': data.get('duration'),
3545 class YouPornIE(InfoExtractor):
3546 """Information extractor for youporn.com."""
3547 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3549 def _print_formats(self, formats):
3550 """Print all available formats"""
3551 print(u'Available formats:')
3552 print(u'ext\t\tformat')
3553 print(u'---------------------------------')
3554 for format in formats:
3555 print(u'%s\t\t%s' % (format['ext'], format['format']))
3557 def _specific(self, req_format, formats):
# Return the single format entry whose 'format' string matches the
# user-requested format (selection helper for --format).
3559 if(x["format"]==req_format):
3563 def _real_extract(self, url):
3564 mobj = re.match(self._VALID_URL, url)
3566 raise ExtractorError(u'Invalid URL: %s' % url)
3567 video_id = mobj.group('videoid')
# The site requires an age_verified cookie before serving the page.
3569 req = compat_urllib_request.Request(url)
3570 req.add_header('Cookie', 'age_verified=1')
3571 webpage = self._download_webpage(req, video_id)
3573 # Get JSON parameters
3574 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3576 params = json.loads(json_params)
3578 raise ExtractorError(u'Invalid JSON')
3580 self.report_extraction(video_id)
3582 video_title = params['title']
3583 upload_date = unified_strdate(params['release_date_f'])
3584 video_description = params['description']
3585 video_uploader = params['submitted_by']
3586 thumbnail = params['thumbnails'][0]['image']
3588 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3590 # Get all of the formats available
3591 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3592 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3593 webpage, u'download list').strip()
3595 # Get all of the links from the page
3596 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3597 links = re.findall(LINK_RE, download_list_html)
3598 if(len(links) == 0):
3599 raise ExtractorError(u'ERROR: no known formats available for video')
3601 self.to_screen(u'Links found: %d' % len(links))
3606 # A link looks like this:
3607 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3608 # A path looks like this:
3609 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3610 video_url = unescapeHTML( link )
3611 path = compat_urllib_parse_urlparse( video_url ).path
3612 extension = os.path.splitext( path )[1][1:]
# Format label is the "<resolution>_<bitrate>" prefix of the fourth
# path component, e.g. "480p_370k" -> "480p-370k".
3613 format = path.split('/')[4].split('_')[:2]
3616 format = "-".join( format )
3617 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3622 'uploader': video_uploader,
3623 'upload_date': upload_date,
3624 'title': video_title,
3627 'thumbnail': thumbnail,
3628 'description': video_description
3631 if self._downloader.params.get('listformats', None):
3632 self._print_formats(formats)
3635 req_format = self._downloader.params.get('format', None)
3636 self.to_screen(u'Format: %s' % req_format)
# 'best' = first entry, 'worst' = last, '-1'/'all' = every format,
# otherwise look up the exact requested format string.
3638 if req_format is None or req_format == 'best':
3640 elif req_format == 'worst':
3641 return [formats[-1]]
3642 elif req_format in ('-1', 'all'):
3645 format = self._specific( req_format, formats )
3647 raise ExtractorError(u'Requested format not available')
3652 class PornotubeIE(InfoExtractor):
3653 """Information extractor for pornotube.com."""
3654 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3656 def _real_extract(self, url):
3657 mobj = re.match(self._VALID_URL, url)
3659 raise ExtractorError(u'Invalid URL: %s' % url)
# Title is taken from the URL slug rather than the page.
3661 video_id = mobj.group('videoid')
3662 video_title = mobj.group('title')
3664 # Get webpage content
3665 webpage = self._download_webpage(url, video_id)
3668 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3669 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3670 video_url = compat_urllib_parse.unquote(video_url)
3672 #Get the uploaded date
3673 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3674 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# Normalise the scraped date to YYYYMMDD only when one was found.
3675 if upload_date: upload_date = unified_strdate(upload_date)
3677 info = {'id': video_id,
3680 'upload_date': upload_date,
3681 'title': video_title,
3687 class YouJizzIE(InfoExtractor):
3688 """Information extractor for youjizz.com."""
3689 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3691 def _real_extract(self, url):
3692 mobj = re.match(self._VALID_URL, url)
3694 raise ExtractorError(u'Invalid URL: %s' % url)
3696 video_id = mobj.group('videoid')
3698 # Get webpage content
3699 webpage = self._download_webpage(url, video_id)
3701 # Get the video title
3702 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3703 webpage, u'title').strip()
3705 # Get the embed page
# The real stream URL lives on a separate embed page; video_id is
# re-read from the embed URL (numeric id, may differ from the slug).
3706 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3708 raise ExtractorError(u'ERROR: unable to extract embed page')
3710 embed_page_url = result.group(0).strip()
3711 video_id = result.group('videoid')
3713 webpage = self._download_webpage(embed_page_url, video_id)
3716 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3717 webpage, u'video URL')
3719 info = {'id': video_id,
3721 'title': video_title,
3724 'player_url': embed_page_url}
3728 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes. Walks the playlist via
# the play/next JSON API until at_last_track is reported.
3730 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3732 def _real_extract(self, url):
3733 mobj = re.match(self._VALID_URL, url)
3735 raise ExtractorError(u'Invalid URL: %s' % url)
3736 playlist_id = mobj.group('id')
3738 webpage = self._download_webpage(url, playlist_id)
3740 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3741 data = json.loads(json_like)
# Random client-side session id, as the site's own player generates.
3743 session = str(random.randint(0, 1000000000))
3745 track_count = data['tracks_count']
3746 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3747 next_url = first_url
3749 for i in itertools.count():
3750 api_json = self._download_webpage(next_url, playlist_id,
3751 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3752 errnote=u'Failed to download song information')
3753 api_data = json.loads(api_json)
3754 track_data = api_data[u'set']['track']
3756 'id': track_data['id'],
3757 'url': track_data['track_file_stream_url'],
3758 'title': track_data['performer'] + u' - ' + track_data['name'],
3759 'raw_title': track_data['name'],
3760 'uploader_id': data['user']['login'],
3764 if api_data['set']['at_last_track']:
3766 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3769 class KeekIE(InfoExtractor):
# Information extractor for keek.com; video and thumbnail URLs are
# constructed directly from the video id on their CDN.
3770 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3773 def _real_extract(self, url):
3774 m = re.match(self._VALID_URL, url)
3775 video_id = m.group('videoID')
3777 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3778 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3779 webpage = self._download_webpage(url, video_id)
3781 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3784 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3785 webpage, u'uploader', fatal=False)
3791 'title': video_title,
3792 'thumbnail': thumbnail,
3793 'uploader': uploader
3797 class TEDIE(InfoExtractor):
# Information extractor for www.ted.com: handles both single talks and
# playlists of talks (playlist entries are delegated back to this IE).
3798 _VALID_URL=r'''http://www\.ted\.com/
3800 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3802 ((?P<type_talk>talks)) # We have a simple talk
3804 (/lang/(.*?))? # The url may contain the language
3805 /(?P<name>\w+) # Here goes the name and then ".html"
3809 def suitable(cls, url):
3810 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL uses verbose-mode comments.
3811 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3813 def _real_extract(self, url):
3814 m=re.match(self._VALID_URL, url, re.VERBOSE)
3815 if m.group('type_talk'):
3816 return [self._talk_info(url)]
3818 playlist_id=m.group('playlist_id')
3819 name=m.group('name')
3820 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3821 return [self._playlist_videos_info(url,name,playlist_id)]
3823 def _playlist_videos_info(self,url,name,playlist_id=0):
3824 '''Returns the videos of the playlist'''
3826 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3827 ([.\s]*?)data-playlist_item_id="(\d+)"
3828 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3830 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3831 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3832 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3833 m_names=re.finditer(video_name_RE,webpage)
3835 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3836 webpage, 'playlist title')
# Each playlist entry becomes a url_result pointing back at this IE.
3838 playlist_entries = []
3839 for m_video, m_name in zip(m_videos,m_names):
3840 video_id=m_video.group('video_id')
3841 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3842 playlist_entries.append(self.url_result(talk_url, 'TED'))
3843 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3845 def _talk_info(self, url, video_id=0):
3846 """Return the video for the talk in the url"""
3847 m = re.match(self._VALID_URL, url,re.VERBOSE)
3848 video_name = m.group('name')
3849 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3850 self.report_extraction(video_name)
3851 # If the url includes the language we get the title translated
3852 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3854 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3855 webpage, 'json data')
3856 info = json.loads(json_data)
3857 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3858 webpage, 'description', flags = re.DOTALL)
3860 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3861 webpage, 'thumbnail')
# The last htmlStreams entry is used as the download URL.
3864 'url': info['htmlStreams'][-1]['file'],
3867 'thumbnail': thumbnail,
3868 'description': desc,
3872 class MySpassIE(InfoExtractor):
# Information extractor for myspass.de. All metadata comes from a
# dedicated XML endpoint keyed by the video id from the URL path.
3873 _VALID_URL = r'http://www.myspass.de/.*'
3875 def _real_extract(self, url):
3876 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3878 # video id is the last path element of the URL
3879 # usually there is a trailing slash, so also try the second but last
3880 url_path = compat_urllib_parse_urlparse(url).path
3881 url_parent_path, video_id = os.path.split(url_path)
3883 _, video_id = os.path.split(url_parent_path)
3886 metadata_url = META_DATA_URL_TEMPLATE % video_id
3887 metadata_text = self._download_webpage(metadata_url, video_id)
3888 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3890 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail fall
# back gracefully when their elements are missing.
3891 url_flv_el = metadata.find('url_flv')
3892 if url_flv_el is None:
3893 raise ExtractorError(u'Unable to extract download url')
3894 video_url = url_flv_el.text
3895 extension = os.path.splitext(video_url)[1][1:]
3896 title_el = metadata.find('title')
3897 if title_el is None:
3898 raise ExtractorError(u'Unable to extract title')
3899 title = title_el.text
3900 format_id_el = metadata.find('format_id')
3901 if format_id_el is None:
3904 format = format_id_el.text
3905 description_el = metadata.find('description')
3906 if description_el is not None:
3907 description = description_el.text
3910 imagePreview_el = metadata.find('imagePreview')
3911 if imagePreview_el is not None:
3912 thumbnail = imagePreview_el.text
3921 'thumbnail': thumbnail,
3922 'description': description
3926 class SpiegelIE(InfoExtractor):
# Information extractor for spiegel.de videos; stream data comes from a
# per-video XML file on their flash server.
3927 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3929 def _real_extract(self, url):
3930 m = re.match(self._VALID_URL, url)
3931 video_id = m.group('videoID')
3933 webpage = self._download_webpage(url, video_id)
3935 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3938 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3939 xml_code = self._download_webpage(xml_url, video_id,
3940 note=u'Downloading XML', errnote=u'Failed to download XML')
3942 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element of the XML document is taken as the best
# rendition; its filename/duration children describe the stream.
3943 last_type = idoc[-1]
3944 filename = last_type.findall('./filename')[0].text
3945 duration = float(last_type.findall('./duration')[0].text)
3947 video_url = 'http://video2.spiegel.de/flash/' + filename
3948 video_ext = filename.rpartition('.')[2]
3953 'title': video_title,
3954 'duration': duration,
3958 class LiveLeakIE(InfoExtractor):
# Information extractor for liveleak.com view pages.
3960 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3961 IE_NAME = u'liveleak'
3963 def _real_extract(self, url):
3964 mobj = re.match(self._VALID_URL, url)
3966 raise ExtractorError(u'Invalid URL: %s' % url)
3968 video_id = mobj.group('video_id')
3970 webpage = self._download_webpage(url, video_id)
3972 video_url = self._search_regex(r'file: "(.*?)",',
3973 webpage, u'video URL')
# og:title carries a site prefix which is stripped off here.
3975 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3976 webpage, u'title').replace('LiveLeak.com -', '').strip()
3978 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3979 webpage, u'description', fatal=False)
3981 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3982 webpage, u'uploader', fatal=False)
3988 'title': video_title,
3989 'description': video_description,
3990 'uploader': video_uploader
3995 class ARDIE(InfoExtractor):
# Information extractor for the ARD Mediathek. Picks the default media
# type at the highest advertised quality; result is either an RTMP
# stream or a direct HTTP mp4 download.
3996 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3997 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3998 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4000 def _real_extract(self, url):
4001 # determine video id from url
# A documentId query parameter, when present, wins over the path id.
4002 m = re.match(self._VALID_URL, url)
4004 numid = re.search(r'documentId=([0-9]+)', url)
4006 video_id = numid.group(1)
4008 video_id = m.group('video_id')
4010 # determine title and media streams from webpage
4011 html = self._download_webpage(url, video_id)
4012 title = re.search(self._TITLE, html).group('title')
4013 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker in the page = age-restricted content
# that ARD only serves after 8 pm.
4015 assert '"fsk"' in html
4016 raise ExtractorError(u'This video is only available after 8:00 pm')
4018 # choose default media type and highest quality for now
4019 stream = max([s for s in streams if int(s["media_type"]) == 0],
4020 key=lambda s: int(s["quality"]))
4022 # there's two possibilities: RTMP stream or HTTP download
4023 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4024 if stream['rtmp_url']:
4025 self.to_screen(u'RTMP download detected')
4026 assert stream['video_url'].startswith('mp4:')
4027 info["url"] = stream["rtmp_url"]
4028 info["play_path"] = stream['video_url']
4030 assert stream["video_url"].endswith('.mp4')
4031 info["url"] = stream["video_url"]
4034 class ZDFIE(InfoExtractor):
# Information extractor for the ZDF Mediathek. Selects a Windows-Media
# ("wstreaming") stream, resolves it to an mms:// or rtsp:// URL via an
# intermediate playlist download.
4035 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4036 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4037 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4038 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4039 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4041 def _real_extract(self, url):
4042 mobj = re.match(self._VALID_URL, url)
4044 raise ExtractorError(u'Invalid URL: %s' % url)
4045 video_id = mobj.group('video_id')
4047 html = self._download_webpage(url, video_id)
4048 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4050 raise ExtractorError(u'No media url found.')
4052 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4053 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4054 # choose first/default media type and highest quality for now
# Preference order: quality '300' wstreaming, then 'veryhigh'
# wstreaming; anything else is treated as no usable stream.
4055 for s in streams: #find 300 - dsl1000mbit
4056 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4059 for s in streams: #find veryhigh - dsl2000mbit
4060 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4064 raise ExtractorError(u'No stream found.')
# The chosen stream URL points at a playlist file that contains the
# real mms:// (or rtsp://) media link.
4066 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4068 self.report_extraction(video_id)
4069 mobj = re.search(self._TITLE, html)
4071 raise ExtractorError(u'Cannot extract title')
4072 title = unescapeHTML(mobj.group('title'))
4074 mobj = re.search(self._MMS_STREAM, media_link)
4076 mobj = re.search(self._RTSP_STREAM, media_link)
4078 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4079 mms_url = mobj.group('video_url')
# Extension is taken from the final media URL's last dot-suffix.
4081 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4083 raise ExtractorError(u'Cannot extract extention')
4084 ext = mobj.group('ext')
4086 return [{'id': video_id,
# Extractor for Tumblr video posts.  The media URL is embedded in escaped
# JavaScript (hence the literal \x22 sequences standing for double quotes).
# NOTE(review): some guard lines are elided from this numbered listing.
4092 class TumblrIE(InfoExtractor):
4093 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4095 def _real_extract(self, url):
4096 m_url = re.match(self._VALID_URL, url)
4097 video_id = m_url.group('id')
4098 blog = m_url.group('blog_name')
# Canonicalize to the /post/ URL regardless of which form matched.
4100 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4101 webpage = self._download_webpage(url, video_id)
4103 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4104 video = re.search(re_video, webpage)
4106 raise ExtractorError(u'Unable to extract video')
4107 video_url = video.group('video_url')
4108 ext = video.group('ext')
4110 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
4111 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Strip the JavaScript escaping backslashes from the thumbnail URL.
4112 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
4114 # The only place where you can get a title, it's not complete,
4115 # but searching in other places doesn't work for all videos
4116 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
4117 webpage, u'title', flags=re.DOTALL)
4119 return [{'id': video_id,
4121 'title': video_title,
4122 'thumbnail': video_thumbnail,
# Extractor for free Bandcamp tracks: follows the "free download" page and
# rebuilds the final mp3-320 download URL from its components.
# NOTE(review): 'id' shadows the builtin here; renaming would be a code change,
# so it is only flagged.  Some listing lines are elided.
4126 class BandcampIE(InfoExtractor):
4127 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4129 def _real_extract(self, url):
4130 mobj = re.match(self._VALID_URL, url)
4131 title = mobj.group('title')
4132 webpage = self._download_webpage(url, title)
4133 # We get the link to the free download page
4134 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4135 if m_download is None:
4136 raise ExtractorError(u'No free songs found')
4138 download_link = m_download.group(1)
# Track id is scraped from the inline TralbumData JavaScript object.
4139 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4140 webpage, re.MULTILINE|re.DOTALL).group('id')
4142 download_webpage = self._download_webpage(download_link, id,
4143 'Downloading free downloads page')
4144 # We get the dictionary of the track from some javascrip code
4145 info = re.search(r'items: (.*?),$',
4146 download_webpage, re.MULTILINE).group(1)
4147 info = json.loads(info)[0]
4148 # We pick mp3-320 for now, until format selection can be easily implemented.
4149 mp3_info = info[u'downloads'][u'mp3-320']
4150 # If we try to use this url it says the link has expired
4151 initial_url = mp3_info[u'url']
4152 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4153 m_url = re.match(re_url, initial_url)
4154 #We build the url we will use to get the final track url
4155 # This url is build in Bandcamp in the script download_bunde_*.js
4156 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4157 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4158 # If we could correctly generate the .rand field the url would be
4159 #in the "download_url" key
4160 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4162 track_info = {'id':id,
4163 'title' : info[u'title'],
4166 'thumbnail' : info[u'thumb_url'],
4167 'uploader' : info[u'artist']
# Extractor for redtube.com; video URL and title are scraped from the page
# HTML with fixed regexes.  NOTE(review): the "if mobj is None:" guard line
# before the raise is elided in this numbered listing.
4172 class RedTubeIE(InfoExtractor):
4173 """Information Extractor for redtube"""
4174 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4176 def _real_extract(self,url):
4177 mobj = re.match(self._VALID_URL, url)
4179 raise ExtractorError(u'Invalid URL: %s' % url)
4181 video_id = mobj.group('id')
4182 video_extension = 'mp4'
4183 webpage = self._download_webpage(url, video_id)
4185 self.report_extraction(video_id)
4187 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
4188 webpage, u'video URL')
4190 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# Tail of the returned info dict (leading entries elided in this listing).
4196 'ext': video_extension,
4197 'title': video_title,
# Extractor for Ina.fr: fetches the player's MRSS feed for the video id and
# reads the mp4 URL and CDATA title from it.
4200 class InaIE(InfoExtractor):
4201 """Information Extractor for Ina.fr"""
4202 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4204 def _real_extract(self,url):
4205 mobj = re.match(self._VALID_URL, url)
4207 video_id = mobj.group('id')
4208 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4209 video_extension = 'mp4'
# The "webpage" here is actually the MRSS XML document, not HTML.
4210 webpage = self._download_webpage(mrss_url, video_id)
4212 self.report_extraction(video_id)
4214 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4215 webpage, u'video URL')
4217 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
# Tail of the returned info dict (leading entries elided in this listing).
4223 'ext': video_extension,
4224 'title': video_title,
# Extractor for Howcast.com; pulls the mobile mp4 URL plus og:/meta tags for
# title, description and thumbnail.
4227 class HowcastIE(InfoExtractor):
4228 """Information Extractor for Howcast.com"""
4229 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4231 def _real_extract(self, url):
4232 mobj = re.match(self._VALID_URL, url)
4234 video_id = mobj.group('id')
# Re-canonicalize the URL so query strings/fragments are dropped.
4235 webpage_url = 'http://www.howcast.com/videos/' + video_id
4236 webpage = self._download_webpage(webpage_url, video_id)
4238 self.report_extraction(video_id)
4240 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4241 webpage, u'video URL')
4243 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4246 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4247 webpage, u'description', fatal=False)
4249 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4250 webpage, u'thumbnail', fatal=False)
# Tail of the returned info dict (leading entries elided in this listing).
4256 'title': video_title,
4257 'description': video_description,
4258 'thumbnail': thumbnail,
# Extractor for Vine.co; the stream URL comes from the twitter:player:stream
# meta tag, other metadata from og: tags and the user block.
4261 class VineIE(InfoExtractor):
4262 """Information Extractor for Vine.co"""
4263 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4265 def _real_extract(self, url):
4266 mobj = re.match(self._VALID_URL, url)
4268 video_id = mobj.group('id')
4269 webpage_url = 'https://vine.co/v/' + video_id
4270 webpage = self._download_webpage(webpage_url, video_id)
4272 self.report_extraction(video_id)
4274 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
4275 webpage, u'video URL')
4277 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# Query string on the thumbnail URL is deliberately excluded from the capture.
4280 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
4281 webpage, u'thumbnail', fatal=False)
4283 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
4284 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# Tail of the returned info dict (leading entries elided in this listing).
4290 'title': video_title,
4291 'thumbnail': thumbnail,
4292 'uploader': uploader,
# Extractor for Flickr videos: two intermediate XML documents are fetched
# (video_mtl_xml.gne, then video_playlist.gne) to resolve the RTMP-ish
# STREAM APP/FULLPATH pair into a playable URL.
4295 class FlickrIE(InfoExtractor):
4296 """Information Extractor for Flickr videos"""
4297 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4299 def _real_extract(self, url):
4300 mobj = re.match(self._VALID_URL, url)
4302 video_id = mobj.group('id')
4303 video_uploader_id = mobj.group('uploader_id')
4304 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4305 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo "secret" is required by both XML endpoints below.
4307 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
4309 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4310 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4312 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
4313 first_xml, u'node_id')
4315 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4316 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4318 self.report_extraction(video_id)
4320 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4322 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH.
4323 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4325 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
4326 webpage, u'video title')
4328 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
4329 webpage, u'description', fatal=False)
4331 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
4332 webpage, u'thumbnail', fatal=False)
# Tail of the returned info dict (leading entries elided in this listing).
4338 'title': video_title,
4339 'description': video_description,
4340 'thumbnail': thumbnail,
4341 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com: resolves the article's data-id, then fetches a
# per-video XML document (cvp/2.0/<id>.xml) for the high-quality file URL.
4344 class TeamcocoIE(InfoExtractor):
4345 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4347 def _real_extract(self, url):
4348 mobj = re.match(self._VALID_URL, url)
4350 raise ExtractorError(u'Invalid URL: %s' % url)
4351 url_title = mobj.group('url_title')
4352 webpage = self._download_webpage(url, url_title)
4354 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
4355 webpage, u'video id')
4357 self.report_extraction(video_id)
4359 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
4362 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
4363 webpage, u'thumbnail', fatal=False)
4365 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
4366 webpage, u'description', fatal=False)
4368 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4369 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Pick the <file type="high"> entry from the XML.
4371 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# Tail of the returned info dict (leading entries elided in this listing).
4378 'title': video_title,
4379 'thumbnail': thumbnail,
4380 'description': video_description,
# Extractor for xHamster: media URL comes from inline JS ('srv'/'file'); the
# upload date, uploader id and thumbnail are scraped separately.
# NOTE(review): several "if mobj is None:"/"else:" lines are elided here.
4383 class XHamsterIE(InfoExtractor):
4384 """Information Extractor for xHamster"""
4385 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4387 def _real_extract(self,url):
4388 mobj = re.match(self._VALID_URL, url)
4390 video_id = mobj.group('id')
4391 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
4392 webpage = self._download_webpage(mrss_url, video_id)
4394 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4396 raise ExtractorError(u'Unable to extract media URL')
# Empty 'srv' means 'file' is already a full (percent-encoded) URL;
# otherwise the URL is server + '/key=' + file.
4397 if len(mobj.group('server')) == 0:
4398 video_url = compat_urllib_parse.unquote(mobj.group('file'))
4400 video_url = mobj.group('server')+'/key='+mobj.group('file')
4401 video_extension = video_url.split('.')[-1]
4403 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
4406 # Can't see the description anywhere in the UI
4407 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
4408 # webpage, u'description', fatal=False)
4409 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is reassembled as YYYYMMDD from a tooltip timestamp.
4411 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4413 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4415 video_upload_date = None
4416 self._downloader.report_warning(u'Unable to extract upload date')
4418 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
4419 webpage, u'uploader id', default=u'anonymous')
4421 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
4422 webpage, u'thumbnail', fatal=False)
# Tail of the returned info dict (leading entries elided in this listing).
4427 'ext': video_extension,
4428 'title': video_title,
4429 # 'description': video_description,
4430 'upload_date': video_upload_date,
4431 'uploader_id': video_uploader_id,
4432 'thumbnail': video_thumbnail
# Extractor for Hype Machine (hypem.com): loads the displayList-data JSON,
# then calls /serve/source with the session cookie to get the final URL.
# NOTE(review): try/except blocks around the json.loads calls are elided here.
4435 class HypemIE(InfoExtractor):
4436 """Information Extractor for hypem"""
4437 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4439 def _real_extract(self, url):
4440 mobj = re.match(self._VALID_URL, url)
4442 raise ExtractorError(u'Invalid URL: %s' % url)
4443 track_id = mobj.group(1)
# Cache-busting timestamp parameter for the first request.
4445 data = { 'ax': 1, 'ts': time.time() }
4446 data_encoded = compat_urllib_parse.urlencode(data)
4447 complete_url = url + "?" + data_encoded
4448 request = compat_urllib_request.Request(complete_url)
4449 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# Session cookie must be replayed on the /serve/source request below.
4450 cookie = urlh.headers.get('Set-Cookie', '')
4452 self.report_extraction(track_id)
4454 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
4455 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
4457 track_list = json.loads(html_tracks)
4458 track = track_list[u'tracks'][0]
4460 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4463 track_id = track[u"id"]
4464 artist = track[u"artist"]
4465 title = track[u"song"]
# NOTE(review): 'key' used below is assigned on a line elided from this
# listing — presumably key = track[u"key"]; verify against full source.
4467 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4468 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4469 request.add_header('cookie', cookie)
4470 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4472 song_data = json.loads(song_data_json)
4474 raise ExtractorError(u'Hypemachine contained invalid JSON.')
4475 final_url = song_data[u"url"]
# Extractor for vbox7.com: follows a JavaScript redirect, then POSTs to
# play/magare.do to obtain "url=...&thumb=..." style media info.
4485 class Vbox7IE(InfoExtractor):
4486 """Information Extractor for Vbox7"""
4487 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4489 def _real_extract(self,url):
4490 mobj = re.match(self._VALID_URL, url)
4492 raise ExtractorError(u'Invalid URL: %s' % url)
4493 video_id = mobj.group(1)
4495 redirect_page, urlh = self._download_webpage_handle(url, video_id)
4496 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
4497 redirect_url = urlh.geturl() + new_location
4498 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4500 title = self._html_search_regex(r'<title>(.*)</title>',
4501 webpage, u'title').split('/')[0].strip()
4504 info_url = "http://vbox7.com/play/magare.do"
4505 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4506 info_request = compat_urllib_request.Request(info_url, data)
4507 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4508 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4509 if info_response is None:
4510 raise ExtractorError(u'Unable to extract the media url')
# Response is a two-field urlencoded body; split into final URL + thumbnail.
4511 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# Tail of the returned info dict (leading entries elided in this listing).
4518 'thumbnail': thumbnail_url,
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com.

    Resolves the page's MTV-style "mgid" content id, then queries the
    mrss (metadata) and mediagen (stream list) feeds.  Raises
    ExtractorError on any failure; returns a single info dict whose
    'ext' is 'flv' (the server delivers flv despite mp4-looking URLs).
    """
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid differently from regular videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        # Title, description and thumbnail all live in the mrss feed.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''
        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if m_urls is None or len(m_urls) == 0:
            # BUGFIX: previously raised the undefined name "ExtractError"
            # (a NameError at runtime) with the message "Unable to extrat
            # video url"; corrected to ExtractorError and proper spelling.
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
# Factory returning the ordered list of all extractor instances.
# NOTE(review): nearly all entries of the returned list are elided from this
# numbered listing (see the gaps in the embedded line numbers); only a few
# representative entries are visible below.
4572 def gen_extractors():
4573 """ Return a list of an instance of every supported extractor.
4574 The order does matter; the first extractor matched is the one handling the URL.
4577 YoutubePlaylistIE(),
4602 StanfordOpenClassroomIE(),
4612 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``.

    The class is resolved from this module's global namespace; a missing
    extractor therefore surfaces as a KeyError.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]