2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 self.report_download_webpage(video_id)
118 elif note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 return webpage_bytes.decode(encoding, 'replace')
147 def to_screen(self, msg):
148 """Print msg to screen, prefixing it with '[ie_name]'"""
149 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
151 def report_extraction(self, id_or_name):
152 """Report information extraction."""
153 self.to_screen(u'%s: Extracting information' % id_or_name)
155 def report_download_webpage(self, video_id):
156 """Report webpage download."""
157 self.to_screen(u'%s: Downloading webpage' % video_id)
159 def report_age_confirmation(self):
160 """Report attempt to confirm age."""
161 self.to_screen(u'Confirming age')
163 #Methods for following #608
164 #They set the correct value of the '_type' key
165 def video_result(self, video_info):
166 """Returns a video"""
167 video_info['_type'] = 'video'
169 def url_result(self, url, ie=None):
170 """Returns a url that points to a page that should be processed"""
171 #TODO: ie should be the class used for getting the info
172 video_info = {'_type': 'url',
176 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
177 """Returns a playlist"""
178 video_info = {'_type': 'playlist',
181 video_info['id'] = playlist_id
183 video_info['title'] = playlist_title
187 class YoutubeIE(InfoExtractor):
188 """Information extractor for youtube.com."""
192 (?:https?://)? # http(s):// (optional)
193 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
194 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
195 (?:.*?\#/)? # handle anchor (#/) redirect urls
196 (?: # the various things that can precede the ID:
197 (?:(?:v|embed|e)/) # v/ or embed/ or e/
198 |(?: # or the v= param in all its forms
199 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
200 (?:\?|\#!?) # the params delimiter ? or # or #!
201 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
204 )? # optional -> youtube.com/xxxx is OK
205 )? # all until now is optional -> you can pass the naked ID
206 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
207 (?(1).+)? # if we found the ID, everything can follow
209 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
210 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
211 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
212 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
213 _NETRC_MACHINE = 'youtube'
214 # Listed in order of quality
215 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
216 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
217 _video_extensions = {
223 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
229 _video_dimensions = {
248 def suitable(cls, url):
249 """Receives a URL and returns True if suitable for this IE."""
250 if YoutubePlaylistIE.suitable(url): return False
251 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
253 def report_lang(self):
254 """Report attempt to set language."""
255 self.to_screen(u'Setting language')
257 def report_login(self):
258 """Report attempt to log in."""
259 self.to_screen(u'Logging in')
261 def report_video_webpage_download(self, video_id):
262 """Report attempt to download video webpage."""
263 self.to_screen(u'%s: Downloading video webpage' % video_id)
265 def report_video_info_webpage_download(self, video_id):
266 """Report attempt to download video info webpage."""
267 self.to_screen(u'%s: Downloading video info webpage' % video_id)
269 def report_video_subtitles_download(self, video_id):
270 """Report attempt to download video info webpage."""
271 self.to_screen(u'%s: Checking available subtitles' % video_id)
273 def report_video_subtitles_request(self, video_id, sub_lang, format):
274 """Report attempt to download video info webpage."""
275 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
277 def report_video_subtitles_available(self, video_id, sub_lang_list):
278 """Report available subtitles."""
279 sub_lang = ",".join(list(sub_lang_list.keys()))
280 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
282 def report_information_extraction(self, video_id):
283 """Report attempt to extract video information."""
284 self.to_screen(u'%s: Extracting video information' % video_id)
286 def report_unavailable_format(self, video_id, format):
287 """Report extracted video URL."""
288 self.to_screen(u'%s: Format %s not available' % (video_id, format))
290 def report_rtmp_download(self):
291 """Indicate the download will use the RTMP protocol."""
292 self.to_screen(u'RTMP download detected')
294 def _get_available_subtitles(self, video_id):
295 self.report_video_subtitles_download(video_id)
296 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
298 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
299 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
300 return (u'unable to download video subtitles: %s' % compat_str(err), None)
301 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
302 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
303 if not sub_lang_list:
304 return (u'video doesn\'t have subtitles', None)
307 def _list_available_subtitles(self, video_id):
308 sub_lang_list = self._get_available_subtitles(video_id)
309 self.report_video_subtitles_available(video_id, sub_lang_list)
311 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
314 (error_message, sub_lang, sub)
316 self.report_video_subtitles_request(video_id, sub_lang, format)
317 params = compat_urllib_parse.urlencode({
323 url = 'http://www.youtube.com/api/timedtext?' + params
325 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
326 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
327 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
329 return (u'Did not fetch video subtitles', None, None)
330 return (None, sub_lang, sub)
332 def _extract_subtitle(self, video_id):
334 Return a list with a tuple:
335 [(error_message, sub_lang, sub)]
337 sub_lang_list = self._get_available_subtitles(video_id)
338 sub_format = self._downloader.params.get('subtitlesformat')
339 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
340 return [(sub_lang_list[0], None, None)]
341 if self._downloader.params.get('subtitleslang', False):
342 sub_lang = self._downloader.params.get('subtitleslang')
343 elif 'en' in sub_lang_list:
346 sub_lang = list(sub_lang_list.keys())[0]
347 if not sub_lang in sub_lang_list:
348 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
350 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
353 def _extract_all_subtitles(self, video_id):
354 sub_lang_list = self._get_available_subtitles(video_id)
355 sub_format = self._downloader.params.get('subtitlesformat')
356 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
357 return [(sub_lang_list[0], None, None)]
359 for sub_lang in sub_lang_list:
360 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
361 subtitles.append(subtitle)
364 def _print_formats(self, formats):
365 print('Available formats:')
367 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
369 def _real_initialize(self):
370 if self._downloader is None:
375 downloader_params = self._downloader.params
377 # Attempt to use provided username and password or .netrc data
378 if downloader_params.get('username', None) is not None:
379 username = downloader_params['username']
380 password = downloader_params['password']
381 elif downloader_params.get('usenetrc', False):
383 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
388 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
389 except (IOError, netrc.NetrcParseError) as err:
390 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
394 request = compat_urllib_request.Request(self._LANG_URL)
397 compat_urllib_request.urlopen(request).read()
398 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
399 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
402 # No authentication to be performed
406 request = compat_urllib_request.Request(self._LOGIN_URL)
408 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
409 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
410 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
415 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
417 galx = match.group(1)
419 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
425 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
429 u'PersistentCookie': u'yes',
431 u'bgresponse': u'js_disabled',
432 u'checkConnection': u'',
433 u'checkedDomains': u'youtube',
439 u'signIn': u'Sign in',
441 u'service': u'youtube',
445 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
447 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
448 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
449 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
452 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
453 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
454 self._downloader.report_warning(u'unable to log in: bad username or password')
456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
457 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
463 'action_confirm': 'Confirm',
465 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
467 self.report_age_confirmation()
468 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
469 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
470 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
473 def _extract_id(self, url):
474 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
476 self._downloader.report_error(u'invalid URL: %s' % url)
478 video_id = mobj.group(2)
481 def _real_extract(self, url):
482 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
483 mobj = re.search(self._NEXT_URL_RE, url)
485 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
486 video_id = self._extract_id(url)
489 self.report_video_webpage_download(video_id)
490 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
491 request = compat_urllib_request.Request(url)
493 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
494 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
495 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
498 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
500 # Attempt to extract SWF player URL
501 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
503 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
508 self.report_video_info_webpage_download(video_id)
509 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
510 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
511 % (video_id, el_type))
512 video_info_webpage = self._download_webpage(video_info_url, video_id,
514 errnote='unable to download video info webpage')
515 video_info = compat_parse_qs(video_info_webpage)
516 if 'token' in video_info:
518 if 'token' not in video_info:
519 if 'reason' in video_info:
520 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
522 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
525 # Check for "rental" videos
526 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
527 self._downloader.report_error(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
534 if 'author' not in video_info:
535 self._downloader.report_error(u'unable to extract uploader name')
537 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
540 video_uploader_id = None
541 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
543 video_uploader_id = mobj.group(1)
545 self._downloader.report_warning(u'unable to extract uploader nickname')
548 if 'title' not in video_info:
549 self._downloader.report_error(u'unable to extract video title')
551 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
554 if 'thumbnail_url' not in video_info:
555 self._downloader.report_warning(u'unable to extract video thumbnail')
557 else: # don't panic if we can't find it
558 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
562 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
564 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
565 upload_date = unified_strdate(upload_date)
568 video_description = get_element_by_id("eow-description", video_webpage)
569 if video_description:
570 video_description = clean_html(video_description)
572 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
574 video_description = unescapeHTML(fd_mobj.group(1))
576 video_description = u''
579 video_subtitles = None
581 if self._downloader.params.get('writesubtitles', False):
582 video_subtitles = self._extract_subtitle(video_id)
584 (sub_error, sub_lang, sub) = video_subtitles[0]
586 self._downloader.report_error(sub_error)
588 if self._downloader.params.get('allsubtitles', False):
589 video_subtitles = self._extract_all_subtitles(video_id)
590 for video_subtitle in video_subtitles:
591 (sub_error, sub_lang, sub) = video_subtitle
593 self._downloader.report_error(sub_error)
595 if self._downloader.params.get('listsubtitles', False):
596 sub_lang_list = self._list_available_subtitles(video_id)
599 if 'length_seconds' not in video_info:
600 self._downloader.report_warning(u'unable to extract video duration')
603 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
606 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
608 # Decide which formats to download
609 req_format = self._downloader.params.get('format', None)
611 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
612 self.report_rtmp_download()
613 video_url_list = [(None, video_info['conn'][0])]
614 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
615 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
616 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
617 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
618 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
620 format_limit = self._downloader.params.get('format_limit', None)
621 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
622 if format_limit is not None and format_limit in available_formats:
623 format_list = available_formats[available_formats.index(format_limit):]
625 format_list = available_formats
626 existing_formats = [x for x in format_list if x in url_map]
627 if len(existing_formats) == 0:
628 raise ExtractorError(u'no known formats available for video')
629 if self._downloader.params.get('listformats', None):
630 self._print_formats(existing_formats)
632 if req_format is None or req_format == 'best':
633 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
634 elif req_format == 'worst':
635 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
636 elif req_format in ('-1', 'all'):
637 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
639 # Specific formats. We pick the first in a slash-delimeted sequence.
640 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
641 req_formats = req_format.split('/')
642 video_url_list = None
643 for rf in req_formats:
645 video_url_list = [(rf, url_map[rf])]
647 if video_url_list is None:
648 raise ExtractorError(u'requested format not available')
650 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
653 for format_param, video_real_url in video_url_list:
655 video_extension = self._video_extensions.get(format_param, 'flv')
657 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
658 self._video_dimensions.get(format_param, '???'))
662 'url': video_real_url,
663 'uploader': video_uploader,
664 'uploader_id': video_uploader_id,
665 'upload_date': upload_date,
666 'title': video_title,
667 'ext': video_extension,
668 'format': video_format,
669 'thumbnail': video_thumbnail,
670 'description': video_description,
671 'player_url': player_url,
672 'subtitles': video_subtitles,
673 'duration': video_duration
678 class MetacafeIE(InfoExtractor):
679 """Information Extractor for metacafe.com."""
681 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
682 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
683 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
684 IE_NAME = u'metacafe'
686 def report_disclaimer(self):
687 """Report disclaimer retrieval."""
688 self.to_screen(u'Retrieving disclaimer')
690 def _real_initialize(self):
691 # Retrieve disclaimer
692 request = compat_urllib_request.Request(self._DISCLAIMER)
694 self.report_disclaimer()
695 disclaimer = compat_urllib_request.urlopen(request).read()
696 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
697 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
703 'submit': "Continue - I'm over 18",
705 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707 self.report_age_confirmation()
708 disclaimer = compat_urllib_request.urlopen(request).read()
709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
710 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
713 def _real_extract(self, url):
714 # Extract id and simplified title from URL
715 mobj = re.match(self._VALID_URL, url)
717 self._downloader.report_error(u'invalid URL: %s' % url)
720 video_id = mobj.group(1)
722 # Check if video comes from YouTube
723 mobj2 = re.match(r'^yt-(.*)$', video_id)
724 if mobj2 is not None:
725 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
727 # Retrieve video webpage to extract further information
728 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
730 # Extract URL, uploader and title from webpage
731 self.report_extraction(video_id)
732 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
734 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
735 video_extension = mediaURL[-3:]
737 # Extract gdaKey if available
738 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
742 gdaKey = mobj.group(1)
743 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
745 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
747 self._downloader.report_error(u'unable to extract media URL')
749 vardict = compat_parse_qs(mobj.group(1))
750 if 'mediaData' not in vardict:
751 self._downloader.report_error(u'unable to extract media URL')
753 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
755 self._downloader.report_error(u'unable to extract media URL')
757 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
758 video_extension = mediaURL[-3:]
759 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
761 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
763 self._downloader.report_error(u'unable to extract title')
765 video_title = mobj.group(1).decode('utf-8')
767 mobj = re.search(r'submitter=(.*?);', webpage)
769 self._downloader.report_error(u'unable to extract uploader nickname')
771 video_uploader = mobj.group(1)
774 'id': video_id.decode('utf-8'),
775 'url': video_url.decode('utf-8'),
776 'uploader': video_uploader.decode('utf-8'),
778 'title': video_title,
779 'ext': video_extension.decode('utf-8'),
782 class RedtubeIE(InfoExtractor):
783 """Information Extractor for redtube"""
784 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
787 def _real_extract(self,url):
788 mobj = re.match(self._VALID_URL, url)
790 self._downloader.report_error(u'invalid URL: %s' % url)
792 video_id = mobj.group('id')
793 video_extension = 'mp4'
794 webpage = self._download_webpage(url, video_id)
795 self.report_extraction(video_id)
796 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
798 video_url = mobj.group(1)
800 self._downloader.report_error(u'unable to extract media URL')
802 mobj = re.search('<h1 class="videoTitle slidePanelMovable">'+r'(.+)'+r'</h1>',webpage)
804 video_title = mobj.group(1)
806 video_title = 'Redtube - %s' % time.ctime()
811 'ext': video_extension,
812 'title': video_title,
815 class DailymotionIE(InfoExtractor):
816 """Information Extractor for Dailymotion"""
818 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
819 IE_NAME = u'dailymotion'
821 def _real_extract(self, url):
822 # Extract id and simplified title from URL
823 mobj = re.match(self._VALID_URL, url)
825 self._downloader.report_error(u'invalid URL: %s' % url)
828 video_id = mobj.group(1).split('_')[0].split('?')[0]
830 video_extension = 'mp4'
832 # Retrieve video webpage to extract further information
833 request = compat_urllib_request.Request(url)
834 request.add_header('Cookie', 'family_filter=off')
835 webpage = self._download_webpage(request, video_id)
837 # Extract URL, uploader and title from webpage
838 self.report_extraction(video_id)
839 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
841 self._downloader.report_error(u'unable to extract media URL')
843 flashvars = compat_urllib_parse.unquote(mobj.group(1))
845 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
848 self.to_screen(u'Using %s' % key)
851 self._downloader.report_error(u'unable to extract video URL')
854 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
856 self._downloader.report_error(u'unable to extract video URL')
859 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
861 # TODO: support choosing qualities
863 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
865 self._downloader.report_error(u'unable to extract title')
867 video_title = unescapeHTML(mobj.group('title'))
869 video_uploader = None
870 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
872 # lookin for official user
873 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
874 if mobj_official is None:
875 self._downloader.report_warning(u'unable to extract uploader nickname')
877 video_uploader = mobj_official.group(1)
879 video_uploader = mobj.group(1)
881 video_upload_date = None
882 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
884 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
889 'uploader': video_uploader,
890 'upload_date': video_upload_date,
891 'title': video_title,
892 'ext': video_extension,
896 class PhotobucketIE(InfoExtractor):
897 """Information extractor for photobucket.com."""
899 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
900 IE_NAME = u'photobucket'
902 def _real_extract(self, url):
903 # Extract id from URL
904 mobj = re.match(self._VALID_URL, url)
906 self._downloader.report_error(u'Invalid URL: %s' % url)
909 video_id = mobj.group(1)
911 video_extension = 'flv'
913 # Retrieve video webpage to extract further information
914 request = compat_urllib_request.Request(url)
916 self.report_download_webpage(video_id)
917 webpage = compat_urllib_request.urlopen(request).read()
918 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
919 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
922 # Extract URL, uploader, and title from webpage
923 self.report_extraction(video_id)
924 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
926 self._downloader.report_error(u'unable to extract media URL')
928 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
932 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
934 self._downloader.report_error(u'unable to extract title')
936 video_title = mobj.group(1).decode('utf-8')
938 video_uploader = mobj.group(2).decode('utf-8')
941 'id': video_id.decode('utf-8'),
942 'url': video_url.decode('utf-8'),
943 'uploader': video_uploader,
945 'title': video_title,
946 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        Non-'/watch/' URLs are rewritten to the canonical English
        '/watch/' form and re-extracted once (new_video=False).
        Returns a one-element list of info dictionaries, or None on error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUG FIX: group(1) of this pattern is the 'people'/'profile' URL
        # fragment; the uploader's display name is the anchor text in group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width, needed for the playlist request below
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':          video_id.decode('utf-8'),
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension.decode('utf-8'),
            'thumbnail':   video_thumbnail.decode('utf-8'),
            'description': video_description,
        }]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract the info dictionary for a vimeo.com video URL."""
        # Pull the numeric clip id out of the URL.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = m.group('id')
        # Normalize: force https, and collapse player/HLS-redirect links
        # onto the canonical watch page.
        if not m.group('proto'):
            url = 'https://' + url
        if m.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Fetch the watch page.
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Extract everything we can: generic fields first, then the
        # Vimeo-specific ones.
        self.report_extraction(video_id)

        # The player configuration is embedded in the page as a JS object;
        # slice the JSON out from between ' = {config:' and ',assets:'.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            # NOTE: broad except kept deliberately — any slicing/JSON failure
            # lands here and is reported rather than propagated.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Common fields straight from the config JSON.
        video_title = config["video"]["title"]
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
        video_thumbnail = config["video"]["thumbnail"]

        # The description lives in the page markup, not in the config JSON.
        raw_description = get_element_by_attribute("itemprop", "description", webpage)
        video_description = clean_html(raw_description) if raw_description else u''

        # Upload date (YYYYMMDD) from the dateCreated meta tag, if present.
        video_upload_date = None
        m = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if m is not None:
            video_upload_date = m.group(1) + m.group(2) + m.group(3)

        # Vimeo specific: request signature and timestamp, required by the
        # play_redirect endpoint used below.
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: choose a file. Quality wins first (hd > sd > other),
        # then codec preference order.
        # TODO bind to format param
        codec_prefs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        buckets = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_ext in codec_prefs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    buckets['hd'].append((codec_name, codec_ext, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    buckets['sd'].append((codec_name, codec_ext, 'sd'))
                else:
                    buckets['other'].append((codec_name, codec_ext, config["video"]["files"][codec_name][0]))

        chosen = None
        for quality in ('hd', 'sd', 'other'):
            if buckets[quality]:
                chosen = buckets[quality][0]
                break
        if chosen is None:
            self._downloader.report_error(u'no known codec found')
            return

        video_codec, video_extension, video_quality = chosen
        self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams end in 'index-<number>.html'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body, or None on error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and map capture groups into a dict.

        matchTuples is a list of (group_index, key, error_message); each
        named group is stored under *key*. Returns the dict, or None when
        the page does not match or a required group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP live-stream URL for a live arte.tv page.

        NOTE(review): this builds video_url but does not return an info
        dict — live extraction appears unfinished; _real_extract simply
        returns after calling it. Kept as-is to preserve behavior.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            # FIX: use raw strings for all regex fragments below — the
            # originals were plain strings containing regex escapes.
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of '+7' replay pages down to the video info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the embed parameter pointing at the videoref XML.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the language-specific <video> reference.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the final metadata document with id/title/date/url.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUG FIX: this failure previously reported 'unable to extract
            # title', masking the actual problem (uploader/domain parsing).
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'ytsearchN:' prefix and return that many results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: this branch previously lacked 'return', so
            # 'ytsearchall:' queries silently produced no results.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have
        # enough ids or the server reports fewer total items.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # IDIOM: loop variable renamed from 'id' to avoid shadowing the builtin.
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube') for vid in video_ids]
        return videos
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearchN:' prefix and hand off the right count."""
        m = re.match(self._VALID_URL, query)
        if m is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'gvsearch:' means a single result.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                count = int(prefix)
                if count <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                    return
                if count > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                    count = self._max_google_results
                self._download_n_results(query, count)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        def _enqueue(ids):
            # Forward every collected docid to the FileDownloader.
            for docid in ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % docid])

        found_ids = []
        page_index = 0

        while True:
            self.report_download_page(query, page_index)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_index*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping first-seen order.
            for m in re.finditer(self._VIDEO_INDICATOR, page):
                docid = m.group(1)
                if docid not in found_ids:
                    found_ids.append(docid)
                    if len(found_ids) == n:
                        # Specified n videos reached
                        _enqueue(found_ids)
                        return

            # No "next page" link: we have everything there is.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _enqueue(found_ids)
                return

            page_index += 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearchN:' prefix and hand off the right count."""
        m = re.match(self._VALID_URL, query)
        if m is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # Bare 'yvsearch:' means a single result.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        else:
            try:
                count = int(prefix)
                if count <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                    return
                if count > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                    count = self._max_yahoo_results
                self._download_n_results(query, count)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        def _enqueue(ids):
            # Forward every collected id to the FileDownloader.
            for vid in ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])

        found_ids = []
        already_seen = set()
        page_index = 1

        while True:
            self.report_download_page(query, page_index)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), page_index)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, deduplicated in first-seen order.
            for m in re.finditer(self._VIDEO_INDICATOR, page):
                vid = m.group(1)
                if vid not in already_seen:
                    found_ids.append(vid)
                    already_seen.add(vid)
                    if len(found_ids) == n:
                        # Specified n videos reached
                        _enqueue(found_ids)
                        return

            # No "Next" link: we have everything there is.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _enqueue(found_ids)
                return

            page_index += 1
1685 class YoutubePlaylistIE(InfoExtractor):
1686 """Information Extractor for YouTube playlists."""
1688 _VALID_URL = r"""(?:
1693 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1694 \? (?:.*?&)*? (?:p|a|list)=
1697 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1700 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1702 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1704 IE_NAME = u'youtube:playlist'
1707 def suitable(cls, url):
1708 """Receives a URL and returns True if suitable for this IE."""
1709 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1711 def _real_extract(self, url):
1712 # Extract playlist id
1713 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1715 self._downloader.report_error(u'invalid url: %s' % url)
1718 # Download playlist videos from API
1719 playlist_id = mobj.group(1) or mobj.group(2)
1724 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1725 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1728 response = json.loads(page)
1729 except ValueError as err:
1730 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1733 if 'feed' not in response:
1734 self._downloader.report_error(u'Got a malformed response from YouTube API')
1736 playlist_title = response['feed']['title']['$t']
1737 if 'entry' not in response['feed']:
1738 # Number of videos is a multiple of self._MAX_RESULTS
1741 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1742 for entry in response['feed']['entry']
1743 if 'content' in entry ]
1745 if len(response['feed']['entry']) < self._MAX_RESULTS:
1749 videos = [v[1] for v in sorted(videos)]
1751 url_results = [self.url_result(url, 'Youtube') for url in videos]
1752 return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect unique video ids from channel HTML, in first-seen order."""
        collected = []
        for m in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = m.group(1)
            if vid not in collected:
                collected.append(vid)
        return collected

    def _real_extract(self, url):
        """Resolve a channel URL into a playlist_result of its videos."""
        # Extract channel id
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download the first (HTML) channel page.
        channel_id = m.group(1)
        video_ids = []
        page_no = 1

        url = self._TEMPLATE_URL % (channel_id, page_no)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % page_no)

        video_ids.extend(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                page_no += 1

                url = self._MORE_PAGES_URL % (page_no, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % page_no)

                # Ajax responses are JSON with the HTML embedded inside.
                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # The "load more" widget disappears on the last page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(u, 'Youtube') for u in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Resolve a user URL into a playlist_result of their uploads."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = m.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        all_ids = []
        page_no = 0

        while True:
            start_index = page_no * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers, deduplicated within the page.
            page_ids = []
            for m in re.finditer(self._VIDEO_INDICATOR, page):
                vid = m.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            all_ids.extend(page_ids)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_no += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in all_ids]
        url_results = [self.url_result(u, 'Youtube') for u in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Result size per Ajax query; used to detect the last page below.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Resolve a blip.tv user URL into a playlist_result of their videos."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # ROBUSTNESS FIX: the original dereferenced mobj.group(1) without a
        # None check, raising AttributeError whenever the page layout changed.
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv user id')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the real download URL and title for a depositfiles link."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: pattern was a non-raw string ('\s+'), an invalid
                # escape sequence — now a raw string.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':          file_id.decode('utf-8'),
            'url':         file_url.decode('utf-8'),
            'uploader':    None,
            'upload_date': None,
            'title':       file_title,
            'ext':         file_extension.decode('utf-8'),
        }]
# FacebookIE: extracts the real video URL, title, duration and thumbnail from
# facebook.com video/photo pages. Optionally logs in first (credentials via
# --username/--password or ~/.netrc).
# NOTE(review): this is an excerpted numbered listing — intervening original
# lines (try:/else:/return statements) between the shown numbers are elided.
1987 class FacebookIE(InfoExtractor):
1988 """Information Extractor for Facebook"""
# Matches video/video.php and photo.php URLs; numeric video id captured as 'ID'.
1990 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1991 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
# Machine name used to look up credentials in the user's .netrc file.
1992 _NETRC_MACHINE = 'facebook'
1993 IE_NAME = u'facebook'
# Announce the login attempt on the downloader's screen/log.
1995 def report_login(self):
1996 """Report attempt to log in."""
1997 self.to_screen(u'Logging in')
# Best-effort login: failures only emit warnings and extraction continues
# unauthenticated.
1999 def _real_initialize(self):
2000 if self._downloader is None:
2005 downloader_params = self._downloader.params
2007 # Attempt to use provided username and password or .netrc data
2008 if downloader_params.get('username', None) is not None:
2009 useremail = downloader_params['username']
2010 password = downloader_params['password']
2011 elif downloader_params.get('usenetrc', False):
2013 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2014 if info is not None:
2018 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2019 except (IOError, netrc.NetrcParseError) as err:
2020 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: skip the login step entirely.
2023 if useremail is None:
2032 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2035 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2036 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2037 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2039 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2040 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Fetch the canonical video page, locate the JSON blob the page passes to its
# SWF player (between the BEFORE/AFTER JS markers), and read the stream URLs
# out of its 'params'/'video_data' structure.
2043 def _real_extract(self, url):
2044 mobj = re.match(self._VALID_URL, url)
2046 self._downloader.report_error(u'invalid URL: %s' % url)
2048 video_id = mobj.group('ID')
2050 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2051 webpage = self._download_webpage(url, video_id)
# Literal JavaScript fragments that bracket the embedded player JSON.
2053 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2054 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2055 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2057 raise ExtractorError(u'Cannot parse data')
2058 data = dict(json.loads(m.group(1)))
2059 params_raw = compat_urllib_parse.unquote(data['params'])
2060 params = json.loads(params_raw)
2061 video_data = params['video_data'][0]
# Prefer the HD stream; fall back to SD when 'hd_src' is absent.
2062 video_url = video_data.get('hd_src')
2064 video_url = video_data['sd_src']
2066 raise ExtractorError(u'Cannot find video URL')
2067 video_duration = int(video_data['video_duration'])
2068 thumbnail = video_data['thumbnail_src']
# Title comes from the page header, not from the JSON blob.
2070 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2072 raise ExtractorError(u'Cannot find title in webpage')
2073 video_title = unescapeHTML(m.group(1))
2077 'title': video_title,
2080 'duration': video_duration,
2081 'thumbnail': thumbnail,
# BlipTVIE: extracts videos from blip.tv via the site's JSON API (requesting
# with an iTunes User-Agent), with a fallback for direct video responses.
# NOTE(review): excerpted numbered listing — intervening original lines
# (try:/else:/return statements, dict openers) are elided from this view.
2086 class BlipTVIE(InfoExtractor):
2087 """Information extractor for blip.tv"""
2089 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Extracts the filename extension from the media URL.
2090 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2091 IE_NAME = u'blip.tv'
2093 def report_direct_download(self, title):
2094 """Report information extraction."""
2095 self.to_screen(u'%s: Direct download detected' % title)
2097 def _real_extract(self, url):
2098 mobj = re.match(self._VALID_URL, url)
2100 self._downloader.report_error(u'invalid URL: %s' % url)
# '/play/' URLs redirect to a player URL carrying the file id in the URL
# fragment; resolve that and recurse with the canonical '/a/a-<id>' form.
2103 urlp = compat_urllib_parse_urlparse(url)
2104 if urlp.path.startswith('/play/'):
2105 request = compat_urllib_request.Request(url)
2106 response = compat_urllib_request.urlopen(request)
2107 redirecturl = response.geturl()
2108 rurlp = compat_urllib_parse_urlparse(redirecturl)
2109 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2110 url = 'http://blip.tv/a/a-' + file_id
2111 return self._real_extract(url)
# Ask the page for its JSON representation; the iTunes User-Agent is what
# makes blip.tv serve the API response.
2118 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2119 request = compat_urllib_request.Request(json_url)
2120 request.add_header('User-Agent', 'iTunes/10.6.1')
2121 self.report_extraction(mobj.group(1))
2124 urlh = compat_urllib_request.urlopen(request)
# If the server replies with the media itself instead of JSON, synthesize
# the info dict straight from the URL's basename.
2125 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2126 basename = url.split('/')[-1]
2127 title,ext = os.path.splitext(basename)
2128 title = title.decode('UTF-8')
2129 ext = ext.replace('.', '')
2130 self.report_direct_download(title)
2135 'upload_date': None,
2140 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2141 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2142 if info is None: # Regular URL
2144 json_code_bytes = urlh.read()
2145 json_code = json_code_bytes.decode('utf-8')
2146 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2147 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2151 json_data = json.loads(json_code)
# The payload is sometimes wrapped in a 'Post' object.
2152 if 'Post' in json_data:
2153 data = json_data['Post']
# 'datestamp' looks like '12-31-12 11:59PM'; normalized to YYYYMMDD.
2157 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2158 video_url = data['media']['url']
2159 umobj = re.match(self._URL_EXT, video_url)
2161 raise ValueError('Can not determine filename extension')
2162 ext = umobj.group(1)
2165 'id': data['item_id'],
2167 'uploader': data['display_name'],
2168 'upload_date': upload_date,
2169 'title': data['title'],
2171 'format': data['media']['mimeType'],
2172 'thumbnail': data['thumbnailUrl'],
2173 'description': data['description'],
2174 'player_url': data['embedUrl'],
# The downloader must reuse the same UA, or the media request may differ.
2175 'user_agent': 'iTunes/10.6.1',
2177 except (ValueError,KeyError) as err:
2178 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
# MyVideoIE: derives the FLV URL for myvideo.de videos from the page's
# image_src <link> element (thumbnail base path + '<id>.flv').
# NOTE(review): excerpted numbered listing — some original lines (if mobj is
# None guards, return statements) are elided between the numbers below.
2184 class MyVideoIE(InfoExtractor):
2185 """Information Extractor for myvideo.de."""
2187 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2188 IE_NAME = u'myvideo'
2190 def _real_extract(self,url):
2191 mobj = re.match(self._VALID_URL, url)
# NOTE(review): 'self._download' below looks like a typo for
# 'self._downloader' (every other call in this file uses _downloader) —
# this line would raise AttributeError if reached; confirm and fix.
2193 self._download.report_error(u'invalid URL: %s' % url)
2196 video_id = mobj.group(1)
2199 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2200 webpage = self._download_webpage(webpage_url, video_id)
2202 self.report_extraction(video_id)
# The thumbnail link encodes the media base path; the FLV lives alongside it.
2203 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2206 self._downloader.report_error(u'unable to extract media URL')
2208 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2210 mobj = re.search('<title>([^<]+)</title>', webpage)
2212 self._downloader.report_error(u'unable to extract title')
2215 video_title = mobj.group(1)
2221 'upload_date': None,
2222 'title': video_title,
# ComedyCentralIE: extracts The Daily Show / Colbert Report episodes and
# clips. Resolves the MTVN media URI from the page, downloads an MRSS index,
# then a per-part configuration XML, and rewrites the chosen RTMP rendition
# URL to a plain HTTP one. Returns one info dict per episode part.
# NOTE(review): excerpted numbered listing — intervening original lines
# (try:/else:/return statements, decorators) are elided from this view.
2226 class ComedyCentralIE(InfoExtractor):
2227 """Information extractor for The Daily Show and Colbert Report """
2229 # urls can be abbreviations like :thedailyshow or :colbert
2230 # urls for episodes like:
2231 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2232 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2233 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: requires re.VERBOSE at every match site (see suitable()).
2234 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2235 |(https?://)?(www\.)?
2236 (?P<showname>thedailyshow|colbertnation)\.com/
2237 (full-episodes/(?P<episode>.*)|
2239 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2240 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site offers, lowest quality last... actually ordered
# ascending; turls[-1] below therefore picks the highest bitrate.
2243 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2245 _video_extensions = {
2253 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the re.VERBOSE flag.
2263 def suitable(cls, url):
2264 """Receives a URL and returns True if suitable for this IE."""
2265 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Print the format table (id, extension, dimensions) for --list-formats.
2267 def _print_formats(self, formats):
2268 print('Available formats:')
2270 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2273 def _real_extract(self, url):
2274 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2276 self._downloader.report_error(u'invalid URL: %s' % url)
# Shortnames like ':tds' expand to the show's full-episodes page.
2279 if mobj.group('shortname'):
2280 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2281 url = u'http://www.thedailyshow.com/full-episodes/'
2283 url = u'http://www.colbertnation.com/full-episodes/'
2284 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2285 assert mobj is not None
2287 if mobj.group('clip'):
2288 if mobj.group('showname') == 'thedailyshow':
2289 epTitle = mobj.group('tdstitle')
2291 epTitle = mobj.group('cntitle')
2294 dlNewest = not mobj.group('episode')
2296 epTitle = mobj.group('showname')
2298 epTitle = mobj.group('episode')
2300 self.report_extraction(epTitle)
2301 webpage = self._download_webpage(url, epTitle)
# Re-match after any HTTP redirect so the episode group reflects the final URL.
2303 url = htmlHandle.geturl()
2304 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2306 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2308 if mobj.group('episode') == '':
2309 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2311 epTitle = mobj.group('episode')
# Locate the mtvnservices media URI embedded in the player markup.
2313 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2315 if len(mMovieParams) == 0:
2316 # The Colbert Report embeds the information in a without
2317 # a URL prefix; so extract the alternate reference
2318 # and then add the URL prefix manually.
2320 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2321 if len(altMovieParams) == 0:
2322 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2325 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2327 uri = mMovieParams[0][1]
# MRSS feed lists the episode's parts as <item> elements.
2328 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2329 indexXml = self._download_webpage(indexUrl, epTitle,
2330 u'Downloading show index',
2331 u'unable to download episode index')
2335 idoc = xml.etree.ElementTree.fromstring(indexXml)
2336 itemEls = idoc.findall('.//item')
2337 for partNum,itemEl in enumerate(itemEls):
2338 mediaId = itemEl.findall('./guid')[0].text
2339 shortMediaId = mediaId.split(':')[-1]
2340 showId = mediaId.split(':')[-2].replace('.com', '')
2341 officialTitle = itemEl.findall('./title')[0].text
2342 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Per-part config XML lists the available renditions (bitrate -> RTMP src).
2344 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2345 compat_urllib_parse.urlencode({'uri': mediaId}))
2346 configXml = self._download_webpage(configUrl, epTitle,
2347 u'Downloading configuration for %s' % shortMediaId)
2349 cdoc = xml.etree.ElementTree.fromstring(configXml)
2351 for rendition in cdoc.findall('.//rendition'):
2352 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2356 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2359 if self._downloader.params.get('listformats', None):
2360 self._print_formats([i[0] for i in turls])
2363 # For now, just pick the highest bitrate
2364 format,rtmp_video_url = turls[-1]
2366 # Get the format arg from the arg stream
2367 req_format = self._downloader.params.get('format', None)
2369 # Select format if we can find one
2372 format, rtmp_video_url = f, v
# Rewrite the RTMP URL to the equivalent progressive-HTTP MP4 location.
2375 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2377 raise ExtractorError(u'Cannot transform RTMP url')
2378 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2379 video_url = base + m.group('finalid')
# Each part becomes its own entry, titled '<show>-<episode> part N'.
2381 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2386 'upload_date': officialDate,
2391 'description': officialTitle,
2393 results.append(info)
# EscapistIE: extracts videos from escapistmagazine.com by following the
# og:video player URL to its JSON-ish configuration file.
# NOTE(review): excerpted numbered listing — intervening original lines are
# elided. Also note: the re.search results below are used without None
# checks, so a page layout change would raise AttributeError — confirm
# whether guards exist in the elided lines.
2398 class EscapistIE(InfoExtractor):
2399 """Information extractor for The Escapist """
2401 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2402 IE_NAME = u'escapist'
2404 def _real_extract(self, url):
2405 mobj = re.match(self._VALID_URL, url)
2407 self._downloader.report_error(u'invalid URL: %s' % url)
2409 showName = mobj.group('showname')
2410 videoId = mobj.group('episode')
2412 self.report_extraction(showName)
2413 webPage = self._download_webpage(url, showName)
# Metadata comes from <meta> tags: description, thumbnail, player URL.
2415 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2416 description = unescapeHTML(descMatch.group(1))
2417 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2418 imgUrl = unescapeHTML(imgMatch.group(1))
2419 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2420 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a 'config=' query parameter pointing at the
# playlist configuration.
2421 configUrlMatch = re.search('config=(.*)$', playerUrl)
2422 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2424 configJSON = self._download_webpage(configUrl, showName,
2425 u'Downloading configuration',
2426 u'unable to download configuration')
2428 # Technically, it's JavaScript, not JSON
# Single-quoted JS object is coerced into parseable JSON by swapping quotes.
2429 configJSON = configJSON.replace("'", '"')
2432 config = json.loads(configJSON)
2433 except (ValueError,) as err:
2434 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2437 playlist = config['playlist']
# The second playlist entry holds the actual video URL.
2438 videoUrl = playlist[1]['url']
2443 'uploader': showName,
2444 'upload_date': None,
2447 'thumbnail': imgUrl,
2448 'description': description,
2449 'player_url': playerUrl,
# CollegeHumorIE: extracts videos from collegehumor.com via the moogaloop
# metadata XML, then resolves the Adobe HDS (f4m) manifest to build the
# final segment URL.
# NOTE(review): excerpted numbered listing — intervening original lines
# (try:/return statements, info dict opener) are elided from this view.
2454 class CollegeHumorIE(InfoExtractor):
2455 """Information extractor for collegehumor.com"""
2458 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2459 IE_NAME = u'collegehumor'
2461 def report_manifest(self, video_id):
2462 """Report information extraction."""
2463 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2465 def _real_extract(self, url):
2466 mobj = re.match(self._VALID_URL, url)
2468 self._downloader.report_error(u'invalid URL: %s' % url)
2470 video_id = mobj.group('videoid')
2475 'upload_date': None,
2478 self.report_extraction(video_id)
# First request: site metadata XML (title, description, thumbnail, manifest URL).
2479 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2481 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2482 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2483 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2486 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2488 videoNode = mdoc.findall('./video')[0]
2489 info['description'] = videoNode.findall('./description')[0].text
2490 info['title'] = videoNode.findall('./caption')[0].text
2491 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2492 manifest_url = videoNode.findall('./file')[0].text
2494 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore parameter is required for the HDS manifest request to succeed.
2497 manifest_url += '?hdcore=2.10.3'
2498 self.report_manifest(video_id)
2500 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2501 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2502 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Parse the f4m manifest (Adobe HDS namespace) for the media node id.
2505 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2507 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2508 node_id = media_node.attrib['url']
2509 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2510 except IndexError as err:
2511 self._downloader.report_error(u'Invalid manifest file')
# Final URL follows the HDS segment naming scheme ('/z.../Seg1-Frag1').
2514 url_pr = compat_urllib_parse_urlparse(manifest_url)
2515 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE: extracts the FLV URL, title and thumbnail from xvideos.com
# video pages by scraping the page source.
# NOTE(review): excerpted numbered listing — intervening original lines
# (if mobj is None guards, return statements) are elided from this view.
2522 class XVideosIE(InfoExtractor):
2523 """Information extractor for xvideos.com"""
2525 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2526 IE_NAME = u'xvideos'
2528 def _real_extract(self, url):
2529 mobj = re.match(self._VALID_URL, url)
2531 self._downloader.report_error(u'invalid URL: %s' % url)
2533 video_id = mobj.group(1)
2535 webpage = self._download_webpage(url, video_id)
2537 self.report_extraction(video_id)
# The media URL is URL-encoded inside the page's 'flv_url=' parameter.
2541 mobj = re.search(r'flv_url=(.+?)&', webpage)
2543 self._downloader.report_error(u'unable to extract video url')
2545 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text with the trailing ' - XVID...' suffix dropped.
2549 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2551 self._downloader.report_error(u'unable to extract video title')
2553 video_title = mobj.group(1)
2556 # Extract video thumbnail
# group(0): the whole matched thumbnail URL is used, not a capture group.
2557 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2559 self._downloader.report_error(u'unable to extract video thumbnail')
2561 video_thumbnail = mobj.group(0)
2567 'upload_date': None,
2568 'title': video_title,
2570 'thumbnail': video_thumbnail,
2571 'description': None,
# SoundcloudIE: extracts a single soundcloud.com track. Resolves the page
# URL to a track id via the public resolve API, then fetches the stream
# definitions to pick the 128kbps MP3 stream.
# NOTE(review): excerpted numbered listing — intervening original lines
# (if mobj is None guards, return/dict lines) are elided from this view.
2577 class SoundcloudIE(InfoExtractor):
2578 """Information extractor for soundcloud.com
2579 To access the media, the uid of the song and a stream token
2580 must be extracted from the page source and the script must make
2581 a request to media.soundcloud.com/crossdomain.xml. Then
2582 the media can be grabbed by requesting from an url composed
2583 of the stream token and uid
2586 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2587 IE_NAME = u'soundcloud'
2589 def report_resolve(self, video_id):
2590 """Report information extraction."""
2591 self.to_screen(u'%s: Resolving id' % video_id)
2593 def _real_extract(self, url):
2594 mobj = re.match(self._VALID_URL, url)
2596 self._downloader.report_error(u'invalid URL: %s' % url)
2599 # extract uploader (which is in the url)
2600 uploader = mobj.group(1)
2601 # extract simple title (uploader + slug of song title)
2602 slug_title = mobj.group(2)
2603 simple_title = uploader + u'-' + slug_title
2604 full_title = '%s/%s' % (uploader, slug_title)
2606 self.report_resolve(full_title)
# resolve.json maps the human-readable page URL to the numeric track id.
2608 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2609 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2610 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2612 info = json.loads(info_json)
2613 video_id = info['id']
2614 self.report_extraction(full_title)
# The streams endpoint returns the per-format media URLs for the track.
2616 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2617 stream_json = self._download_webpage(streams_url, full_title,
2618 u'Downloading stream definitions',
2619 u'unable to download stream definitions')
2621 streams = json.loads(stream_json)
2622 mediaURL = streams['http_mp3_128_url']
2623 upload_date = unified_strdate(info['created_at'])
2628 'uploader': info['user']['username'],
2629 'upload_date': upload_date,
2630 'title': info['title'],
2632 'description': info['description'],
# SoundcloudSetIE: extracts every track of a soundcloud.com set (playlist).
# Same resolve-then-streams flow as SoundcloudIE, iterated per track.
# NOTE(review): excerpted numbered listing — intervening original lines
# (guards, return/dict lines) are elided from this view.
2635 class SoundcloudSetIE(InfoExtractor):
2636 """Information extractor for soundcloud.com sets
2637 To access the media, the uid of the song and a stream token
2638 must be extracted from the page source and the script must make
2639 a request to media.soundcloud.com/crossdomain.xml. Then
2640 the media can be grabbed by requesting from an url composed
2641 of the stream token and uid
2644 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2645 IE_NAME = u'soundcloud:set'
2647 def report_resolve(self, video_id):
2648 """Report information extraction."""
2649 self.to_screen(u'%s: Resolving id' % video_id)
2651 def _real_extract(self, url):
2652 mobj = re.match(self._VALID_URL, url)
2654 self._downloader.report_error(u'invalid URL: %s' % url)
2657 # extract uploader (which is in the url)
2658 uploader = mobj.group(1)
2659 # extract simple title (uploader + slug of song title)
2660 slug_title = mobj.group(2)
2661 simple_title = uploader + u'-' + slug_title
2662 full_title = '%s/sets/%s' % (uploader, slug_title)
2664 self.report_resolve(full_title)
# resolve.json maps the set's page URL to its metadata (incl. track list).
2666 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2667 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2668 info_json = self._download_webpage(resolv_url, full_title)
2671 info = json.loads(info_json)
# API-level errors are reported per message before aborting.
2672 if 'errors' in info:
2673 for err in info['errors']:
2674 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2677 self.report_extraction(full_title)
# One streams request per track; each track yields its own info dict.
2678 for track in info['tracks']:
2679 video_id = track['id']
2681 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2682 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2684 self.report_extraction(video_id)
2685 streams = json.loads(stream_json)
2686 mediaURL = streams['http_mp3_128_url']
2691 'uploader': track['user']['username'],
2692 'upload_date': unified_strdate(track['created_at']),
2693 'title': track['title'],
2695 'description': track['description'],
# InfoQIE: extracts presentation videos from infoq.com. The media path is
# base64-encoded in the page's 'jsclassref' JS variable and played over RTMPE.
# NOTE(review): excerpted numbered listing — intervening original lines
# (guards, return/dict lines) are elided from this view.
2700 class InfoQIE(InfoExtractor):
2701 """Information extractor for infoq.com"""
2702 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2704 def _real_extract(self, url):
2705 mobj = re.match(self._VALID_URL, url)
2707 self._downloader.report_error(u'invalid URL: %s' % url)
# No stable numeric id in the URL, so the URL itself is used as video_id.
2710 webpage = self._download_webpage(url, video_id=url)
2711 self.report_extraction(url)
# 'jsclassref' holds the base64-encoded, URL-quoted real media path.
2714 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2716 self._downloader.report_error(u'unable to extract video url')
2718 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2719 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2722 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2724 self._downloader.report_error(u'unable to extract video title')
2726 video_title = mobj.group(1)
2728 # Extract description
# Description is optional; fall back to a placeholder string.
2729 video_description = u'No description available.'
2730 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2731 if mobj is not None:
2732 video_description = mobj.group(1)
# Derive id and extension from the media filename ('<id>.<ext>').
2734 video_filename = video_url.split('/')[-1]
2735 video_id, extension = video_filename.split('.')
2741 'upload_date': None,
2742 'title': video_title,
2743 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2745 'description': video_description,
# MixcloudIE: extracts mixes from mixcloud.com via its JSON API (marked
# broken: _WORKING = False). Picks a format/bitrate from 'audio_formats'
# and probes candidate URLs until one responds.
# NOTE(review): excerpted numbered listing — intervening original lines
# (try:/return statements, loop headers) are elided from this view.
2750 class MixcloudIE(InfoExtractor):
2751 """Information extractor for www.mixcloud.com"""
2753 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2754 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2755 IE_NAME = u'mixcloud'
2757 def report_download_json(self, file_id):
2758 """Report JSON download."""
2759 self.to_screen(u'Downloading json')
# Return the URL list for the requested format; 'best' (or an unknown
# bitrate) selects the highest available. Formats without per-bitrate
# sub-dicts raise TypeError and are used as-is.
2761 def get_urls(self, jsonData, fmt, bitrate='best'):
2762 """Get urls from 'audio_formats' section in json"""
2765 bitrate_list = jsonData[fmt]
2766 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2767 bitrate = max(bitrate_list) # select highest
2769 url_list = jsonData[fmt][bitrate]
2770 except TypeError: # we have no bitrate info.
2771 url_list = jsonData[fmt]
# Probe candidates with a HEAD-like open; first URL that doesn't error wins.
2774 def check_urls(self, url_list):
2775 """Returns 1st active url from list"""
2776 for url in url_list:
2778 compat_urllib_request.urlopen(url)
2780 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Print the format/bitrate/extension table for --list-formats.
2785 def _print_formats(self, formats):
2786 print('Available formats:')
2787 for fmt in formats.keys():
2788 for b in formats[fmt]:
2790 ext = formats[fmt][b][0]
2791 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2792 except TypeError: # we have no bitrate info
2793 ext = formats[fmt][0]
2794 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2797 def _real_extract(self, url):
2798 mobj = re.match(self._VALID_URL, url)
2800 self._downloader.report_error(u'invalid URL: %s' % url)
2802 # extract uploader & filename from url
2803 uploader = mobj.group(1).decode('utf-8')
2804 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2806 # construct API request
# API path reuses the last two URL path components: /<uploader>/<mix>.json
2807 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2808 # retrieve .json file with links to files
2809 request = compat_urllib_request.Request(file_url)
2811 self.report_download_json(file_url)
2812 jsonData = compat_urllib_request.urlopen(request).read()
2813 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2814 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2818 json_data = json.loads(jsonData)
2819 player_url = json_data['player_swf_url']
2820 formats = dict(json_data['audio_formats'])
2822 req_format = self._downloader.params.get('format', None)
2825 if self._downloader.params.get('listformats', None):
2826 self._print_formats(formats)
# Default: iterate formats until one yields a reachable URL.
2829 if req_format is None or req_format == 'best':
2830 for format_param in formats.keys():
2831 url_list = self.get_urls(formats, format_param)
2833 file_url = self.check_urls(url_list)
2834 if file_url is not None:
2837 if req_format not in formats:
2838 self._downloader.report_error(u'format is not available')
2841 url_list = self.get_urls(formats, req_format)
2842 file_url = self.check_urls(url_list)
2843 format_param = req_format
2846 'id': file_id.decode('utf-8'),
2847 'url': file_url.decode('utf-8'),
2848 'uploader': uploader.decode('utf-8'),
2849 'upload_date': None,
2850 'title': json_data['name'],
2851 'ext': file_url.split('.')[-1].decode('utf-8'),
2852 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2853 'thumbnail': json_data['thumbnail_url'],
2854 'description': json_data['description'],
2855 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE: handles three URL shapes — a specific video
# (course+video), a course page (list of video references), and the site
# root (list of course references). List results are expanded recursively
# through self.extract().
# NOTE(review): excerpted numbered listing — intervening original lines
# (try:/else:/return statements, dict/list openers) are elided from this view.
2858 class StanfordOpenClassroomIE(InfoExtractor):
2859 """Information extractor for Stanford's Open ClassRoom"""
2861 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2862 IE_NAME = u'stanfordoc'
2864 def _real_extract(self, url):
2865 mobj = re.match(self._VALID_URL, url)
2867 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: a specific video within a course.
2869 if mobj.group('course') and mobj.group('video'): # A specific video
2870 course = mobj.group('course')
2871 video = mobj.group('video')
2873 'id': course + '_' + video,
2875 'upload_date': None,
2878 self.report_extraction(info['id'])
# Per-video XML holds the title and the media file name.
2879 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2880 xmlUrl = baseUrl + video + '.xml'
2882 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2883 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2884 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2886 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2888 info['title'] = mdoc.findall('./title')[0].text
2889 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2891 self._downloader.report_error(u'Invalid metadata XML file')
2893 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page — collect VideoPage references and recurse.
2895 elif mobj.group('course'): # A course page
2896 course = mobj.group('course')
2901 'upload_date': None,
2904 coursepage = self._download_webpage(url, info['id'],
2905 note='Downloading course info page',
2906 errnote='Unable to download course info page')
2908 m = re.search('<h1>([^<]+)</h1>', coursepage)
2910 info['title'] = unescapeHTML(m.group(1))
2912 info['title'] = info['id']
2914 m = re.search('<description>([^<]+)</description>', coursepage)
2916 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps first occurrence order while dropping duplicate links.
2918 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2921 'type': 'reference',
2922 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2926 for entry in info['list']:
2927 assert entry['type'] == 'reference'
2928 results += self.extract(entry['url'])
# Case 3: the site root — collect CoursePage references and recurse.
2932 'id': 'Stanford OpenClassroom',
2935 'upload_date': None,
2938 self.report_download_webpage(info['id'])
2939 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2941 rootpage = compat_urllib_request.urlopen(rootURL).read()
2942 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2943 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
2946 info['title'] = info['id']
2948 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2951 'type': 'reference',
2952 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2957 for entry in info['list']:
2958 assert entry['type'] == 'reference'
2959 results += self.extract(entry['url'])
# MTVIE: extracts music videos from mtv.com. Reads song/performer/URI/id
# from the page's <meta> tags, fetches the mediaGen XML, and picks the
# last listed rendition (highest quality).
# NOTE(review): excerpted numbered listing — intervening original lines
# (guards, try:/return statements, dict openers) are elided from this view.
2962 class MTVIE(InfoExtractor):
2963 """Information extractor for MTV.com"""
2965 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2968 def _real_extract(self, url):
2969 mobj = re.match(self._VALID_URL, url)
2971 self._downloader.report_error(u'invalid URL: %s' % url)
# _VALID_URL accepts scheme-less URLs; normalize before downloading.
2973 if not mobj.group('proto'):
2974 url = 'http://' + url
2975 video_id = mobj.group('videoid')
2977 webpage = self._download_webpage(url, video_id)
# Page metadata: song name, performer, media URI and playlist/content id.
2979 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2981 self._downloader.report_error(u'unable to extract song name')
2983 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2984 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2986 self._downloader.report_error(u'unable to extract performer')
2988 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2989 video_title = performer + ' - ' + song_name
2991 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2993 self._downloader.report_error(u'unable to mtvn_uri')
2995 mtvn_uri = mobj.group(1)
2997 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2999 self._downloader.report_error(u'unable to extract content id')
3001 content_id = mobj.group(1)
# mediaGen XML lists the available renditions for this URI/content id.
3003 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3004 self.report_extraction(video_id)
3005 request = compat_urllib_request.Request(videogen_url)
3007 metadataXml = compat_urllib_request.urlopen(request).read()
3008 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3009 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3012 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3013 renditions = mdoc.findall('.//rendition')
3015 # For now, always pick the highest quality.
3016 rendition = renditions[-1]
# Format string: '<ext>-<width>x<height>_<bitrate>' from rendition attrs.
3019 _,_,ext = rendition.attrib['type'].partition('/')
3020 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3021 video_url = rendition.find('./src').text
3023 self._downloader.report_error('Invalid rendition field.')
3029 'uploader': performer,
3030 'upload_date': None,
3031 'title': video_title,
# YoukuIE: extracts videos from v.youku.com. Fetches the getPlayList JSON,
# descrambles the segment file ids with a seeded keystream, and emits one
# info dict per video segment.
# NOTE(review): excerpted numbered listing — intervening original lines
# (the _gen_sid def line, try:/return statements, loop/if headers) are
# elided from this view.
3039 class YoukuIE(InfoExtractor):
3040 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two random components (def line elided).
3043 nowTime = int(time.time() * 1000)
3044 random1 = random.randint(1000,1998)
3045 random2 = random.randint(1000,9999)
3047 return "%d%d%d" %(nowTime,random1,random2)
# Build the seeded permutation of the character alphabet used to decode
# the scrambled file id. The seed drives a deterministic LCG-style shuffle.
3049 def _get_file_ID_mix_string(self, seed):
3051 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3053 for i in range(len(source)):
3054 seed = (seed * 211 + 30031 ) % 65536
3055 index = math.floor(seed / 65536 * len(source) )
3056 mixed.append(source[int(index)])
3057 source.remove(source[int(index)])
3058 #return ''.join(mixed)
# Decode a '*'-separated scrambled id: each number indexes into the mixed
# alphabet produced above.
3061 def _get_file_id(self, fileId, seed):
3062 mixed = self._get_file_ID_mix_string(seed)
3063 ids = fileId.split('*')
3067 realId.append(mixed[int(ch)])
3068 return ''.join(realId)
3070 def _real_extract(self, url):
3071 mobj = re.match(self._VALID_URL, url)
3073 self._downloader.report_error(u'invalid URL: %s' % url)
3075 video_id = mobj.group('ID')
3077 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3079 jsondata = self._download_webpage(info_url, video_id)
3081 self.report_extraction(video_id)
3083 config = json.loads(jsondata)
3085 video_title = config['data'][0]['title']
3086 seed = config['data'][0]['seed']
# Format selection: 'best'/'worst' map onto the stream ids the server offers.
3088 format = self._downloader.params.get('format', None)
3089 supported_format = list(config['data'][0]['streamfileids'].keys())
3091 if format is None or format == 'best':
3092 if 'hd2' in supported_format:
3097 elif format == 'worst':
# 'k' is the per-segment access key required by getFlvPath.
3105 fileid = config['data'][0]['streamfileids'][format]
3106 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3107 except (UnicodeDecodeError, ValueError, KeyError):
3108 self._downloader.report_error(u'unable to extract info section')
3112 sid = self._gen_sid()
3113 fileid = self._get_file_id(fileid, seed)
3115 #column 8,9 of fileid represent the segment number
3116 #fileid[7:9] should be changed
# Per segment: splice the hex segment number into the decoded file id and
# build the getFlvPath download URL with the matching key.
3117 for index, key in enumerate(keys):
3119 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3120 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3123 'id': '%s_part%02d' % (video_id, index),
3124 'url': download_url,
3126 'upload_date': None,
3127 'title': video_title,
3130 files_info.append(info)
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (e.g. the `if result is None:` guards) are missing.
# Scrapes xnxx.com watch pages with three regexes: flv URL, page title, thumbnail.
3135 class XNXXIE(InfoExtractor):
3136 """Information extractor for xnxx.com"""
3138 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3140 VIDEO_URL_RE = r'flv_url=(.*?)&'
3141 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3142 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3144 def _real_extract(self, url):
3145 mobj = re.match(self._VALID_URL, url)
3147 self._downloader.report_error(u'invalid URL: %s' % url)
3149 video_id = mobj.group(1)
3151 # Get webpage content
3152 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the page, hence the unquote below.
3154 result = re.search(self.VIDEO_URL_RE, webpage)
3156 self._downloader.report_error(u'unable to extract video url')
3158 video_url = compat_urllib_parse.unquote(result.group(1))
3160 result = re.search(self.VIDEO_TITLE_RE, webpage)
3162 self._downloader.report_error(u'unable to extract video title')
3164 video_title = result.group(1)
3166 result = re.search(self.VIDEO_THUMB_RE, webpage)
3168 self._downloader.report_error(u'unable to extract video thumbnail')
3170 video_thumbnail = result.group(1)
3176 'upload_date': None,
3177 'title': video_title,
3179 'thumbnail': video_thumbnail,
3180 'description': None,
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (error guards, fallback branches) are missing.
# Extracts videos embedded in Google+ posts: scrapes the post page for metadata,
# follows the photo-viewer page, then picks the highest-resolution stream link.
3184 class GooglePlusIE(InfoExtractor):
3185 """Information extractor for plus.google.com."""
3187 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3188 IE_NAME = u'plus.google'
3190 def report_extract_entry(self, url):
3191 """Report downloading extry"""
3192 self.to_screen(u'Downloading entry: %s' % url)
3194 def report_date(self, upload_date):
3195 """Report downloading extry"""
3196 self.to_screen(u'Entry date: %s' % upload_date)
3198 def report_uploader(self, uploader):
3199 """Report downloading extry"""
3200 self.to_screen(u'Uploader: %s' % uploader)
3202 def report_title(self, video_title):
3203 """Report downloading extry"""
3204 self.to_screen(u'Title: %s' % video_title)
3206 def report_extract_vid_page(self, video_page):
3207 """Report information extraction."""
3208 self.to_screen(u'Extracting video page: %s' % video_page)
3210 def _real_extract(self, url):
3211 # Extract id from URL
3212 mobj = re.match(self._VALID_URL, url)
3214 self._downloader.report_error(u'Invalid URL: %s' % url)
3217 post_url = mobj.group(0)
3218 video_id = mobj.group(1)
3220 video_extension = 'flv'
3222 # Step 1, Retrieve post webpage to extract further information
3223 self.report_extract_entry(post_url)
3224 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
# Timestamp scraped from the page is reformatted YYYY-MM-DD -> YYYYMMDD to match
# the upload_date convention documented on InfoExtractor.
3226 # Extract update date
3228 pattern = 'title="Timestamp">(.*?)</a>'
3229 mobj = re.search(pattern, webpage)
3231 upload_date = mobj.group(1)
3232 # Convert timestring to a format suitable for filename
3233 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3234 upload_date = upload_date.strftime('%Y%m%d')
3235 self.report_date(upload_date)
3239 pattern = r'rel\="author".*?>(.*?)</a>'
3240 mobj = re.search(pattern, webpage)
3242 uploader = mobj.group(1)
3243 self.report_uploader(uploader)
3246 # Get the first line for title
3248 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3249 mobj = re.search(pattern, webpage)
3251 video_title = mobj.group(1)
3252 self.report_title(video_title)
3254 # Step 2, Stimulate clicking the image box to launch video
3255 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3256 mobj = re.search(pattern, webpage)
3258 self._downloader.report_error(u'unable to extract video page URL')
3260 video_page = mobj.group(1)
3261 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3262 self.report_extract_vid_page(video_page)
3265 # Extract video links on video page
3266 """Extract video links of all sizes"""
3267 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3268 mobj = re.findall(pattern, webpage)
3270 self._downloader.report_error(u'unable to extract video links')
# Tuples sort by resolution first, so the last element is the largest.
3272 # Sort in resolution
3273 links = sorted(mobj)
3275 # Choose the lowest of the sort, i.e. highest resolution
3276 video_url = links[-1]
3277 # Only get the url. The resolution part in the tuple has no use anymore
3278 video_url = video_url[-1]
# Python 2 str has .decode; on Python 3 the AttributeError branch re-encodes to
# bytes and decodes the \uXXXX escapes instead.
3279 # Treat escaped \u0026 style hex
3281 video_url = video_url.decode("unicode_escape")
3282 except AttributeError: # Python 3
3283 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3289 'uploader': uploader,
3290 'upload_date': upload_date,
3291 'title': video_title,
3292 'ext': video_extension,
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# Builds the CDN mp4 URL for nba.com videos directly from the URL path and
# scrapes title/date/description from og/meta tags.
3295 class NBAIE(InfoExtractor):
3296 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3299 def _real_extract(self, url):
3300 mobj = re.match(self._VALID_URL, url)
3302 self._downloader.report_error(u'invalid URL: %s' % url)
3305 video_id = mobj.group(1)
3306 if video_id.endswith('/index.html'):
3307 video_id = video_id[:-len('/index.html')]
3309 webpage = self._download_webpage(url, video_id)
3311 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, HTML-unescaped, else `default`.
3312 def _findProp(rexp, default=None):
3313 m = re.search(rexp, webpage)
3315 return unescapeHTML(m.group(1))
3319 shortened_video_id = video_id.rpartition('/')[2]
3320 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3322 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for the documented
# 'upload_date' field — TODO confirm against FileDownloader before changing.
3326 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3327 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# Extractor for justin.tv / twitch.tv using the api.justin.tv JSON endpoints;
# channel archives are fetched in pages of _JUSTIN_PAGE_LIMIT entries.
3331 class JustinTVIE(InfoExtractor):
3332 """Information extractor for justin.tv and twitch.tv"""
3333 # TODO: One broadcast may be split into multiple videos. The key
3334 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3335 # starts at 1 and increases. Can we treat all parts as one video?
3337 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3338 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3339 _JUSTIN_PAGE_LIMIT = 100
3340 IE_NAME = u'justin.tv'
3342 def report_download_page(self, channel, offset):
3343 """Report attempt to download a single page of videos."""
3344 self.to_screen(u'%s: Downloading video information from %d to %d' %
3345 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3347 # Return count of items, list of *valid* items
3348 def _parse_page(self, url, video_id):
3349 webpage = self._download_webpage(url, video_id,
3350 u'Downloading video info JSON',
3351 u'unable to download video info JSON')
# A non-list response is the API's error envelope (a dict with an 'error' key).
3353 response = json.loads(webpage)
3354 if type(response) != list:
3355 error_text = response.get('error', 'unknown error')
3356 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3359 for clip in response:
3360 video_url = clip['video_file_url']
# start_time is ISO-ish 'YYYY-MM-DD...'; stripping dashes yields YYYYMMDD.
3362 video_extension = os.path.splitext(video_url)[1][1:]
3363 video_date = re.sub('-', '', clip['start_time'][:10])
3364 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3365 video_id = clip['id']
3366 video_title = clip.get('title', video_id)
3370 'title': video_title,
3371 'uploader': clip.get('channel_name', video_uploader_id),
3372 'uploader_id': video_uploader_id,
3373 'upload_date': video_date,
3374 'ext': video_extension,
3376 return (len(response), info)
3378 def _real_extract(self, url):
3379 mobj = re.match(self._VALID_URL, url)
3381 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 2 present -> a single broadcast; otherwise the whole channel archive.
3384 api = 'http://api.justin.tv'
3385 video_id = mobj.group(mobj.lastindex)
3387 if mobj.lastindex == 1:
3389 api += '/channel/archives/%s.json'
3391 api += '/broadcast/by_archive/%s.json'
3392 api = api % (video_id,)
3394 self.report_extraction(video_id)
# Page until a short page (fewer than `limit` items) signals the end; the
# `paged` flag's assignment is on a missing line — presumably set in the
# channel branch above, TODO confirm.
3398 limit = self._JUSTIN_PAGE_LIMIT
3401 self.report_download_page(video_id, offset)
3402 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3403 page_count, page_info = self._parse_page(page_url, video_id)
3404 info.extend(page_info)
3405 if not paged or page_count != limit:
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (guards, fallbacks, the returned dict) are missing.
# Scrapes funnyordie.com video pages: <source> tag for the URL, h1/<title> for
# the title, og:description for the description.
3410 class FunnyOrDieIE(InfoExtractor):
3411 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3413 def _real_extract(self, url):
3414 mobj = re.match(self._VALID_URL, url)
3416 self._downloader.report_error(u'invalid URL: %s' % url)
3419 video_id = mobj.group('id')
3420 webpage = self._download_webpage(url, video_id)
3422 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3424 self._downloader.report_error(u'unable to find video information')
3425 video_url = unescapeHTML(m.group('url'))
# Falls back from the player h1 to the document <title> when the first regex misses.
3427 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3429 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3431 self._downloader.report_error(u'Cannot find video title')
3432 title = clean_html(m.group('title'))
3434 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3436 desc = unescapeHTML(m.group('desc'))
3445 'description': desc,
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (including part of _VALID_URL) are missing.
# Extracts all trailer videos from a Steam store page as a playlist, bypassing
# the age gate via a fixed-birthdate agecheck URL.
3449 class SteamIE(InfoExtractor):
3450 _VALID_URL = r"""http://store\.steampowered\.com/
3452 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3454 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3458 def suitable(cls, url):
3459 """Receives a URL and returns True if suitable for this IE."""
3460 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3462 def _real_extract(self, url):
3463 m = re.match(self._VALID_URL, url, re.VERBOSE)
# NOTE(review): group 'gameID' is not in the visible part of _VALID_URL —
# presumably defined on a missing pattern line; TODO confirm.
3464 gameID = m.group('gameID')
3465 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3466 self.report_age_confirmation()
3467 webpage = self._download_webpage(videourl, gameID)
3468 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
# Three parallel finditer streams (urls, titles, thumbnails) are zipped per video;
# this silently assumes they appear in the same order and count.
3470 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3471 mweb = re.finditer(urlRE, webpage)
3472 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3473 titles = re.finditer(namesRE, webpage)
3474 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3475 thumbs = re.finditer(thumbsRE, webpage)
3477 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3478 video_id = vid.group('videoID')
3479 title = vtitle.group('videoName')
3480 video_url = vid.group('videoURL')
3481 video_thumb = thumb.group('thumbnail')
3483 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3488 'title': unescapeHTML(title),
3489 'thumbnail': video_thumb
3492 return [self.playlist_result(videos, gameID, game_title)]
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (including the returned info dict's start) are missing.
# Recorded ustream.tv videos: the flv lives at a predictable tcdn URL keyed by id.
3494 class UstreamIE(InfoExtractor):
3495 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3496 IE_NAME = u'ustream'
3498 def _real_extract(self, url):
3499 m = re.match(self._VALID_URL, url)
3500 video_id = m.group('videoID')
3501 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3502 webpage = self._download_webpage(url, video_id)
# NOTE(review): both re.search results are used unguarded — a miss would raise
# AttributeError; any guard would be on a missing line, TODO confirm.
3503 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3504 title = m.group('title')
3505 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3506 uploader = m.group('uploader')
3512 'uploader': uploader
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# worldstarhiphop.com / worldstarcandy.com: direct mp4/flv URL scraped from the
# page source; title falls back to a timestamped default, then to candy markup.
3516 class WorldStarHipHopIE(InfoExtractor):
3517 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3518 IE_NAME = u'WorldStarHipHop'
3520 def _real_extract(self, url):
3521 _src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""
3523 m = re.match(self._VALID_URL, url)
3524 video_id = m.group('id')
3526 webpage_src = self._download_webpage(url, video_id)
3528 mobj = re.search(_src_url, webpage_src)
3530 if mobj is not None:
3531 video_url = mobj.group()
# Extension choice: the flv branch (missing lines 3533-3536) presumably mirrors
# this mp4 branch — TODO confirm.
3532 if 'mp4' in video_url:
3537 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3540 _title = r"""<title>(.*)</title>"""
3542 mobj = re.search(_title, webpage_src)
3544 if mobj is not None:
3545 title = mobj.group(1)
# NOTE(review): 'World Start Hip Hop' reads like a typo for "World Star", but it
# is a runtime string — flagged only, not changed here.
3547 title = 'World Start Hip Hop - %s' % time.ctime()
3549 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3550 mobj = re.search(_thumbnail, webpage_src)
3552 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3553 if mobj is not None:
3554 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span instead of <title>.
3556 _title = r"""candytitles.*>(.*)</span>"""
3557 mobj = re.search(_title, webpage_src)
3558 if mobj is not None:
3559 title = mobj.group(1)
3566 'thumbnail' : thumbnail,
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# rbmaradio.com shows: metadata is embedded as JSON in a window.gon script tag;
# the audio URL is the akamai_url plus a fixed 256kbps cbr parameter.
3571 class RBMARadioIE(InfoExtractor):
3572 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3574 def _real_extract(self, url):
3575 m = re.match(self._VALID_URL, url)
3576 video_id = m.group('videoID')
3578 webpage = self._download_webpage(url, video_id)
3579 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3581 raise ExtractorError(u'Cannot find metadata')
3582 json_data = m.group(1)
3585 data = json.loads(json_data)
3586 except ValueError as e:
3587 raise ExtractorError(u'Invalid JSON: ' + str(e))
3589 video_url = data['akamai_url'] + '&cbr=256'
# Extension derived from the URL path's last dot-suffix.
3590 url_parts = compat_urllib_parse_urlparse(video_url)
3591 video_ext = url_parts.path.rpartition('.')[2]
# Optional fields use .get() chains so missing JSON keys become None.
3596 'title': data['title'],
3597 'description': data.get('teaser_text'),
3598 'location': data.get('country_of_origin'),
3599 'uploader': data.get('host', {}).get('name'),
3600 'uploader_id': data.get('host', {}).get('slug'),
3601 'thumbnail': data.get('image', {}).get('large_url_2x'),
3602 'duration': data.get('duration'),
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# youporn.com: sets the age_verified cookie, scrapes metadata, then builds one
# format entry per download link and applies --format selection locally.
3607 class YouPornIE(InfoExtractor):
3608 """Information extractor for youporn.com."""
3609 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3611 def _print_formats(self, formats):
3612 """Print all available formats"""
3613 print(u'Available formats:')
3614 print(u'ext\t\tformat')
3615 print(u'---------------------------------')
3616 for format in formats:
3617 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the first format dict whose 'format' matches req_format (None if absent
# — the surrounding lines are missing from this excerpt).
3619 def _specific(self, req_format, formats):
3621 if(x["format"]==req_format):
3625 def _real_extract(self, url):
3626 mobj = re.match(self._VALID_URL, url)
3628 self._downloader.report_error(u'invalid URL: %s' % url)
3631 video_id = mobj.group('videoid')
# The age gate is bypassed with a cookie rather than a form post.
3633 req = compat_urllib_request.Request(url)
3634 req.add_header('Cookie', 'age_verified=1')
3635 webpage = self._download_webpage(req, video_id)
3637 # Get the video title
3638 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3640 raise ExtractorError(u'Unable to extract video title')
3641 video_title = result.group('title').strip()
3643 # Get the video date
3644 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3646 self._downloader.report_warning(u'unable to extract video date')
3649 upload_date = unified_strdate(result.group('date').strip())
3651 # Get the video uploader
3652 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3654 self._downloader.report_warning(u'unable to extract uploader')
3655 video_uploader = None
3657 video_uploader = result.group('uploader').strip()
3658 video_uploader = clean_html( video_uploader )
3660 # Get all of the formats available
3661 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3662 result = re.search(DOWNLOAD_LIST_RE, webpage)
3664 raise ExtractorError(u'Unable to extract download list')
3665 download_list_html = result.group('download_list').strip()
3667 # Get all of the links from the page
3668 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3669 links = re.findall(LINK_RE, download_list_html)
3670 if(len(links) == 0):
3671 raise ExtractorError(u'ERROR: no known formats available for video')
3673 self.to_screen(u'Links found: %d' % len(links))
# Per-link: resolution/bitrate are encoded in the 5th path component like
# "480p_370k_...". `size`/`bitrate` used at 3689 are presumably unpacked from
# `format` on missing lines — TODO confirm.
3678 # A link looks like this:
3679 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3680 # A path looks like this:
3681 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3682 video_url = unescapeHTML( link )
3683 path = compat_urllib_parse_urlparse( video_url ).path
3684 extension = os.path.splitext( path )[1][1:]
3685 format = path.split('/')[4].split('_')[:2]
3688 format = "-".join( format )
3689 title = u'%s-%s-%s' % (video_title, size, bitrate)
3694 'uploader': video_uploader,
3695 'upload_date': upload_date,
3700 'description': None,
3704 if self._downloader.params.get('listformats', None):
3705 self._print_formats(formats)
# Local --format handling: best = first entry, worst = last, all = everything,
# otherwise exact match via _specific().
3708 req_format = self._downloader.params.get('format', None)
3709 self.to_screen(u'Format: %s' % req_format)
3711 if req_format is None or req_format == 'best':
3713 elif req_format == 'worst':
3714 return [formats[-1]]
3715 elif req_format in ('-1', 'all'):
3718 format = self._specific( req_format, formats )
3720 self._downloader.report_error(u'requested format not available')
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# pornotube.com: title comes from the URL itself; flv URL and upload date are
# scraped from the page.
3727 class PornotubeIE(InfoExtractor):
3728 """Information extractor for pornotube.com."""
3729 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3730 def _real_extract(self, url):
3731 mobj = re.match(self._VALID_URL, url)
3733 self._downloader.report_error(u'invalid URL: %s' % url)
3736 video_id = mobj.group('videoid')
3737 video_title = mobj.group('title')
3739 # Get webpage content
3740 webpage = self._download_webpage(url, video_id)
3743 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3744 result = re.search(VIDEO_URL_RE, webpage)
3746 self._downloader.report_error(u'unable to extract video url')
3748 video_url = compat_urllib_parse.unquote(result.group('url'))
3750 #Get the uploaded date
3751 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3752 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says "video title" but the failed extraction
# here is the upload date — looks like a copy-paste mistake (runtime string,
# flagged only, not changed in a doc-only edit).
3754 self._downloader.report_error(u'unable to extract video title')
3756 upload_date = unified_strdate(result.group('date'))
3758 info = {'id': video_id,
3761 'upload_date': upload_date,
3762 'title': video_title,
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# youjizz.com: title from the watch page, then the real stream URL from the
# separate embed page referenced within it.
3768 class YouJizzIE(InfoExtractor):
3769 """Information extractor for youjizz.com."""
3770 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3772 def _real_extract(self, url):
3773 mobj = re.match(self._VALID_URL, url)
3775 self._downloader.report_error(u'invalid URL: %s' % url)
3778 video_id = mobj.group('videoid')
3780 # Get webpage content
3781 webpage = self._download_webpage(url, video_id)
3783 # Get the video title
3784 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3786 raise ExtractorError(u'ERROR: unable to extract video title')
3787 video_title = result.group('title').strip()
3789 # Get the embed page
3790 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3792 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is rebound to the embed page's numeric id from here on.
3794 embed_page_url = result.group(0).strip()
3795 video_id = result.group('videoid')
3797 webpage = self._download_webpage(embed_page_url, video_id)
3800 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3802 raise ExtractorError(u'ERROR: unable to extract video url')
3803 video_url = result.group('source')
3805 info = {'id': video_id,
3807 'title': video_title,
# player_url is the embed page so rtmpdump-style players can be resolved.
3810 'player_url': embed_page_url}
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# 8tracks.com mixes: reads the PAGE.mix JSON from the page, then walks the
# play/next API one track at a time until at_last_track.
3814 class EightTracksIE(InfoExtractor):
3816 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3818 def _real_extract(self, url):
3819 mobj = re.match(self._VALID_URL, url)
3821 raise ExtractorError(u'Invalid URL: %s' % url)
3822 playlist_id = mobj.group('id')
3824 webpage = self._download_webpage(url, playlist_id)
3826 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3828 raise ExtractorError(u'Cannot find trax information')
3829 json_like = m.group(1)
3830 data = json.loads(json_like)
# A random session id keeps the server-side play cursor distinct per run.
3832 session = str(random.randint(0, 1000000000))
# NOTE(review): `mix_id` (used below) is assigned on a missing line —
# presumably data['id']; TODO confirm.
3834 track_count = data['tracks_count']
3835 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3836 next_url = first_url
3838 for i in itertools.count():
3839 api_json = self._download_webpage(next_url, playlist_id,
3840 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3841 errnote=u'Failed to download song information')
3842 api_data = json.loads(api_json)
3843 track_data = api_data[u'set']['track']
3845 'id': track_data['id'],
3846 'url': track_data['track_file_stream_url'],
3847 'title': track_data['performer'] + u' - ' + track_data['name'],
3848 'raw_title': track_data['name'],
3849 'uploader_id': data['user']['login'],
3853 if api_data['set']['at_last_track']:
3855 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (including the returned info dict's start) are missing.
# keek.com: video and thumbnail URLs are predictable CDN paths keyed by id;
# title/uploader come from og:title and the user-bio block.
3858 class KeekIE(InfoExtractor):
3859 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3862 def _real_extract(self, url):
3863 m = re.match(self._VALID_URL, url)
3864 video_id = m.group('videoID')
3865 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3866 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3867 webpage = self._download_webpage(url, video_id)
# NOTE(review): both searches are used unguarded — a regex miss would raise
# AttributeError; any guard would be on a missing line, TODO confirm.
3868 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3869 title = unescapeHTML(m.group('title'))
3870 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3871 uploader = clean_html(m.group('uploader'))
3877 'thumbnail': thumbnail,
3878 'uploader': uploader
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (including parts of the verbose regexes) are missing.
# ted.com: handles both single talks and playlists; talk downloads come from a
# fixed download.ted.com URL built from the page's mediaSlug.
3882 class TEDIE(InfoExtractor):
3883 _VALID_URL=r'''http://www\.ted\.com/
3885 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3887 ((?P<type_talk>talks)) # We have a simple talk
3889 (/lang/(.*?))? # The url may contain the language
3890 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3894 def suitable(cls, url):
3895 """Receives a URL and returns True if suitable for this IE."""
3896 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3898 def _real_extract(self, url):
3899 m=re.match(self._VALID_URL, url, re.VERBOSE)
3900 if m.group('type_talk'):
3901 return [self._talk_info(url)]
3903 playlist_id=m.group('playlist_id')
3904 name=m.group('name')
3905 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3906 return [self._playlist_videos_info(url,name,playlist_id)]
3908 def _talk_video_link(self,mediaSlug):
3909 '''Returns the video link for that mediaSlug'''
3910 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3912 def _playlist_videos_info(self,url,name,playlist_id=0):
3913 '''Returns the videos of the playlist'''
3915 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3916 ([.\s]*?)data-playlist_item_id="(\d+)"
3917 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3919 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3920 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3921 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3922 m_names=re.finditer(video_name_RE,webpage)
3924 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3925 m_playlist = re.search(playlist_RE, webpage)
3926 playlist_title = m_playlist.group('playlist_title')
# Each talk becomes a url_result delegated back to this same TED extractor.
3928 playlist_entries = []
3929 for m_video, m_name in zip(m_videos,m_names):
3930 video_id=m_video.group('video_id')
3931 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3932 playlist_entries.append(self.url_result(talk_url, 'TED'))
3933 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3935 def _talk_info(self, url, video_id=0):
3936 """Return the video for the talk in the url"""
3937 m=re.match(self._VALID_URL, url,re.VERBOSE)
3938 videoName=m.group('name')
3939 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
# If the url includes the language we get the title translated
3940 # If the url includes the language we get the title translated
3941 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3942 title=re.search(title_RE, webpage).group('title')
3943 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3944 "id":(?P<videoID>[\d]+).*?
3945 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3946 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3947 thumb_match=re.search(thumb_RE,webpage)
3948 info_match=re.search(info_RE,webpage,re.VERBOSE)
3949 video_id=info_match.group('videoID')
3950 mediaSlug=info_match.group('mediaSlug')
3951 video_url=self._talk_video_link(mediaSlug)
3957 'thumbnail': thumb_match.group('thumbnail')
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# myspass.de: the video id is the last (or second-to-last, given a trailing
# slash) path element; all metadata comes from an XML metadata endpoint.
3961 class MySpassIE(InfoExtractor):
3962 _VALID_URL = r'http://www.myspass.de/.*'
3964 def _real_extract(self, url):
3965 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3967 # video id is the last path element of the URL
3968 # usually there is a trailing slash, so also try the second but last
3969 url_path = compat_urllib_parse_urlparse(url).path
3970 url_parent_path, video_id = os.path.split(url_path)
3972 _, video_id = os.path.split(url_parent_path)
3975 metadata_url = META_DATA_URL_TEMPLATE % video_id
3976 metadata_text = self._download_webpage(metadata_url, video_id)
3977 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3979 # extract values from metadata
3980 url_flv_el = metadata.find('url_flv')
3981 if url_flv_el is None:
3982 self._downloader.report_error(u'unable to extract download url')
3984 video_url = url_flv_el.text
3985 extension = os.path.splitext(video_url)[1][1:]
3986 title_el = metadata.find('title')
3987 if title_el is None:
3988 self._downloader.report_error(u'unable to extract title')
3990 title = title_el.text
# format/description/thumbnail are optional XML elements; their fallback
# branches (e.g. original 3993-3994, 3999-4000) are on missing lines.
3991 format_id_el = metadata.find('format_id')
3992 if format_id_el is None:
3995 format = format_id_el.text
3996 description_el = metadata.find('description')
3997 if description_el is not None:
3998 description = description_el.text
4001 imagePreview_el = metadata.find('imagePreview')
4002 if imagePreview_el is not None:
4003 thumbnail = imagePreview_el.text
4012 'thumbnail': thumbnail,
4013 'description': description
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (including part of the returned info dict) are missing.
# spiegel.de videos: title from the page, stream details from a per-video XML
# whose last entry is taken as the best-quality variant.
4017 class SpiegelIE(InfoExtractor):
4018 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4020 def _real_extract(self, url):
4021 m = re.match(self._VALID_URL, url)
4022 video_id = m.group('videoID')
4024 webpage = self._download_webpage(url, video_id)
4025 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4027 raise ExtractorError(u'Cannot find title')
4028 video_title = unescapeHTML(m.group(1))
4030 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4031 xml_code = self._download_webpage(xml_url, video_id,
4032 note=u'Downloading XML', errnote=u'Failed to download XML')
# idoc[-1]: the XML lists variants in ascending quality, so the last child is
# used — presumably the highest quality, TODO confirm against the feed.
4034 idoc = xml.etree.ElementTree.fromstring(xml_code)
4035 last_type = idoc[-1]
4036 filename = last_type.findall('./filename')[0].text
4037 duration = float(last_type.findall('./duration')[0].text)
4039 video_url = 'http://video2.spiegel.de/flash/' + filename
4040 video_ext = filename.rpartition('.')[2]
4045 'title': video_title,
4046 'duration': duration,
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (guards, fallbacks, the returned dict's start) are missing.
# liveleak.com: direct file URL from the player config; title/description from
# og: meta tags, uploader from the "By:" byline.
4050 class LiveLeakIE(InfoExtractor):
4052 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4053 IE_NAME = u'liveleak'
4055 def _real_extract(self, url):
4056 mobj = re.match(self._VALID_URL, url)
4058 self._downloader.report_error(u'invalid URL: %s' % url)
4061 video_id = mobj.group('video_id')
4063 webpage = self._download_webpage(url, video_id)
4065 m = re.search(r'file: "(.*?)",', webpage)
4067 self._downloader.report_error(u'unable to find video url')
4069 video_url = m.group(1)
4071 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4073 self._downloader.report_error(u'Cannot find video title')
# Site branding prefix is stripped from the og:title.
4074 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4076 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4078 desc = unescapeHTML(m.group('desc'))
4082 m = re.search(r'By:.*?(\w+)</a>', webpage)
4084 uploader = clean_html(m.group(1))
4093 'description': desc,
4094 'uploader': uploader
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines are missing.
# ARD Mediathek: collects mediaCollection.addMediaStream(...) entries from the
# page and picks the default media type at the highest quality; the result is
# either an RTMP stream or a plain HTTP mp4.
4099 class ARDIE(InfoExtractor):
4100 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4101 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4102 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4104 def _real_extract(self, url):
4105 # determine video id from url
4106 m = re.match(self._VALID_URL, url)
# Prefer the explicit documentId= query parameter over the path segment.
4108 numid = re.search(r'documentId=([0-9]+)', url)
4110 video_id = numid.group(1)
4112 video_id = m.group('video_id')
4114 # determine title and media streams from webpage
4115 html = self._download_webpage(url, video_id)
4116 title = re.search(self._TITLE, html).group('title')
4117 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# NOTE(review): the condition guarding this branch (original 4118) is missing;
# as visible, `assert '"fsk"' in html` before an age-restriction error reads
# inverted — TODO confirm against the full source.
4119 assert '"fsk"' in html
4120 self._downloader.report_error(u'this video is only available after 8:00 pm')
4123 # choose default media type and highest quality for now
4124 stream = max([s for s in streams if int(s["media_type"]) == 0],
4125 key=lambda s: int(s["quality"]))
4127 # there's two possibilities: RTMP stream or HTTP download
4128 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4129 if stream['rtmp_url']:
4130 self.to_screen(u'RTMP download detected')
4131 assert stream['video_url'].startswith('mp4:')
4132 info["url"] = stream["rtmp_url"]
4133 info["play_path"] = stream['video_url']
4135 assert stream["video_url"].endswith('.mp4')
4136 info["url"] = stream["video_url"]
# NOTE(review): non-contiguous excerpt — gaps in the original line numbers mean
# source lines (including the tail of the returned dict) are missing.
# Tumblr posts: rebuilds the canonical post URL, then scrapes the escaped
# (\x22-quoted) embedded player markup for the video file, extension and poster.
4139 class TumblrIE(InfoExtractor):
4140 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4142 def _real_extract(self, url):
4143 m_url = re.match(self._VALID_URL, url)
4144 video_id = m_url.group('id')
4145 blog = m_url.group('blog_name')
4147 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4148 webpage = self._download_webpage(url, video_id)
# The player HTML is JS-escaped in the page, hence the \\x22 quoting in the regex.
4150 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4151 video = re.search(re_video, webpage)
# NOTE(review): "No video founded" is a grammatical typo ("found"), but it is a
# runtime string — flagged only, not changed in a doc-only edit.
4153 self.to_screen("No video founded")
4155 video_url = video.group('video_url')
4156 ext = video.group('ext')
4158 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4159 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4161 # The only place where you can get a title, it's not complete,
4162 # but searching in other places doesn't work for all videos
4163 re_title = r'<title>(?P<title>.*?)</title>'
4164 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4166 return [{'id': video_id,
class BandcampIE(InfoExtractor):
    """Information extractor for free track downloads on Bandcamp."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the final download URL of a free Bandcamp track.

        Returns a single-element list with the track info dictionary;
        reports an error and returns None when the track has no free
        download.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page.
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # Message typo fixed ("founded" -> "found").
            self._downloader.report_error(u'No free songs found')
            return
        download_link = m_download.group(1)
        # Renamed local from `id` to avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some JavaScript code.
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url directly it says the link has expired.
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url;
        # this url is built by Bandcamp in the script download_bunde_*.js
        # (name as it appears on the site -- TODO confirm spelling).
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key; fall back to "retry_url" instead.
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist'],
                      }

        return [track_info]
4220 def gen_extractors():
4221 """ Return a list of an instance of every supported extractor.
4222 The order does matter; the first extractor matched is the one handling the URL.
4225 YoutubePlaylistIE(),
4250 StanfordOpenClassroomIE(),
4260 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up the extractor class named ``<ie_name>IE``.

    The class is resolved from this module's global namespace; a
    KeyError propagates when no such extractor is defined.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]