2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one line is elided from this excerpt between the
        # docstring and the call below (presumably state initialization —
        # confirm against the full file). Only the downloader hookup is
        # visible.
        self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): one line is elided before the delegation below
        # (presumably the call that triggers lazy initialization — confirm).
        return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # NOTE(review): the (presumably no-op) body is elided in this excerpt.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): the line below actually belongs to the IE_NAME
        # property, whose `def` line is elided from this excerpt; it derives
        # the extractor name by dropping the trailing "IE" from the class
        # name.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note semantics: None -> default progress message; False -> silent;
        # any other value -> printed as "<video_id>: <note>".
        # NOTE(review): the `if note is None:` guard, the `try:` opener and
        # the `if errnote is None:` guard are elided from this excerpt.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Fall back to a generic message when the caller supplied none.
            errnote = u'Unable to download webpage'
            # Re-raise with the original traceback attached.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        # NOTE(review): this excerpt elides the charset fallback (`else:`
        # branch with a default encoding), the `try:` around get_full_url()
        # and the plain-string fallback assignment to `url`.
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            # Honour the charset declared in the Content-Type header.
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
                # Request objects know their full URL; plain strings don't.
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps binary-ish pages printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids raising on pages with stray undecodable bytes.
        return webpage_bytes.decode(encoding, 'replace')
147 def to_screen(self, msg):
148 """Print msg to screen, prefixing it with '[ie_name]'"""
149 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
151 def report_extraction(self, id_or_name):
152 """Report information extraction."""
153 self.to_screen(u'%s: Extracting information' % id_or_name)
155 def report_download_webpage(self, video_id):
156 """Report webpage download."""
157 self.to_screen(u'%s: Downloading webpage' % video_id)
159 def report_age_confirmation(self):
160 """Report attempt to confirm age."""
161 self.to_screen(u'Confirming age')
163 #Methods for following #608
164 #They set the correct value of the '_type' key
165 def video_result(self, video_info):
166 """Returns a video"""
167 video_info['_type'] = 'video'
169 def url_result(self, url, ie=None):
170 """Returns a url that points to a page that should be processed"""
171 #TODO: ie should be the class used for getting the info
172 video_info = {'_type': 'url',
176 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
177 """Returns a playlist"""
178 video_info = {'_type': 'playlist',
181 video_info['id'] = playlist_id
183 video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the opening of the _VALID_URL assignment (a triple-quoted
    # verbose-mode regex — it is matched with re.VERBOSE below) and the line
    # matching the `v=` parameter itself are elided from this excerpt; the
    # visible alternation lines are kept verbatim.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    # URL used to force the interface language to English.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original target of an age-gate/redirect URL.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension. NOTE(review): all but one entry (and the
    # closing brace) are elided from this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display string. NOTE(review): the dict's entries and
    # closing brace are elided from this excerpt.
    _video_dimensions = {
248 def suitable(cls, url):
249 """Receives a URL and returns True if suitable for this IE."""
250 if YoutubePlaylistIE.suitable(url): return False
251 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
253 def report_lang(self):
254 """Report attempt to set language."""
255 self.to_screen(u'Setting language')
257 def report_login(self):
258 """Report attempt to log in."""
259 self.to_screen(u'Logging in')
261 def report_video_webpage_download(self, video_id):
262 """Report attempt to download video webpage."""
263 self.to_screen(u'%s: Downloading video webpage' % video_id)
265 def report_video_info_webpage_download(self, video_id):
266 """Report attempt to download video info webpage."""
267 self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report that available subtitles are being checked."""
        # (the previous docstring was a copy-paste of the info-webpage one)
        self.to_screen(u'%s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (language and format)."""
        # (the previous docstring was a copy-paste of the info-webpage one)
        # NOTE: `format` shadows the builtin; kept for interface compatibility.
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
277 def report_video_subtitles_available(self, video_id, sub_lang_list):
278 """Report available subtitles."""
279 sub_lang = ",".join(list(sub_lang_list.keys()))
280 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
282 def report_information_extraction(self, video_id):
283 """Report attempt to extract video information."""
284 self.to_screen(u'%s: Extracting video information' % video_id)
286 def report_unavailable_format(self, video_id, format):
287 """Report extracted video URL."""
288 self.to_screen(u'%s: Format %s not available' % (video_id, format))
290 def report_rtmp_download(self):
291 """Indicate the download will use the RTMP protocol."""
292 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Return a {lang_code: name} dict of available subtitle tracks.

        On failure returns an (error_message, None) tuple instead — callers
        detect the error case with isinstance(..., tuple).
        """
        # NOTE(review): this excerpt elides the `try:` opener around the
        # network call and the final `return sub_lang_list`.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Pair up each track's display name with its language code.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
307 def _list_available_subtitles(self, video_id):
308 sub_lang_list = self._get_available_subtitles(video_id)
309 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Fetch one subtitle track from the timedtext API.

        Returns a tuple:
        (error_message, sub_lang, sub)
        where error_message is None on success.
        """
        # NOTE(review): the body of the params dict (presumably lang/name/
        # v/fmt fields), the `try:` opener and the `if not sub:` guard are
        # elided from this excerpt.
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # An empty response means no captions for this language/format.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _extract_subtitle(self, video_id):
        """Download the single requested (or best-guess) subtitle track.

        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        # NOTE(review): this excerpt elides the body of the `elif 'en' ...`
        # branch and the final `return [subtitle]`.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # (the 'en' assignment is elided; the line below is the
            # first-available fallback from the elided `else:` branch)
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
353 def _extract_all_subtitles(self, video_id):
354 sub_lang_list = self._get_available_subtitles(video_id)
355 sub_format = self._downloader.params.get('subtitlesformat')
356 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
357 return [(sub_lang_list[0], None, None)]
359 for sub_lang in sub_lang_list:
360 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
361 subtitles.append(subtitle)
364 def _print_formats(self, formats):
365 print('Available formats:')
367 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Force the English interface, optionally log in, and confirm age.

        NOTE(review): this excerpt elides many lines (early returns, `try:`
        openers, `if`/`else` guards and several dict entries); the visible
        statements are kept verbatim under their apparent nesting.
        """
        if self._downloader is None:
            # (elided: early return)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (elided: `try:` and the branch consuming the netrc entry)
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: warn and continue unauthenticated.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        # (elided: `try:` and the report_lang() call)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # (elided: `if username is None:` guard and early return)

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # (elided: `try:`)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the anti-forgery tokens out of the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        # (elided: None check)
        galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # (elided: dsh extraction and the opening of login_form_strs; only a
        # subset of the form fields is visible below)
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # (elided: continuation of this comment)
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # (elided: `try:`)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age
        # (elided: opening of the age_form dict)
                'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # (elided: `try:`)
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the video id (_VALID_URL group 2) from *url*."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard before the error call
        # and the trailing `return video_id` are elided from this excerpt.
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract metadata and download URLs for one YouTube video.

        NOTE(review): this excerpt elides many guard lines (`if mobj is not
        None:` / `else:` / early `return`s / `break`s); statements that sat
        under such elided guards are kept verbatim at their apparent depth.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page (age-gate pre-verified via has_verified=1).
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JS-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several el= variants of get_video_info until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader id (nickname), scraped from the watch page
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, normalized to YYYYMMDD by unified_strdate
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

        # description: page element first, <meta> tag as fallback
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            video_description = unescapeHTML(fd_mobj.group(1))
            video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            (sub_error, sub_lang, sub) = video_subtitles[0]
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Parse the comma-separated, urlencoded per-format entries.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at format_limit (lists are ordered best-first).
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the results.append({ opener, the 'id' entry, the
            # closing }) and the final return are elided from this excerpt.
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    NOTE(review): this excerpt elides many guard/`try:`/`return` lines;
    visible statements are kept verbatim at their apparent depth.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST past the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # (elided: `try:`)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # (elided: opening of the disclaimer_form dict)
                'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # (elided: `try:`)
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract a single Metacafe video (or hand yt- ids to YoutubeIE)."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard)
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # Direct mediaURL path (guard elided):
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: flashvars-based extraction (else-branch opener elided):
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # (elided: `if mobj is None:` guard)
                self._downloader.report_error(u'unable to extract media URL')

            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')

            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            # (elided: `if mobj is None:` guard)
                self._downloader.report_error(u'unable to extract media URL')

            # Un-escape the JS-escaped URL and append the access key.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # (elided: `if mobj is None:` guard)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # (elided: `if mobj is None:` guard)
            self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (elided: the `return [{` opener and closing of the result list)
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    NOTE(review): this excerpt elides many guard/`return` lines; visible
    statements are kept verbatim at their apparent depth.
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract a single Dailymotion video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # The id is the part before any '_' title suffix or query string.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # The cookie disables the family filter so gated videos resolve.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # (elided: `if mobj is None:` guard)
        self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities best-first; `max_quality` is set in an elided line.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # (elided: `if mobj is None:` guard)
            self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # (elided: `if mobj is None:` guard)
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # (elided: `if mobj is None:` branch opener)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # DD-MM-YYYY on the page -> YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # (elided: the `return [{` opener, the 'id'/'url' entries and the
        # closing of the result list)
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    NOTE(review): this excerpt elides several guard/`try:`/`return` lines;
    visible statements are kept verbatim at their apparent depth.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract a single Photobucket flv video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # (elided: `try:`)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # (elided: `if mobj is None:` guard)
            self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # (elided: assignment of video_url from mediaURL)

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # (elided: `if mobj is None:` guard)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # (elided: the `return [{` opener and closing of the result list)
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
918 class YahooIE(InfoExtractor):
919 """Information extractor for video.yahoo.com."""
922 # _VALID_URL matches all Yahoo! Video URLs
923 # _VPAGE_URL matches only the extractable '/watch/' URLs
924 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
925 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
926 IE_NAME = u'video.yahoo'
928 def _real_extract(self, url, new_video=True):
929 # Extract ID from URL
930 mobj = re.match(self._VALID_URL, url)
932 self._downloader.report_error(u'Invalid URL: %s' % url)
935 video_id = mobj.group(2)
936 video_extension = 'flv'
938 # Rewrite valid but non-extractable URLs as
939 # extractable English language /watch/ URLs
940 if re.match(self._VPAGE_URL, url) is None:
941 request = compat_urllib_request.Request(url)
943 webpage = compat_urllib_request.urlopen(request).read()
944 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
945 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
948 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
950 self._downloader.report_error(u'Unable to extract id field')
952 yahoo_id = mobj.group(1)
954 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
956 self._downloader.report_error(u'Unable to extract vid field')
958 yahoo_vid = mobj.group(1)
960 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
961 return self._real_extract(url, new_video=False)
963 # Retrieve video webpage to extract further information
964 request = compat_urllib_request.Request(url)
966 self.report_download_webpage(video_id)
967 webpage = compat_urllib_request.urlopen(request).read()
968 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
969 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
972 # Extract uploader and title from webpage
973 self.report_extraction(video_id)
974 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
976 self._downloader.report_error(u'unable to extract video title')
978 video_title = mobj.group(1).decode('utf-8')
980 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
982 self._downloader.report_error(u'unable to extract video uploader')
984 video_uploader = mobj.group(1).decode('utf-8')
986 # Extract video thumbnail
987 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
989 self._downloader.report_error(u'unable to extract video thumbnail')
991 video_thumbnail = mobj.group(1).decode('utf-8')
993 # Extract video description
994 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
996 self._downloader.report_error(u'unable to extract video description')
998 video_description = mobj.group(1).decode('utf-8')
999 if not video_description:
1000 video_description = 'No description available.'
1002 # Extract video height and width
1003 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1005 self._downloader.report_error(u'unable to extract video height')
1007 yv_video_height = mobj.group(1)
1009 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1011 self._downloader.report_error(u'unable to extract video width')
1013 yv_video_width = mobj.group(1)
1015 # Retrieve video playlist to extract media URL
1016 # I'm not completely sure what all these options are, but we
1017 # seem to need most of them, otherwise the server sends a 401.
1018 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1019 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1020 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1021 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1022 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1024 self.report_download_webpage(video_id)
1025 webpage = compat_urllib_request.urlopen(request).read()
1026 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1027 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1030 # Extract media URL from playlist XML
1031 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1033 self._downloader.report_error(u'Unable to extract media URL')
1035 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1036 video_url = unescapeHTML(video_url)
1039 'id': video_id.decode('utf-8'),
1041 'uploader': video_uploader,
1042 'upload_date': None,
1043 'title': video_title,
1044 'ext': video_extension.decode('utf-8'),
1045 'thumbnail': video_thumbnail.decode('utf-8'),
1046 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # NOTE(review): this extract has lost some source lines (guards such as
    # `if mobj is None:`, `try:`/`except`, `break`/`return` and the literal
    # `return [{` / `}]` around the final info dict). The bare
    # report_error calls and dict entries below were originally inside those
    # elided constructs — restore them before running.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        """Build the info dictionary for a single vimeo.com video URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize the URL: force https and map direct-link (HLS redirect)
        # URLs back to the canonical watch page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player bootstrap JS.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
        self._downloader.report_error(u'unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (converted to the YYYYMMDD form the
        # upload_date field expects)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty quality bucket in order of preference.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        self._downloader.report_error(u'no known codec found')

        # Final media URL goes through Vimeo's play_redirect endpoint,
        # signed with the sig/timestamp extracted above.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Info-dictionary entries returned to the FileDownloader
        # (surrounding `return [{ ... }]` elided in this extract).
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # NOTE(review): several lines (e.g. `try:`, `if mobj is None:`,
    # `return`s, and the `return [{`/`}]` around the final info dict) were
    # elided from this extract; the orphaned except clauses and dict
    # entries below belong to those constructs.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<number>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page content."""
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, match *regex* against it and return a dict built
        from matchTuples, each a (group_index, key, error_message) triple."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
            info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Extract the stream data for a live (index-N.html) arte.tv page."""
        # Language code is encoded in the URL path.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract the info dict for an arte.tv "+7" catch-up video page."""
        video_lang = url.split('/')[-3]
        # Step 1: find the video reference-file URL in the player params.
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the <video> entry for the requested language.
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: pull id/title/date and the HD media URL out of the
        # per-video XML document.
        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)

        # Info-dictionary entries (surrounding `return { ... }` elided).
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 extractor depending on the URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): some lines (method bodies such as `return "HEAD"`,
    # `try:` statements, `if mobj is None:` guards, `return`s and the
    # `return [{`/`}]` around the final info dict) were elided from this
    # extract; restore them before running.

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # In test mode, skip the warning so test output stays clean.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET.
            def get_method(self):
                # (elided here: returns the string "HEAD")

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener with just the handlers we need; handler
        # order matters (fallback before the redirect handler).
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        self.report_following_redirect(new_url)

    def _real_extract(self, url):
        """Last-resort extraction: follow redirects, then scrape the page
        for a direct media URL with progressively looser regexes."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        # Video Title - Site Name
        # Site Name | Video Title
        # Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.report_error(u'unable to extract title')
        video_uploader = mobj.group(1)

        # Info-dictionary entries (surrounding `return [{ ... }]` elided).
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # NOTE(review): guard lines (`if mobj is None:`, `if prefix == '':`,
    # `try:`, `return`s) were elided from this extract; the flow below is
    # incomplete without them.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API v2 search endpoint; page size fixed at 50.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch<N>: prefix and fetch that many results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> cap at the API maximum;
        # otherwise the prefix is parsed as an integer count.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            self._get_n_results(query, self._max_youtube_results)
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            return self._get_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Page through the API 50 results at a time until n (or the API's
        # reported total) is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Tighten the limit once the API reports its true total.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # NOTE(review): guard/return/try lines were elided from this extract;
    # see the sibling YoutubeSearchIE for the same prefix-parsing pattern.
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Regex that finds videoplay links in a results page.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next" pager link means more result pages exist.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch<N>: prefix and download that many results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        # Google paginates 10 results per page via the start= parameter.
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No "next page" link: hand whatever was found to the downloader.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # NOTE(review): guard/return/try lines were elided from this extract;
    # the structure mirrors GoogleSearchIE above.

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Matches watch-page links of the form watch/<id>/<vid>.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch<N>: prefix and download that many results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # already_seen de-duplicates ids across result pages.
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No "Next" link: hand whatever was found to the downloader.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): parts of the verbose regex, the `@classmethod`
    # decorator on suitable(), `if mobj is None:` guards, `try:` lines and
    # loop/`break` scaffolding were elided from this extract.

    # Verbose regex: matched with re.VERBOSE (see suitable()). Group 1
    # captures the playlist id from a page URL, group 2 a bare
    # PL/EC/UU-prefixed id.
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    # GData API v2 playlist feed, paginated via start-index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regex, so the flag must be passed here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Resolve the playlist id, page through the GData feed and return
        a playlist result of per-video url_results."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'invalid url: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)

        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

        response = json.loads(page)
        except ValueError as err:
            self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            self._downloader.report_error(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS

        # Collect (position, video URL) pairs; entries without 'content'
        # (e.g. deleted videos) are skipped.
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        # A short page means the feed is exhausted.
        if len(response['feed']['entry']) < self._MAX_RESULTS:

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    # NOTE(review): initialization lines (`ids_in_page = []`, `video_ids =
    # []`, `pagenum = 1`), `if mobj is None:` guards, returns and loop
    # scaffolding were elided from this extract.

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is plain HTML...
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # ...subsequent pages come from the JSON channel_ajax endpoint.
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids (deduplicated, in order) linked from *page*."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        """Collect every video id of a channel and return a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            # channel_ajax responds with JSON wrapping the HTML fragment.
            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            # Stop when the load-more widget disappears from the response.
            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # NOTE(review): guard lines, accumulator initialization and the
    # paging `while`/`break` scaffolding were elided from this extract.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps uploads queries at 50 results per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Page through a user's uploads feed and return a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # NOTE(review): guard lines, accumulator initialization, `_PAGE_SIZE`
    # and the paging `while`/`break` scaffolding were elided from this
    # extract.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Page through a blip.tv user's episode list and return a playlist
        result of per-video url_results."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # Ajax endpoint for the user's full episode list; the numeric
        # users_id is filled in below after scraping the profile page.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # NOTE(review): `try:`, `if mobj is None:` guards, `return`s and the
    # `return [{`/`}]` around the final info dict were elided from this
    # extract.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com page to the real file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button press).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice
                # before surfacing it to the user.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            self._downloader.report_error(u'unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.report_error(u'unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Info-dictionary entries (surrounding `return [{ ... }]` elided).
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # NOTE(review): `try:` statements, `if ... is None:` guards, `return`s,
    # the login_form construction and the `return [{`/`}]` around the final
    # info dict were elided from this extract.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in the user's ~/.netrc for credentials.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in, using --username/--password or .netrc data.
        Login failures only produce warnings; extraction proceeds anyway."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # No credentials available: nothing to do.
        if useremail is None:
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # The login form reappearing in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail from a
        Facebook video page by parsing the embedded SWF parameter JSON."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player data sits between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD.
        video_url = video_data.get('hd_src')
        video_url = video_data['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # Info-dictionary entries (surrounding `return [{ ... }]` elided).
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
# Extractor for blip.tv. Fetches the site's own JSON metadata endpoint
# (skin=json) with an iTunes User-Agent; handles three cases: /play/
# redirect pages, direct video/* downloads, and regular JSON metadata.
# NOTE(review): line numbers jump (e.g. 2079->2086) — source lines are
# elided in this dump; the visible bodies are incomplete.
2054 class BlipTVIE(InfoExtractor):
2055 """Information extractor for blip.tv"""
2057 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2058 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2059 IE_NAME = u'blip.tv'
2061 def report_direct_download(self, title):
2062 """Report information extraction."""
2063 self.to_screen(u'%s: Direct download detected' % title)
2065 def _real_extract(self, url):
2066 mobj = re.match(self._VALID_URL, url)
2068 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to an embed whose fragment carries the real file
# id; rewrite to the canonical /a/a-<id> URL and recurse once.
2071 urlp = compat_urllib_parse_urlparse(url)
2072 if urlp.path.startswith('/play/'):
2073 request = compat_urllib_request.Request(url)
2074 response = compat_urllib_request.urlopen(request)
2075 redirecturl = response.geturl()
2076 rurlp = compat_urllib_parse_urlparse(redirecturl)
2077 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2078 url = 'http://blip.tv/a/a-' + file_id
2079 return self._real_extract(url)
2086 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2087 request = compat_urllib_request.Request(json_url)
# blip.tv serves richer metadata to the iTunes client string.
2088 request.add_header('User-Agent', 'iTunes/10.6.1')
2089 self.report_extraction(mobj.group(1))
2092 urlh = compat_urllib_request.urlopen(request)
2093 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2094 basename = url.split('/')[-1]
2095 title,ext = os.path.splitext(basename)
# NOTE(review): str.decode() is Python-2-only; this block predates py3.
2096 title = title.decode('UTF-8')
2097 ext = ext.replace('.', '')
2098 self.report_direct_download(title)
2103 'upload_date': None,
2108 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2109 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2110 if info is None: # Regular URL
2112 json_code_bytes = urlh.read()
2113 json_code = json_code_bytes.decode('utf-8')
2114 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2115 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2119 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' envelope.
2120 if 'Post' in json_data:
2121 data = json_data['Post']
# datestamp format example: '05-21-12 11:30AM' -> YYYYMMDD.
2125 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2126 video_url = data['media']['url']
2127 umobj = re.match(self._URL_EXT, video_url)
2129 raise ValueError('Can not determine filename extension')
2130 ext = umobj.group(1)
2133 'id': data['item_id'],
2135 'uploader': data['display_name'],
2136 'upload_date': upload_date,
2137 'title': data['title'],
2139 'format': data['media']['mimeType'],
2140 'thumbnail': data['thumbnailUrl'],
2141 'description': data['description'],
2142 'player_url': data['embedUrl'],
# Keep the same UA for the media download itself.
2143 'user_agent': 'iTunes/10.6.1',
2145 except (ValueError,KeyError) as err:
2146 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
# Extractor for myvideo.de watch pages: derives the FLV URL from the
# thumbnail <link rel='image_src'> host path plus the numeric video id.
# NOTE(review): line numbers jump — source lines are elided in this dump.
2152 class MyVideoIE(InfoExtractor):
2153 """Information Extractor for myvideo.de."""
2155 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2156 IE_NAME = u'myvideo'
2158 def _real_extract(self,url):
2159 mobj = re.match(self._VALID_URL, url)
# BUG(review): 'self._download' below looks like a typo for
# 'self._downloader' (every other call site uses _downloader) — this
# line would raise AttributeError if reached. Confirm and fix.
2161 self._download.report_error(u'invalid URL: %s' % url)
2164 video_id = mobj.group(1)
2167 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2168 webpage = self._download_webpage(webpage_url, video_id)
2170 self.report_extraction(video_id)
# The thumbnail URL prefix doubles as the media host/path; the video is
# that prefix + '/<id>.flv'.
2171 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2174 self._downloader.report_error(u'unable to extract media URL')
2176 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2178 mobj = re.search('<title>([^<]+)</title>', webpage)
2180 self._downloader.report_error(u'unable to extract title')
2183 video_title = mobj.group(1)
2189 'upload_date': None,
2190 'title': video_title,
# Extractor for The Daily Show / The Colbert Report. Accepts shortname
# aliases (:tds, :colbert, ...), full-episode URLs and clip URLs; walks
# an MRSS index feed, then a per-media config feed, picks a bitrate and
# rewrites the rtmp(e) URL onto an HTTP mirror.
# NOTE(review): line numbers jump throughout — source lines are elided
# in this dump; visible bodies are incomplete.
2194 class ComedyCentralIE(InfoExtractor):
2195 """Information extractor for The Daily Show and Colbert Report """
2197 # urls can be abbreviations like :thedailyshow or :colbert
2198 # urls for episodes like:
2199 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2200 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2201 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2202 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2203 |(https?://)?(www\.)?
2204 (?P<showname>thedailyshow|colbertnation)\.com/
2205 (full-episodes/(?P<episode>.*)|
2207 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2208 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; last entry = highest quality.
2211 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2213 _video_extensions = {
2221 _video_dimensions = {
# Override: _VALID_URL is a verbose regex, so matching needs re.VERBOSE.
2231 def suitable(cls, url):
2232 """Receives a URL and returns True if suitable for this IE."""
2233 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2235 def _print_formats(self, formats):
2236 print('Available formats:')
2238 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2241 def _real_extract(self, url):
2242 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2244 self._downloader.report_error(u'invalid URL: %s' % url)
# Expand shortname aliases to the show's full-episodes landing page,
# then re-match so the named groups are populated.
2247 if mobj.group('shortname'):
2248 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2249 url = u'http://www.thedailyshow.com/full-episodes/'
2251 url = u'http://www.colbertnation.com/full-episodes/'
2252 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2253 assert mobj is not None
2255 if mobj.group('clip'):
2256 if mobj.group('showname') == 'thedailyshow':
2257 epTitle = mobj.group('tdstitle')
2259 epTitle = mobj.group('cntitle')
2262 dlNewest = not mobj.group('episode')
2264 epTitle = mobj.group('showname')
2266 epTitle = mobj.group('episode')
2268 self.report_extraction(epTitle)
2269 webpage = self._download_webpage(url, epTitle)
# NOTE(review): 'htmlHandle' is not defined in any visible line —
# presumably bound in an elided line; verify before refactoring.
2271 url = htmlHandle.geturl()
2272 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2274 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2276 if mobj.group('episode') == '':
2277 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2279 epTitle = mobj.group('episode')
2281 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2283 if len(mMovieParams) == 0:
2284 # The Colbert Report embeds the information in a without
2285 # a URL prefix; so extract the alternate reference
2286 # and then add the URL prefix manually.
2288 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2289 if len(altMovieParams) == 0:
2290 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2293 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2295 uri = mMovieParams[0][1]
2296 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2297 indexXml = self._download_webpage(indexUrl, epTitle,
2298 u'Downloading show index',
2299 u'unable to download episode index')
# One <item> per episode part; guid is 'something:...:showId.com:mediaId'.
2303 idoc = xml.etree.ElementTree.fromstring(indexXml)
2304 itemEls = idoc.findall('.//item')
2305 for partNum,itemEl in enumerate(itemEls):
2306 mediaId = itemEl.findall('./guid')[0].text
2307 shortMediaId = mediaId.split(':')[-1]
2308 showId = mediaId.split(':')[-2].replace('.com', '')
2309 officialTitle = itemEl.findall('./title')[0].text
2310 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2312 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2313 compat_urllib_parse.urlencode({'uri': mediaId}))
2314 configXml = self._download_webpage(configUrl, epTitle,
2315 u'Downloading configuration for %s' % shortMediaId)
2317 cdoc = xml.etree.ElementTree.fromstring(configXml)
2319 for rendition in cdoc.findall('.//rendition'):
2320 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2324 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2327 if self._downloader.params.get('listformats', None):
2328 self._print_formats([i[0] for i in turls])
2331 # For now, just pick the highest bitrate
2332 format,rtmp_video_url = turls[-1]
2334 # Get the format arg from the arg stream
2335 req_format = self._downloader.params.get('format', None)
2337 # Select format if we can find one
2340 format, rtmp_video_url = f, v
# rtmp(e) URLs can't be downloaded directly; rewrite the comedystor
# path onto MTV's HTTP mp4 mirror.
2343 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2345 raise ExtractorError(u'Cannot transform RTMP url')
2346 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2347 video_url = base + m.group('finalid')
2349 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2354 'upload_date': officialDate,
2359 'description': officialTitle,
2361 results.append(info)
# Extractor for escapistmagazine.com video pages: reads OpenGraph meta
# tags, follows the player's 'config=' URL to a JS-style config blob,
# and takes the stream URL from its playlist.
# NOTE(review): line numbers jump — source lines are elided in this dump.
2366 class EscapistIE(InfoExtractor):
2367 """Information extractor for The Escapist """
2369 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2370 IE_NAME = u'escapist'
2372 def _real_extract(self, url):
2373 mobj = re.match(self._VALID_URL, url)
2375 self._downloader.report_error(u'invalid URL: %s' % url)
2377 showName = mobj.group('showname')
2378 videoId = mobj.group('episode')
2380 self.report_extraction(showName)
2381 webPage = self._download_webpage(url, showName)
# NOTE(review): these re.search results are used without None checks;
# a page without the meta tag would raise AttributeError (unless a
# guard exists on an elided line).
2383 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2384 description = unescapeHTML(descMatch.group(1))
2385 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2386 imgUrl = unescapeHTML(imgMatch.group(1))
2387 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2388 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2389 configUrlMatch = re.search('config=(.*)$', playerUrl)
2390 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2392 configJSON = self._download_webpage(configUrl, showName,
2393 u'Downloading configuration',
2394 u'unable to download configuration')
2396 # Technically, it's JavaScript, not JSON
# Crude quote normalization so json.loads can parse the JS object.
2397 configJSON = configJSON.replace("'", '"')
2400 config = json.loads(configJSON)
2401 except (ValueError,) as err:
2402 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
# Playlist entry [1] holds the actual video stream.
2405 playlist = config['playlist']
2406 videoUrl = playlist[1]['url']
2411 'uploader': showName,
2412 'upload_date': None,
2415 'thumbnail': imgUrl,
2416 'description': description,
2417 'player_url': playerUrl,
# Extractor for collegehumor.com: downloads the moogaloop metadata XML,
# then the Adobe HDS (f4m) manifest, and assembles a segment URL from
# the manifest's media/id nodes.
# NOTE(review): line numbers jump — source lines are elided in this dump.
2422 class CollegeHumorIE(InfoExtractor):
2423 """Information extractor for collegehumor.com"""
2426 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2427 IE_NAME = u'collegehumor'
2429 def report_manifest(self, video_id):
2430 """Report information extraction."""
2431 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2433 def _real_extract(self, url):
2434 mobj = re.match(self._VALID_URL, url)
2436 self._downloader.report_error(u'invalid URL: %s' % url)
2438 video_id = mobj.group('videoid')
2443 'upload_date': None,
2446 self.report_extraction(video_id)
2447 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2449 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2450 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2451 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2454 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2456 videoNode = mdoc.findall('./video')[0]
2457 info['description'] = videoNode.findall('./description')[0].text
2458 info['title'] = videoNode.findall('./caption')[0].text
2459 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2460 manifest_url = videoNode.findall('./file')[0].text
2462 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore parameter is required by the HDS server to serve the manifest.
2465 manifest_url += '?hdcore=2.10.3'
2466 self.report_manifest(video_id)
2468 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2469 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2470 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# f4m manifest is namespaced XML (http://ns.adobe.com/f4m/1.0).
2473 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2475 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2476 node_id = media_node.attrib['url']
2477 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2478 except IndexError as err:
2479 self._downloader.report_error(u'Invalid manifest file')
# Segment URL scheme: scheme://host/z<id-minus-2-chars>/<node_id>Seg1-Frag1
2482 url_pr = compat_urllib_parse_urlparse(manifest_url)
2483 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: everything (flv URL, title, thumbnail) is
# scraped from the watch page with regexes.
# NOTE(review): line numbers jump — the `if mobj is None:` guards before
# each report_error call are elided in this dump.
2490 class XVideosIE(InfoExtractor):
2491 """Information extractor for xvideos.com"""
2493 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2494 IE_NAME = u'xvideos'
2496 def _real_extract(self, url):
2497 mobj = re.match(self._VALID_URL, url)
2499 self._downloader.report_error(u'invalid URL: %s' % url)
2501 video_id = mobj.group(1)
2503 webpage = self._download_webpage(url, video_id)
2505 self.report_extraction(video_id)
# Video URL is urlencoded inside a 'flv_url=' query fragment.
2509 mobj = re.search(r'flv_url=(.+?)&', webpage)
2511 self._downloader.report_error(u'unable to extract video url')
2513 video_url = compat_urllib_parse.unquote(mobj.group(1))
2517 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2519 self._downloader.report_error(u'unable to extract video title')
2521 video_title = mobj.group(1)
2524 # Extract video thumbnail
2525 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2527 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not the captured name.
2529 video_thumbnail = mobj.group(0)
2535 'upload_date': None,
2536 'title': video_title,
2538 'thumbnail': video_thumbnail,
2539 'description': None,
# Extractor for a single soundcloud.com track: resolves the page URL to
# a track id via the public resolve.json API, then asks the streams
# endpoint for the 128kbps MP3 URL.
# NOTE(review): line numbers jump — source lines are elided in this dump.
2545 class SoundcloudIE(InfoExtractor):
2546 """Information extractor for soundcloud.com
2547 To access the media, the uid of the song and a stream token
2548 must be extracted from the page source and the script must make
2549 a request to media.soundcloud.com/crossdomain.xml. Then
2550 the media can be grabbed by requesting from an url composed
2551 of the stream token and uid
2554 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2555 IE_NAME = u'soundcloud'
2557 def report_resolve(self, video_id):
2558 """Report information extraction."""
2559 self.to_screen(u'%s: Resolving id' % video_id)
2561 def _real_extract(self, url):
2562 mobj = re.match(self._VALID_URL, url)
2564 self._downloader.report_error(u'invalid URL: %s' % url)
2567 # extract uploader (which is in the url)
2568 uploader = mobj.group(1)
2569 # extract simple title (uploader + slug of song title)
2570 slug_title = mobj.group(2)
2571 simple_title = uploader + u'-' + slug_title
2572 full_title = '%s/%s' % (uploader, slug_title)
2574 self.report_resolve(full_title)
# resolve.json maps the human URL to the API track object (id, metadata).
# client_id is a hard-coded public API key.
2576 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2577 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2578 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2580 info = json.loads(info_json)
2581 video_id = info['id']
2582 self.report_extraction(full_title)
2584 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2585 stream_json = self._download_webpage(streams_url, full_title,
2586 u'Downloading stream definitions',
2587 u'unable to download stream definitions')
2589 streams = json.loads(stream_json)
2590 mediaURL = streams['http_mp3_128_url']
2591 upload_date = unified_strdate(info['created_at'])
2596 'uploader': info['user']['username'],
2597 'upload_date': upload_date,
2598 'title': info['title'],
2600 'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set URL,
# then iterates its 'tracks', fetching each track's MP3 stream URL.
# Mirrors SoundcloudIE per-track logic.
# NOTE(review): line numbers jump — source lines are elided in this dump.
2603 class SoundcloudSetIE(InfoExtractor):
2604 """Information extractor for soundcloud.com sets
2605 To access the media, the uid of the song and a stream token
2606 must be extracted from the page source and the script must make
2607 a request to media.soundcloud.com/crossdomain.xml. Then
2608 the media can be grabbed by requesting from an url composed
2609 of the stream token and uid
2612 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2613 IE_NAME = u'soundcloud:set'
2615 def report_resolve(self, video_id):
2616 """Report information extraction."""
2617 self.to_screen(u'%s: Resolving id' % video_id)
2619 def _real_extract(self, url):
2620 mobj = re.match(self._VALID_URL, url)
2622 self._downloader.report_error(u'invalid URL: %s' % url)
2625 # extract uploader (which is in the url)
2626 uploader = mobj.group(1)
2627 # extract simple title (uploader + slug of song title)
2628 slug_title = mobj.group(2)
2629 simple_title = uploader + u'-' + slug_title
2630 full_title = '%s/sets/%s' % (uploader, slug_title)
2632 self.report_resolve(full_title)
2634 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2635 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2636 info_json = self._download_webpage(resolv_url, full_title)
2639 info = json.loads(info_json)
# The resolve API reports failures inside an 'errors' list rather than
# via HTTP status.
2640 if 'errors' in info:
2641 for err in info['errors']:
2642 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2645 self.report_extraction(full_title)
2646 for track in info['tracks']:
2647 video_id = track['id']
2649 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2650 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2652 self.report_extraction(video_id)
2653 streams = json.loads(stream_json)
2654 mediaURL = streams['http_mp3_128_url']
2659 'uploader': track['user']['username'],
2660 'upload_date': unified_strdate(track['created_at']),
2661 'title': track['title'],
2663 'description': track['description'],
# Extractor for infoq.com presentations: the media path is base64-encoded
# in a JS variable ('jsclassref'); decoded it yields an rtmpe path.
# NOTE(review): line numbers jump — source lines are elided in this dump.
2668 class InfoQIE(InfoExtractor):
2669 """Information extractor for infoq.com"""
2670 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2672 def _real_extract(self, url):
2673 mobj = re.match(self._VALID_URL, url)
2675 self._downloader.report_error(u'invalid URL: %s' % url)
2678 webpage = self._download_webpage(url, video_id=url)
2679 self.report_extraction(url)
2682 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2684 self._downloader.report_error(u'unable to extract video url')
# base64 -> urlencoded path -> real media id.
2686 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2687 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2690 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2692 self._downloader.report_error(u'unable to extract video title')
2694 video_title = mobj.group(1)
2696 # Extract description
2697 video_description = u'No description available.'
2698 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2699 if mobj is not None:
2700 video_description = mobj.group(1)
# Video id/extension come from the final path component of the media URL.
2702 video_filename = video_url.split('/')[-1]
2703 video_id, extension = video_filename.split('.')
2709 'upload_date': None,
2710 'title': video_title,
2711 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2713 'description': video_description,
# Extractor for mixcloud.com (marked broken via _WORKING=False). Fetches
# the site's v1 cloudcast JSON, then probes the listed per-format URLs
# until one answers.
# NOTE(review): line numbers jump — source lines are elided in this dump.
2718 class MixcloudIE(InfoExtractor):
2719 """Information extractor for www.mixcloud.com"""
2721 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2722 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2723 IE_NAME = u'mixcloud'
2725 def report_download_json(self, file_id):
2726 """Report JSON download."""
2727 self.to_screen(u'Downloading json')
# get_urls: pick the URL list for `fmt`; when the format maps bitrates
# to lists, choose the requested (or highest) bitrate; when it maps
# straight to a list, TypeError falls back to using it directly.
2729 def get_urls(self, jsonData, fmt, bitrate='best'):
2730 """Get urls from 'audio_formats' section in json"""
2733 bitrate_list = jsonData[fmt]
2734 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2735 bitrate = max(bitrate_list) # select highest
2737 url_list = jsonData[fmt][bitrate]
2738 except TypeError: # we have no bitrate info.
2739 url_list = jsonData[fmt]
# check_urls: return the first URL that answers an HTTP request.
2742 def check_urls(self, url_list):
2743 """Returns 1st active url from list"""
2744 for url in url_list:
2746 compat_urllib_request.urlopen(url)
2748 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2753 def _print_formats(self, formats):
2754 print('Available formats:')
2755 for fmt in formats.keys():
2756 for b in formats[fmt]:
2758 ext = formats[fmt][b][0]
2759 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2760 except TypeError: # we have no bitrate info
2761 ext = formats[fmt][0]
2762 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2765 def _real_extract(self, url):
2766 mobj = re.match(self._VALID_URL, url)
2768 self._downloader.report_error(u'invalid URL: %s' % url)
2770 # extract uploader & filename from url
# NOTE(review): .decode() on these (already-str in py3) values is
# Python-2-only; this module predates the py3 port.
2771 uploader = mobj.group(1).decode('utf-8')
2772 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2774 # construct API request
2775 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2776 # retrieve .json file with links to files
2777 request = compat_urllib_request.Request(file_url)
2779 self.report_download_json(file_url)
2780 jsonData = compat_urllib_request.urlopen(request).read()
2781 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2782 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2786 json_data = json.loads(jsonData)
2787 player_url = json_data['player_swf_url']
2788 formats = dict(json_data['audio_formats'])
2790 req_format = self._downloader.params.get('format', None)
2793 if self._downloader.params.get('listformats', None):
2794 self._print_formats(formats)
# Default/best: try every format until one yields a live URL.
2797 if req_format is None or req_format == 'best':
2798 for format_param in formats.keys():
2799 url_list = self.get_urls(formats, format_param)
2801 file_url = self.check_urls(url_list)
2802 if file_url is not None:
2805 if req_format not in formats:
2806 self._downloader.report_error(u'format is not available')
2809 url_list = self.get_urls(formats, req_format)
2810 file_url = self.check_urls(url_list)
2811 format_param = req_format
2814 'id': file_id.decode('utf-8'),
2815 'url': file_url.decode('utf-8'),
2816 'uploader': uploader.decode('utf-8'),
2817 'upload_date': None,
2818 'title': json_data['name'],
2819 'ext': file_url.split('.')[-1].decode('utf-8'),
2820 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2821 'thumbnail': json_data['thumbnail_url'],
2822 'description': json_data['description'],
2823 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom. Three URL shapes, handled in
# order: a specific video (course+video -> per-video XML), a course page
# (list of VideoPage references, recursively extracted), and the root
# page (list of CoursePage references, recursively extracted).
# NOTE(review): line numbers jump — source lines are elided in this dump.
2826 class StanfordOpenClassroomIE(InfoExtractor):
2827 """Information extractor for Stanford's Open ClassRoom"""
2829 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2830 IE_NAME = u'stanfordoc'
2832 def _real_extract(self, url):
2833 mobj = re.match(self._VALID_URL, url)
2835 raise ExtractorError(u'Invalid URL: %s' % url)
2837 if mobj.group('course') and mobj.group('video'): # A specific video
2838 course = mobj.group('course')
2839 video = mobj.group('video')
2841 'id': course + '_' + video,
2843 'upload_date': None,
2846 self.report_extraction(info['id'])
2847 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2848 xmlUrl = baseUrl + video + '.xml'
2850 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2851 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2852 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2854 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2856 info['title'] = mdoc.findall('./title')[0].text
2857 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2859 self._downloader.report_error(u'Invalid metadata XML file')
2861 info['ext'] = info['url'].rpartition('.')[2]
2863 elif mobj.group('course'): # A course page
2864 course = mobj.group('course')
2869 'upload_date': None,
2872 coursepage = self._download_webpage(url, info['id'],
2873 note='Downloading course info page',
2874 errnote='Unable to download course info page')
2876 m = re.search('<h1>([^<]+)</h1>', coursepage)
2878 info['title'] = unescapeHTML(m.group(1))
2880 info['title'] = info['id']
2882 m = re.search('<description>([^<]+)</description>', coursepage)
2884 info['description'] = unescapeHTML(m.group(1))
2886 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2889 'type': 'reference',
2890 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse: each reference entry is re-dispatched through self.extract().
2894 for entry in info['list']:
2895 assert entry['type'] == 'reference'
2896 results += self.extract(entry['url'])
2900 'id': 'Stanford OpenClassroom',
2903 'upload_date': None,
2906 self.report_download_webpage(info['id'])
2907 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2909 rootpage = compat_urllib_request.urlopen(rootURL).read()
2910 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2911 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
2914 info['title'] = info['id']
2916 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2919 'type': 'reference',
2920 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2925 for entry in info['list']:
2926 assert entry['type'] == 'reference'
2927 results += self.extract(entry['url'])
# Extractor for mtv.com music videos: reads song/performer/uri/content-id
# from meta tags, then fetches the mediaGen XML and picks the last (i.e.
# highest-quality) rendition.
# NOTE(review): line numbers jump — source lines are elided in this dump.
2930 class MTVIE(InfoExtractor):
2931 """Information extractor for MTV.com"""
2933 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2936 def _real_extract(self, url):
2937 mobj = re.match(self._VALID_URL, url)
2939 self._downloader.report_error(u'invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; normalize before fetching.
2941 if not mobj.group('proto'):
2942 url = 'http://' + url
2943 video_id = mobj.group('videoid')
2945 webpage = self._download_webpage(url, video_id)
2947 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2949 self._downloader.report_error(u'unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a str is Python-2-only.
2951 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2952 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2954 self._downloader.report_error(u'unable to extract performer')
2956 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2957 video_title = performer + ' - ' + song_name
2959 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2961 self._downloader.report_error(u'unable to mtvn_uri')
2963 mtvn_uri = mobj.group(1)
2965 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2967 self._downloader.report_error(u'unable to extract content id')
2969 content_id = mobj.group(1)
2971 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2972 self.report_extraction(video_id)
2973 request = compat_urllib_request.Request(videogen_url)
2975 metadataXml = compat_urllib_request.urlopen(request).read()
2976 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2977 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
2980 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2981 renditions = mdoc.findall('.//rendition')
2983 # For now, always pick the highest quality.
2984 rendition = renditions[-1]
# Format string example: 'mp4-640x360_800' (ext-WxH_bitrate).
2987 _,_,ext = rendition.attrib['type'].partition('/')
2988 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2989 video_url = rendition.find('./src').text
2991 self._downloader.report_error('Invalid rendition field.')
2997 'uploader': performer,
2998 'upload_date': None,
2999 'title': video_title,
# Extractor for v.youku.com. The site obfuscates the real file id: a
# 'seed' drives a deterministic PRNG shuffle of a fixed character set,
# and the published fileId is a list of indices into that shuffle. Each
# video is served in multiple segments, one info dict per segment.
# NOTE(review): line numbers jump — source lines are elided in this dump.
3007 class YoukuIE(InfoExtractor):
3008 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# _gen_sid: session id = millisecond timestamp + two random numbers.
3011 nowTime = int(time.time() * 1000)
3012 random1 = random.randint(1000,1998)
3013 random2 = random.randint(1000,9999)
3015 return "%d%d%d" %(nowTime,random1,random2)
# _get_file_ID_mix_string: deterministic Fisher-Yates-like shuffle of
# the character source, seeded by the server-provided 'seed'.
3017 def _get_file_ID_mix_string(self, seed):
3019 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3021 for i in range(len(source)):
3022 seed = (seed * 211 + 30031 ) % 65536
3023 index = math.floor(seed / 65536 * len(source) )
3024 mixed.append(source[int(index)])
3025 source.remove(source[int(index)])
3026 #return ''.join(mixed)
# _get_file_id: decode '*'-separated index list into the real file id.
3029 def _get_file_id(self, fileId, seed):
3030 mixed = self._get_file_ID_mix_string(seed)
3031 ids = fileId.split('*')
3035 realId.append(mixed[int(ch)])
3036 return ''.join(realId)
3038 def _real_extract(self, url):
3039 mobj = re.match(self._VALID_URL, url)
3041 self._downloader.report_error(u'invalid URL: %s' % url)
3043 video_id = mobj.group('ID')
3045 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3047 jsondata = self._download_webpage(info_url, video_id)
3049 self.report_extraction(video_id)
3051 config = json.loads(jsondata)
3053 video_title = config['data'][0]['title']
3054 seed = config['data'][0]['seed']
# Format selection: honor --format, else best available (hd2 preferred).
3056 format = self._downloader.params.get('format', None)
3057 supported_format = list(config['data'][0]['streamfileids'].keys())
3059 if format is None or format == 'best':
3060 if 'hd2' in supported_format:
3065 elif format == 'worst':
3073 fileid = config['data'][0]['streamfileids'][format]
3074 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3075 except (UnicodeDecodeError, ValueError, KeyError):
3076 self._downloader.report_error(u'unable to extract info section')
3080 sid = self._gen_sid()
3081 fileid = self._get_file_id(fileid, seed)
3083 #column 8,9 of fileid represent the segment number
3084 #fileid[7:9] should be changed
3085 for index, key in enumerate(keys):
# Patch the two segment-number hex digits into the decoded file id.
3087 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3088 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3091 'id': '%s_part%02d' % (video_id, index),
3092 'url': download_url,
3094 'upload_date': None,
3095 'title': video_title,
3098 files_info.append(info)
3103 class XNXXIE(InfoExtractor):
3104 """Information extractor for xnxx.com.

Scrapes the direct FLV url, page title and thumbnail out of the raw
watch-page HTML with the three class-level regexes below.
"""
3106 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Regexes applied to the downloaded watch page.
3108 VIDEO_URL_RE = r'flv_url=(.*?)&'
3109 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3110 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3112 def _real_extract(self, url):
# NOTE(review): the `if ... is None:`/`return` guard lines surrounding the
# report_error calls are missing from this excerpt (embedded numbers jump),
# as are the info-dict opener and final return.
3113 mobj = re.match(self._VALID_URL, url)
3115 self._downloader.report_error(u'invalid URL: %s' % url)
3117 video_id = mobj.group(1)
3119 # Get webpage content
3120 webpage = self._download_webpage(url, video_id)
# The flv url is percent-encoded inside the page, hence the unquote.
3122 result = re.search(self.VIDEO_URL_RE, webpage)
3124 self._downloader.report_error(u'unable to extract video url')
3126 video_url = compat_urllib_parse.unquote(result.group(1))
3128 result = re.search(self.VIDEO_TITLE_RE, webpage)
3130 self._downloader.report_error(u'unable to extract video title')
3132 video_title = result.group(1)
3134 result = re.search(self.VIDEO_THUMB_RE, webpage)
3136 self._downloader.report_error(u'unable to extract video thumbnail')
3138 video_thumbnail = result.group(1)
3144 'upload_date': None,
3145 'title': video_title,
3147 'thumbnail': video_thumbnail,
3148 'description': None,
3152 class GooglePlusIE(InfoExtractor):
3153 """Information extractor for plus.google.com."""
3155 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3156 IE_NAME = u'plus.google'
3158 def report_extract_entry(self, url):
3159 """Report that the post entry page is being downloaded."""
3160 self.to_screen(u'Downloading entry: %s' % url)
3162 def report_date(self, upload_date):
3163 """Report the extracted entry date."""
3164 self.to_screen(u'Entry date: %s' % upload_date)
3166 def report_uploader(self, uploader):
3167 """Report the extracted uploader name."""
3168 self.to_screen(u'Uploader: %s' % uploader)
3170 def report_title(self, video_title):
3171 """Report the extracted title."""
3172 self.to_screen(u'Title: %s' % video_title)
3174 def report_extract_vid_page(self, video_page):
3175 """Report information extraction."""
3176 self.to_screen(u'Extracting video page: %s' % video_page)
3178 def _real_extract(self, url):
# Two-step extraction: scrape metadata from the post page, then follow the
# photos-page link found in the post to get the actual video links.
# NOTE(review): `if mobj is None:` guards and several assignments are not
# visible in this excerpt (embedded numbers jump); fragment is not runnable.
3179 # Extract id from URL
3180 mobj = re.match(self._VALID_URL, url)
3182 self._downloader.report_error(u'Invalid URL: %s' % url)
3185 post_url = mobj.group(0)
3186 video_id = mobj.group(1)
3188 video_extension = 'flv'
3190 # Step 1, Retrieve post webpage to extract further information
3191 self.report_extract_entry(post_url)
3192 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3194 # Extract update date
3196 pattern = 'title="Timestamp">(.*?)</a>'
3197 mobj = re.search(pattern, webpage)
3199 upload_date = mobj.group(1)
3200 # Convert timestring to a format suitable for filename
3201 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3202 upload_date = upload_date.strftime('%Y%m%d')
3203 self.report_date(upload_date)
3207 pattern = r'rel\="author".*?>(.*?)</a>'
3208 mobj = re.search(pattern, webpage)
3210 uploader = mobj.group(1)
3211 self.report_uploader(uploader)
3214 # Get the first line for title
3216 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3217 mobj = re.search(pattern, webpage)
3219 video_title = mobj.group(1)
3220 self.report_title(video_title)
3222 # Step 2, Stimulate clicking the image box to launch video
3223 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3224 mobj = re.search(pattern, webpage)
3226 self._downloader.report_error(u'unable to extract video page URL')
3228 video_page = mobj.group(1)
3229 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3230 self.report_extract_vid_page(video_page)
3233 # Extract video links on video page
3234 """Extract video links of all sizes"""
3235 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3236 mobj = re.findall(pattern, webpage)
3238 self._downloader.report_error(u'unable to extract video links')
3240 # Sort in resolution
3241 links = sorted(mobj)
3243 # Choose the lowest of the sort, i.e. highest resolution
3244 video_url = links[-1]
3245 # Only get the url. The resolution part in the tuple has no use anymore
3246 video_url = video_url[-1]
3247 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError branch below
# round-trips through bytes to apply unicode-escape decoding.
3249 video_url = video_url.decode("unicode_escape")
3250 except AttributeError: # Python 3
3251 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3257 'uploader': uploader,
3258 'upload_date': upload_date,
3259 'title': video_title,
3260 'ext': video_extension,
3263 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages: the video url is built
# directly from the path, metadata is scraped from the page HTML.
3264 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3267 def _real_extract(self, url):
# NOTE(review): guard lines and parts of the returned info dict are not
# visible in this excerpt (embedded numbers jump).
3268 mobj = re.match(self._VALID_URL, url)
3270 self._downloader.report_error(u'invalid URL: %s' % url)
3273 video_id = mobj.group(1)
3274 if video_id.endswith('/index.html'):
3275 video_id = video_id[:-len('/index.html')]
3277 webpage = self._download_webpage(url, video_id)
# The CDN url is derived from the page path alone; the webpage is only
# needed for metadata scraping below.
3279 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Local helper: first regex group from the page, HTML-unescaped, or default.
3280 def _findProp(rexp, default=None):
3281 m = re.search(rexp, webpage)
3283 return unescapeHTML(m.group(1))
3287 shortened_video_id = video_id.rpartition('/')[2]
3288 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3290 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' does not match the 'upload_date' field
# documented on InfoExtractor — looks like a typo; confirm before relying on it.
3294 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3295 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3299 class JustinTVIE(InfoExtractor):
3300 """Information extractor for justin.tv and twitch.tv"""
3301 # TODO: One broadcast may be split into multiple videos. The key
3302 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3303 # starts at 1 and increases. Can we treat all parts as one video?
3305 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3306 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# Page size used when paging through a channel's archive via the API.
3307 _JUSTIN_PAGE_LIMIT = 100
3308 IE_NAME = u'justin.tv'
3310 def report_download_page(self, channel, offset):
3311 """Report attempt to download a single page of videos."""
3312 self.to_screen(u'%s: Downloading video information from %d to %d' %
3313 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3315 # Return count of items, list of *valid* items
3316 def _parse_page(self, url, video_id):
# Fetches one API page and converts each clip into an info dict.
# NOTE(review): `info = []` and the per-clip dict opener are not visible in
# this excerpt (embedded numbers jump).
3317 webpage = self._download_webpage(url, video_id,
3318 u'Downloading video info JSON',
3319 u'unable to download video info JSON')
3321 response = json.loads(webpage)
# A non-list response is the API's error envelope (a dict with 'error').
3322 if type(response) != list:
3323 error_text = response.get('error', 'unknown error')
3324 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3327 for clip in response:
3328 video_url = clip['video_file_url']
3330 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish 'YYYY-MM-DD...'; strip dashes to get YYYYMMDD.
3331 video_date = re.sub('-', '', clip['start_time'][:10])
3332 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3333 video_id = clip['id']
3334 video_title = clip.get('title', video_id)
3338 'title': video_title,
3339 'uploader': clip.get('channel_name', video_uploader_id),
3340 'uploader_id': video_uploader_id,
3341 'upload_date': video_date,
3342 'ext': video_extension,
3344 return (len(response), info)
3346 def _real_extract(self, url):
# Channel urls (group 1 only) are paged; /b/<id> broadcast urls are single.
# NOTE(review): guard lines, `paged`/`offset`/`info` initialisation and the
# loop opener are not visible in this excerpt.
3347 mobj = re.match(self._VALID_URL, url)
3349 self._downloader.report_error(u'invalid URL: %s' % url)
3352 api = 'http://api.justin.tv'
3353 video_id = mobj.group(mobj.lastindex)
3355 if mobj.lastindex == 1:
3357 api += '/channel/archives/%s.json'
3359 api += '/broadcast/by_archive/%s.json'
3360 api = api % (video_id,)
3362 self.report_extraction(video_id)
3366 limit = self._JUSTIN_PAGE_LIMIT
3369 self.report_download_page(video_id, offset)
3370 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3371 page_count, page_info = self._parse_page(page_url, video_id)
3372 info.extend(page_info)
# A short page means the archive is exhausted.
3373 if not paged or page_count != limit:
3378 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com: video url from the <source>
# tag, title from the player header (falling back to <title>).
3379 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3381 def _real_extract(self, url):
# NOTE(review): `if ... is None:` guards and the info-dict opener/return are
# not visible in this excerpt (embedded numbers jump).
3382 mobj = re.match(self._VALID_URL, url)
3384 self._downloader.report_error(u'invalid URL: %s' % url)
3387 video_id = mobj.group('id')
3388 webpage = self._download_webpage(url, video_id)
3390 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3392 self._downloader.report_error(u'unable to find video information')
3393 video_url = unescapeHTML(m.group('url'))
3395 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback title source when the player header is absent.
3397 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3399 self._downloader.report_error(u'Cannot find video title')
3400 title = clean_html(m.group('title'))
3402 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3404 desc = unescapeHTML(m.group('desc'))
3413 'description': desc,
3417 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com game/video pages.
# Bypasses the age gate by requesting the agecheck url with fixed values.
3418 _VALID_URL = r"""http://store\.steampowered\.com/
3420 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3422 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3426 def suitable(cls, url):
3427 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3428 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3430 def _real_extract(self, url):
# NOTE(review): the `gameID` group referenced below is not visible in the
# _VALID_URL fragment above, and `videos = []`/dict openers are missing —
# this excerpt skips lines (embedded numbers jump).
3431 m = re.match(self._VALID_URL, url, re.VERBOSE)
3432 gameID = m.group('gameID')
# Date values in the query string satisfy the age check unconditionally.
3433 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3434 self.report_age_confirmation()
3435 webpage = self._download_webpage(videourl, gameID)
3436 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3438 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3439 mweb = re.finditer(urlRE, webpage)
3440 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3441 titles = re.finditer(namesRE, webpage)
3442 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3443 thumbs = re.finditer(thumbsRE, webpage)
# The three scans are assumed to yield parallel sequences; zip pairs them.
3445 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3446 video_id = vid.group('videoID')
3447 title = vtitle.group('videoName')
3448 video_url = vid.group('videoURL')
3449 video_thumb = thumb.group('thumbnail')
3451 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3456 'title': unescapeHTML(title),
3457 'thumbnail': video_thumb
3460 return [self.playlist_result(videos, gameID, game_title)]
3462 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos: the video url is
# derived from the id; title and uploader are scraped from the page.
3463 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3464 IE_NAME = u'ustream'
3466 def _real_extract(self, url):
# NOTE(review): the returned info dict is only partially visible in this
# excerpt (embedded numbers jump 3474 -> 3480).
3467 m = re.match(self._VALID_URL, url)
3468 video_id = m.group('videoID')
3469 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3470 webpage = self._download_webpage(url, video_id)
3471 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3472 title = m.group('title')
# Uploader id comes from the channel link's data attribute.
3473 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3474 uploader = m.group('uploader')
3480 'uploader': uploader
3484 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com (and the candy variant):
# grabs the first mp4/flv CDN url found in the page source.
3485 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3486 IE_NAME = u'WorldStarHipHop'
3488 def _real_extract(self, url):
# NOTE(review): several branch bodies (ext assignment, else-branches,
# info-dict opener/return) are not visible in this excerpt.
3489 _src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""
3491 m = re.match(self._VALID_URL, url)
3492 video_id = m.group('id')
3494 webpage_src = self._download_webpage(url, video_id)
3496 mobj = re.search(_src_url, webpage_src)
3498 if mobj is not None:
3499 video_url = mobj.group()
3500 if 'mp4' in video_url:
3505 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3508 _title = r"""<title>(.*)</title>"""
3510 mobj = re.search(_title, webpage_src)
3512 if mobj is not None:
3513 title = mobj.group(1)
# Fallback title when the page has no usable <title>.
3515 title = 'World Start Hip Hop - %s' % time.ctime()
3517 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3518 mobj = re.search(_thumbnail, webpage_src)
3520 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3521 if mobj is not None:
3522 thumbnail = mobj.group(1)
# Candy pages keep the real title in a candytitles span instead.
3524 _title = r"""candytitles.*>(.*)</span>"""
3525 mobj = re.search(_title, webpage_src)
3526 if mobj is not None:
3527 title = mobj.group(1)
3534 'thumbnail' : thumbnail,
3539 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows: metadata comes from an
# inline JSON blob (window.gon) embedded in the page.
3540 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3542 def _real_extract(self, url):
# NOTE(review): the `if m is None:`/`try:` openers and the info-dict opener
# are not visible in this excerpt (embedded numbers jump).
3543 m = re.match(self._VALID_URL, url)
3544 video_id = m.group('videoID')
3546 webpage = self._download_webpage(url, video_id)
3547 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3549 raise ExtractorError(u'Cannot find metadata')
3550 json_data = m.group(1)
3553 data = json.loads(json_data)
3554 except ValueError as e:
3555 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256kbps stream variant.
3557 video_url = data['akamai_url'] + '&cbr=256'
3558 url_parts = compat_urllib_parse_urlparse(video_url)
3559 video_ext = url_parts.path.rpartition('.')[2]
3564 'title': data['title'],
3565 'description': data.get('teaser_text'),
3566 'location': data.get('country_of_origin'),
3567 'uploader': data.get('host', {}).get('name'),
3568 'uploader_id': data.get('host', {}).get('slug'),
3569 'thumbnail': data.get('image', {}).get('large_url_2x'),
3570 'duration': data.get('duration'),
3575 class YouPornIE(InfoExtractor):
3576 """Information extractor for youporn.com."""
3577 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3579 def _print_formats(self, formats):
3580 """Print all available formats"""
3581 print(u'Available formats:')
3582 print(u'ext\t\tformat')
3583 print(u'---------------------------------')
3584 for format in formats:
3585 print(u'%s\t\t%s' % (format['ext'], format['format']))
3587 def _specific(self, req_format, formats):
# Returns the single format entry whose 'format' string equals req_format.
# NOTE(review): the loop header over `formats` and the fallback return are
# not visible in this excerpt.
3589 if(x["format"]==req_format):
3593 def _real_extract(self, url):
# Scrapes the download list and builds one info dict per available format;
# the final selection honours --format / --list-formats.
# NOTE(review): many guard lines, the per-link loop opener and parts of the
# per-format info dict are not visible in this excerpt.
3594 mobj = re.match(self._VALID_URL, url)
3596 self._downloader.report_error(u'invalid URL: %s' % url)
3599 video_id = mobj.group('videoid')
# Age-gate cookie lets us fetch the real watch page.
3601 req = compat_urllib_request.Request(url)
3602 req.add_header('Cookie', 'age_verified=1')
3603 webpage = self._download_webpage(req, video_id)
3605 # Get the video title
3606 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3608 raise ExtractorError(u'Unable to extract video title')
3609 video_title = result.group('title').strip()
3611 # Get the video date
3612 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3614 self._downloader.report_warning(u'unable to extract video date')
3617 upload_date = unified_strdate(result.group('date').strip())
3619 # Get the video uploader
3620 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3622 self._downloader.report_warning(u'unable to extract uploader')
3623 video_uploader = None
3625 video_uploader = result.group('uploader').strip()
3626 video_uploader = clean_html( video_uploader )
3628 # Get all of the formats available
3629 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3630 result = re.search(DOWNLOAD_LIST_RE, webpage)
3632 raise ExtractorError(u'Unable to extract download list')
3633 download_list_html = result.group('download_list').strip()
3635 # Get all of the links from the page
3636 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3637 links = re.findall(LINK_RE, download_list_html)
3638 if(len(links) == 0):
3639 raise ExtractorError(u'ERROR: no known formats available for video')
3641 self.to_screen(u'Links found: %d' % len(links))
3646 # A link looks like this:
3647 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3648 # A path looks like this:
3649 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3650 video_url = unescapeHTML( link )
3651 path = compat_urllib_parse_urlparse( video_url ).path
3652 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_<id>"; keep the first two parts.
3653 format = path.split('/')[4].split('_')[:2]
3656 format = "-".join( format )
3657 title = u'%s-%s-%s' % (video_title, size, bitrate)
3662 'uploader': video_uploader,
3663 'upload_date': upload_date,
3668 'description': None,
3672 if self._downloader.params.get('listformats', None):
3673 self._print_formats(formats)
3676 req_format = self._downloader.params.get('format', None)
3677 self.to_screen(u'Format: %s' % req_format)
# Formats are assumed sorted best-first: [0] is best, [-1] is worst.
3679 if req_format is None or req_format == 'best':
3681 elif req_format == 'worst':
3682 return [formats[-1]]
3683 elif req_format in ('-1', 'all'):
3686 format = self._specific( req_format, formats )
3688 self._downloader.report_error(u'requested format not available')
3694 class PornotubeIE(InfoExtractor):
3695 """Information extractor for pornotube.com."""
3696 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3698 def _real_extract(self, url):
# Title comes from the URL itself; the flv url and upload date are scraped
# from the page HTML.
# NOTE(review): `if ... is None:` guard lines and the remainder of the info
# dict/return are not visible in this excerpt.
3699 mobj = re.match(self._VALID_URL, url)
3701 self._downloader.report_error(u'invalid URL: %s' % url)
3704 video_id = mobj.group('videoid')
3705 video_title = mobj.group('title')
3707 # Get webpage content
3708 webpage = self._download_webpage(url, video_id)
3711 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3712 result = re.search(VIDEO_URL_RE, webpage)
3714 self._downloader.report_error(u'unable to extract video url')
3716 video_url = compat_urllib_parse.unquote(result.group('url'))
3718 #Get the uploaded date
3719 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3720 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says "title" but the failed regex above
# is the upload-date one — message looks copy-pasted; confirm and fix upstream.
3722 self._downloader.report_error(u'unable to extract video title')
3724 upload_date = unified_strdate(result.group('date'))
3726 info = {'id': video_id,
3729 'upload_date': upload_date,
3730 'title': video_title,
3736 class YouJizzIE(InfoExtractor):
3737 """Information extractor for youjizz.com."""
3738 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3740 def _real_extract(self, url):
# Two fetches: the watch page (for the title and the embed-page url), then
# the embed page (for the actual video source).
# NOTE(review): `if ... is None:` guard lines and parts of the info dict are
# not visible in this excerpt.
3741 mobj = re.match(self._VALID_URL, url)
3743 self._downloader.report_error(u'invalid URL: %s' % url)
3746 video_id = mobj.group('videoid')
3748 # Get webpage content
3749 webpage = self._download_webpage(url, video_id)
3751 # Get the video title
3752 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3754 raise ExtractorError(u'ERROR: unable to extract video title')
3755 video_title = result.group('title').strip()
3757 # Get the embed page
3758 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3760 raise ExtractorError(u'ERROR: unable to extract embed page')
3762 embed_page_url = result.group(0).strip()
# Note: video_id is rebound to the numeric embed id from here on.
3763 video_id = result.group('videoid')
3765 webpage = self._download_webpage(embed_page_url, video_id)
3768 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3770 raise ExtractorError(u'ERROR: unable to extract video url')
3771 video_url = result.group('source')
3773 info = {'id': video_id,
3775 'title': video_title,
3778 'player_url': embed_page_url}
3782 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes: walks the play/next API
# until the last track is reached, yielding one entry per song.
3784 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3786 def _real_extract(self, url):
# NOTE(review): the `mix_id` assignment used below, `res = []`, the per-track
# dict opener and the final return are not visible in this excerpt.
3787 mobj = re.match(self._VALID_URL, url)
3789 raise ExtractorError(u'Invalid URL: %s' % url)
3790 playlist_id = mobj.group('id')
3792 webpage = self._download_webpage(url, playlist_id)
3794 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3796 raise ExtractorError(u'Cannot find trax information')
3797 json_like = m.group(1)
3798 data = json.loads(json_like)
# Random session id keeps the server-side play cursor separate per run.
3800 session = str(random.randint(0, 1000000000))
3802 track_count = data['tracks_count']
3803 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3804 next_url = first_url
3806 for i in itertools.count():
3807 api_json = self._download_webpage(next_url, playlist_id,
3808 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3809 errnote=u'Failed to download song information')
3810 api_data = json.loads(api_json)
3811 track_data = api_data[u'set']['track']
3813 'id': track_data['id'],
3814 'url': track_data['track_file_stream_url'],
3815 'title': track_data['performer'] + u' - ' + track_data['name'],
3816 'raw_title': track_data['name'],
3817 'uploader_id': data['user']['login'],
# Stop once the API flags the final track; otherwise follow the next-track url.
3821 if api_data['set']['at_last_track']:
3823 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3826 class KeekIE(InfoExtractor):
# Information extractor for keek.com: video and thumbnail urls are derived
# from the id; title/uploader are scraped from the page.
3827 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3830 def _real_extract(self, url):
# NOTE(review): the info-dict opener and several fields are not visible in
# this excerpt (embedded numbers jump 3839 -> 3845).
3831 m = re.match(self._VALID_URL, url)
3832 video_id = m.group('videoID')
3833 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3834 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3835 webpage = self._download_webpage(url, video_id)
3836 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3837 title = unescapeHTML(m.group('title'))
3838 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3839 uploader = clean_html(m.group('uploader'))
3845 'thumbnail': thumbnail,
3846 'uploader': uploader
3850 class TEDIE(InfoExtractor):
# Information extractor for ted.com: handles both single talks and
# playlists (each playlist entry is delegated back to this IE by url).
3851 _VALID_URL=r'''http://www\.ted\.com/
3853 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3855 ((?P<type_talk>talks)) # We have a simple talk
3857 /(?P<name>\w+) # Here goes the name and then ".html"
3861 def suitable(cls, url):
3862 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3863 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3865 def _real_extract(self, url):
3866 m=re.match(self._VALID_URL, url, re.VERBOSE)
3867 if m.group('type_talk'):
3868 return [self._talk_info(url)]
3870 playlist_id=m.group('playlist_id')
3871 name=m.group('name')
3872 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3873 return [self._playlist_videos_info(url,name,playlist_id)]
3875 def _talk_video_link(self,mediaSlug):
3876 '''Returns the video link for that mediaSlug'''
3877 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3879 def _playlist_videos_info(self,url,name,playlist_id=0):
3880 '''Returns the videos of the playlist'''
# NOTE(review): the opening of the video_RE raw string (its r''' prefix
# line) is not visible in this excerpt.
3882 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3883 ([.\s]*?)data-playlist_item_id="(\d+)"
3884 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3886 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3887 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3888 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3889 m_names=re.finditer(video_name_RE,webpage)
3891 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3892 m_playlist = re.search(playlist_RE, webpage)
3893 playlist_title = m_playlist.group('playlist_title')
3895 playlist_entries = []
# Each entry is returned as a url_result so TEDIE is re-entered per talk.
3896 for m_video, m_name in zip(m_videos,m_names):
3897 video_id=m_video.group('video_id')
3898 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3899 playlist_entries.append(self.url_result(talk_url, 'TED'))
3900 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3902 def _talk_info(self, url, video_id=0):
3903 """Return the video for the talk in the url"""
3904 m=re.match(self._VALID_URL, url,re.VERBOSE)
3905 videoName=m.group('name')
3906 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3907 # If the url includes the language we get the title translated
3908 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3909 title=re.search(title_RE, webpage).group('title')
3910 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3911 "id":(?P<videoID>[\d]+).*?
3912 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3913 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3914 thumb_match=re.search(thumb_RE,webpage)
3915 info_match=re.search(info_RE,webpage,re.VERBOSE)
3916 video_id=info_match.group('videoID')
3917 mediaSlug=info_match.group('mediaSlug')
3918 video_url=self._talk_video_link(mediaSlug)
# NOTE(review): the info-dict opener and remaining fields are not visible
# in this excerpt (embedded numbers jump 3918 -> 3924).
3924 'thumbnail': thumb_match.group('thumbnail')
3928 class MySpassIE(InfoExtractor):
# Information extractor for myspass.de: all metadata comes from the site's
# XML metadata endpoint, parsed with xml.etree.
3929 _VALID_URL = r'http://www.myspass.de/.*'
3931 def _real_extract(self, url):
# NOTE(review): several guard/else lines and the info-dict opener are not
# visible in this excerpt (embedded numbers jump).
3932 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3934 # video id is the last path element of the URL
3935 # usually there is a trailing slash, so also try the second but last
3936 url_path = compat_urllib_parse_urlparse(url).path
3937 url_parent_path, video_id = os.path.split(url_path)
3939 _, video_id = os.path.split(url_parent_path)
3942 metadata_url = META_DATA_URL_TEMPLATE % video_id
3943 metadata_text = self._download_webpage(metadata_url, video_id)
3944 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3946 # extract values from metadata
3947 url_flv_el = metadata.find('url_flv')
3948 if url_flv_el is None:
3949 self._downloader.report_error(u'unable to extract download url')
3951 video_url = url_flv_el.text
3952 extension = os.path.splitext(video_url)[1][1:]
3953 title_el = metadata.find('title')
3954 if title_el is None:
3955 self._downloader.report_error(u'unable to extract title')
3957 title = title_el.text
3958 format_id_el = metadata.find('format_id')
3959 if format_id_el is None:
3962 format = format_id_el.text
# description and imagePreview are optional elements in the metadata XML.
3963 description_el = metadata.find('description')
3964 if description_el is not None:
3965 description = description_el.text
3968 imagePreview_el = metadata.find('imagePreview')
3969 if imagePreview_el is not None:
3970 thumbnail = imagePreview_el.text
3979 'thumbnail': thumbnail,
3980 'description': description
3984 class SpiegelIE(InfoExtractor):
# Information extractor for spiegel.de videos: the title comes from the
# page HTML, the stream filename/duration from a per-video XML document.
3985 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3987 def _real_extract(self, url):
# NOTE(review): the `if m is None:` guard and the info-dict opener/return
# are not visible in this excerpt.
3988 m = re.match(self._VALID_URL, url)
3989 video_id = m.group('videoID')
3991 webpage = self._download_webpage(url, video_id)
3992 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3994 raise ExtractorError(u'Cannot find title')
3995 video_title = unescapeHTML(m.group(1))
3997 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3998 xml_code = self._download_webpage(xml_url, video_id,
3999 note=u'Downloading XML', errnote=u'Failed to download XML')
4001 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last <type> element in the document is taken as the preferred variant.
4002 last_type = idoc[-1]
4003 filename = last_type.findall('./filename')[0].text
4004 duration = float(last_type.findall('./duration')[0].text)
4006 video_url = 'http://video2.spiegel.de/flash/' + filename
4007 video_ext = filename.rpartition('.')[2]
4012 'title': video_title,
4013 'duration': duration,
4017 class LiveLeakIE(InfoExtractor):
# Information extractor for liveleak.com view pages.
# NOTE(review): `(?:http?://)?` matches "http://" or "htt://" but NOT
# "https://" — looks like a typo for `https?`; confirm before fixing.
4019 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4020 IE_NAME = u'liveleak'
4022 def _real_extract(self, url):
# NOTE(review): `if ... is None:` guards, else-branches and the info-dict
# opener/return are not visible in this excerpt.
4023 mobj = re.match(self._VALID_URL, url)
4025 self._downloader.report_error(u'invalid URL: %s' % url)
4028 video_id = mobj.group('video_id')
4030 webpage = self._download_webpage(url, video_id)
4032 m = re.search(r'file: "(.*?)",', webpage)
4034 self._downloader.report_error(u'unable to find video url')
4036 video_url = m.group(1)
4038 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4040 self._downloader.report_error(u'Cannot find video title')
# Strip the site's branding prefix from the og:title value.
4041 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4043 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4045 desc = unescapeHTML(m.group('desc'))
4049 m = re.search(r'By:.*?(\w+)</a>', webpage)
4051 uploader = clean_html(m.group(1))
4060 'description': desc,
4061 'uploader': uploader
4066 class ARDIE(InfoExtractor):
# Information extractor for the ARD Mediathek / daserste.de: picks the
# highest-quality media_type-0 stream; supports both RTMP and plain HTTP.
4067 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4068 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4069 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4071 def _real_extract(self, url):
# NOTE(review): the `if numid:`/else lines and the `if not streams:` guard
# around the fsk assertion, plus the final return, are not visible in this
# excerpt (embedded numbers jump).
4072 # determine video id from url
4073 m = re.match(self._VALID_URL, url)
# Prefer the numeric documentId query parameter when present.
4075 numid = re.search(r'documentId=([0-9]+)', url)
4077 video_id = numid.group(1)
4079 video_id = m.group('video_id')
4081 # determine title and media streams from webpage
4082 html = self._download_webpage(url, video_id)
4083 title = re.search(self._TITLE, html).group('title')
4084 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker means the video is age-restricted.
4086 assert '"fsk"' in html
4087 self._downloader.report_error(u'this video is only available after 8:00 pm')
4090 # choose default media type and highest quality for now
4091 stream = max([s for s in streams if int(s["media_type"]) == 0],
4092 key=lambda s: int(s["quality"]))
4094 # there's two possibilities: RTMP stream or HTTP download
4095 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4096 if stream['rtmp_url']:
4097 self.to_screen(u'RTMP download detected')
4098 assert stream['video_url'].startswith('mp4:')
4099 info["url"] = stream["rtmp_url"]
4100 info["play_path"] = stream['video_url']
4102 assert stream["video_url"].endswith('.mp4')
4103 info["url"] = stream["video_url"]
4106 class TumblrIE(InfoExtractor):
# Information extractor for tumblr.com video posts: the escaped video url
# is embedded in the post page as \x22-quoted markup.
4107 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4109 def _real_extract(self, url):
# NOTE(review): the early-return after "No video founded" and the rest of
# the returned dict are not visible in this excerpt.
4110 m_url = re.match(self._VALID_URL, url)
4111 video_id = m_url.group('id')
4112 blog = m_url.group('blog_name')
# Normalise to the canonical /post/ url before downloading.
4114 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4115 webpage = self._download_webpage(url, video_id)
4117 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4118 video = re.search(re_video, webpage)
4120 self.to_screen("No video founded")
4122 video_url = video.group('video_url')
4123 ext = video.group('ext')
4125 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4126 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4128 # The only place where you can get a title, it's not complete,
4129 # but searching in other places doesn't work for all videos
4130 re_title = r'<title>(?P<title>.*?)</title>'
4131 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4133 return [{'id': video_id,
4140 class BandcampIE(InfoExtractor):
# Information extractor for bandcamp.com free tracks: follows the free
# download page and rebuilds the statdownload url to obtain the mp3-320.
4141 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4143 def _real_extract(self, url):
# NOTE(review): `id` below shadows the builtin; parts of the track_info
# dict and the final return are not visible in this excerpt.
4144 mobj = re.match(self._VALID_URL, url)
4145 title = mobj.group('title')
4146 webpage = self._download_webpage(url, title)
4147 # We get the link to the free download page
4148 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4149 if m_download is None:
4150 self._downloader.report_error('No free songs founded')
4152 download_link = m_download.group(1)
4153 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4154 webpage, re.MULTILINE|re.DOTALL).group('id')
4156 download_webpage = self._download_webpage(download_link, id,
4157 'Downloading free downloads page')
4158 # We get the dictionary of the track from some javascrip code
4159 info = re.search(r'items: (.*?),$',
4160 download_webpage, re.MULTILINE).group(1)
4161 info = json.loads(info)[0]
4162 # We pick mp3-320 for now, until format selection can be easily implemented.
4163 mp3_info = info[u'downloads'][u'mp3-320']
4164 # If we try to use this url it says the link has expired
4165 initial_url = mp3_info[u'url']
4166 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4167 m_url = re.match(re_url, initial_url)
4168 #We build the url we will use to get the final track url
4169 # This url is build in Bandcamp in the script download_bunde_*.js
4170 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4171 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4172 # If we could correctly generate the .rand field the url would be
4173 #in the "download_url" key
4174 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4176 track_info = {'id':id,
4177 'title' : info[u'title'],
4180 'thumbnail' : info[u'thumb_url'],
4181 'uploader' : info[u'artist']
4187 def gen_extractors():
4188 """ Return a list of an instance of every supported extractor.
4189 The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): only a handful of the list entries (and neither the list
# opener nor the closing `]`/return) are visible in this excerpt — the
# embedded line numbers jump from 4192 to 4217 to 4227.
4192 YoutubePlaylistIE(),
4217 StanfordOpenClassroomIE(),
4227 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    Maps e.g. ``'Youku'`` to the module-level ``YoukuIE`` class by
    consulting this module's global namespace.  Raises KeyError when no
    class with that name exists.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]