2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# Abstract base class for all site-specific information extractors (IEs).
# NOTE(review): this paste is elided — interior lines are missing throughout;
# the code tokens below are kept byte-identical to the original.
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The .srt file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
# Takes `cls` — presumably decorated @classmethod in the full file
# (decorator line not visible here); matches url against _VALID_URL.
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
# Template-method entry point: initialization happens elsewhere,
# the site-specific work is delegated to _real_extract().
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# Derives the IE name from the class name by dropping the trailing "IE".
112 return type(self).__name__[:-2]
# Wraps urlopen with progress reporting; network failures are converted
# into ExtractorError (with the original traceback attached).
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 note = u'Downloading video webpage'
118 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
120 return compat_urllib_request.urlopen(url_or_request)
121 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
123 errnote = u'Unable to download webpage'
124 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
# Convenience wrapper: fetch and decode as UTF-8, replacing bad bytes
# rather than raising on malformed input.
126 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
127 """ Returns the data of the page as a string """
128 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
129 webpage_bytes = urlh.read()
130 return webpage_bytes.decode('utf-8', 'replace')
132 #Methods for following #608
133 #They set the correct value of the '_type' key
134 def video_result(self, video_info):
135 """Returns a video"""
136 video_info['_type'] = 'video'
138 def url_result(self, url, ie=None):
139 """Returns a url that points to a page that should be processed"""
140 #TODO: ie should be the class used for getting the info
141 video_info = {'_type': 'url',
144 def playlist_result(self, entries):
145 """Returns a playlist"""
146 video_info = {'_type': 'playlist',
151 class YoutubeIE(InfoExtractor):
152 """Information extractor for youtube.com."""
# _VALID_URL is a verbose-mode regex (see suitable(): matched with re.VERBOSE).
156 (?:https?://)? # http(s):// (optional)
157 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
158 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
159 (?:.*?\#/)? # handle anchor (#/) redirect urls
160 (?: # the various things that can precede the ID:
161 (?:(?:v|embed|e)/) # v/ or embed/ or e/
162 |(?: # or the v= param in all its forms
163 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
164 (?:\?|\#!?) # the params delimiter ? or # or #!
165 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
168 )? # optional -> youtube.com/xxxx is OK
169 )? # all until now is optional -> you can pass the naked ID
170 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
171 (?(1).+)? # if we found the ID, everything can follow
173 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
174 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
175 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
176 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
177 _NETRC_MACHINE = 'youtube'
178 # Listed in order of quality
179 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
180 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# Maps itag -> file extension; _video_dimensions maps itag -> "WxH" label.
181 _video_extensions = {
187 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
193 _video_dimensions = {
# Takes `cls` — presumably @classmethod in the full file (decorator elided).
# Playlist URLs are deferred to YoutubePlaylistIE so they are not claimed here.
212 def suitable(cls, url):
213 """Receives a URL and returns True if suitable for this IE."""
214 if YoutubePlaylistIE.suitable(url): return False
215 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
217 def report_lang(self):
218 """Report attempt to set language."""
219 self._downloader.to_screen(u'[youtube] Setting language')
221 def report_login(self):
222 """Report attempt to log in."""
223 self._downloader.to_screen(u'[youtube] Logging in')
225 def report_age_confirmation(self):
226 """Report attempt to confirm age."""
227 self._downloader.to_screen(u'[youtube] Confirming age')
229 def report_video_webpage_download(self, video_id):
230 """Report attempt to download video webpage."""
231 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
233 def report_video_info_webpage_download(self, video_id):
234 """Report attempt to download video info webpage."""
235 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
237 def report_video_subtitles_download(self, video_id):
238 """Report attempt to download video subtitles."""
239 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
241 def report_information_extraction(self, video_id):
242 """Report attempt to extract video information."""
243 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
245 def report_unavailable_format(self, video_id, format):
246 """Report that the requested format is not available."""
247 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
249 def report_rtmp_download(self):
250 """Indicate the download will use the RTMP protocol."""
251 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's timed-text XML into SRT. Captions without a dur
# attribute default to 4 seconds; timestamps are formatted HH:MM:SS,mmm.
253 def _closed_captions_xml_to_srt(self, xml_string):
255 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
256 # TODO parse xml instead of regex
257 for n, (start, dur_tag, dur, caption) in enumerate(texts):
258 if not dur: dur = '4'
260 end = start + float(dur)
261 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
262 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
263 caption = unescapeHTML(caption)
264 caption = unescapeHTML(caption) # double cycle, intentional
265 srt += str(n+1) + '\n'
266 srt += start + ' --> ' + end + '\n'
267 srt += caption + '\n\n'
# Returns a (warning_message, srt_contents) pair: exactly one side is None.
# Language preference: --subtitleslang param, then 'en', then first listed.
270 def _extract_subtitles(self, video_id):
271 self.report_video_subtitles_download(video_id)
272 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
274 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
275 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
276 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
277 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
278 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
279 if not srt_lang_list:
280 return (u'WARNING: video has no closed captions', None)
281 if self._downloader.params.get('subtitleslang', False):
282 srt_lang = self._downloader.params.get('subtitleslang')
283 elif 'en' in srt_lang_list:
286 srt_lang = list(srt_lang_list.keys())[0]
287 if not srt_lang in srt_lang_list:
288 return (u'WARNING: no closed captions found in the specified language', None)
289 params = compat_urllib_parse.urlencode({
291 'name': srt_lang_list[srt_lang].encode('utf-8'),
294 url = 'http://www.youtube.com/api/timedtext?' + params
296 srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
297 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
298 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
300 return (u'WARNING: Did not fetch video subtitles', None)
301 return (None, self._closed_captions_xml_to_srt(srt_xml))
# Prints one line per itag with its extension and dimensions (--list-formats).
303 def _print_formats(self, formats):
304 print('Available formats:')
306 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Initialization: pick credentials (params or .netrc), force the English
# site language, log in via the Google ServiceLogin form, confirm age.
308 def _real_initialize(self):
309 if self._downloader is None:
314 downloader_params = self._downloader.params
316 # Attempt to use provided username and password or .netrc data
317 if downloader_params.get('username', None) is not None:
318 username = downloader_params['username']
319 password = downloader_params['password']
320 elif downloader_params.get('usenetrc', False):
322 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
327 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
328 except (IOError, netrc.NetrcParseError) as err:
329 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set the site language to English (best-effort; failure only warns).
333 request = compat_urllib_request.Request(self._LANG_URL)
336 compat_urllib_request.urlopen(request).read()
337 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
338 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
341 # No authentication to be performed
# Scrape the GALX and dsh hidden fields from the login page; they must be
# posted back with the credentials.
345 request = compat_urllib_request.Request(self._LOGIN_URL)
347 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
348 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
349 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
354 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
356 galx = match.group(1)
358 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
364 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
368 u'PersistentCookie': u'yes',
370 u'bgresponse': u'js_disabled',
371 u'checkConnection': u'',
372 u'checkedDomains': u'youtube',
378 u'signIn': u'Sign in',
380 u'service': u'youtube',
384 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
386 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
387 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
388 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
# A response that still contains the login form means credentials failed.
391 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
392 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
393 self._downloader.report_warning(u'unable to log in: bad username or password')
395 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
396 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm age by POSTing the verification form.
402 'action_confirm': 'Confirm',
404 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
406 self.report_age_confirmation()
407 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
408 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
409 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
# Extracts the 11-char video id (regex group 2) from any accepted URL form.
412 def _extract_id(self, url):
413 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
415 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
417 video_id = mobj.group(2)
420 def _real_extract(self, url):
421 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
422 mobj = re.search(self._NEXT_URL_RE, url)
424 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
425 video_id = self._extract_id(url)
# Fetch the watch page (gl/hl pinned, has_verified bypasses interstitials).
428 self.report_video_webpage_download(video_id)
429 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
430 request = compat_urllib_request.Request(url)
432 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
433 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
434 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
437 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
439 # Attempt to extract SWF player URL
440 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
442 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try get_video_info with several &el= variants until one yields a token.
447 self.report_video_info_webpage_download(video_id)
448 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
449 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
450 % (video_id, el_type))
451 request = compat_urllib_request.Request(video_info_url)
453 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
454 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
455 video_info = compat_parse_qs(video_info_webpage)
456 if 'token' in video_info:
458 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
459 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
461 if 'token' not in video_info:
462 if 'reason' in video_info:
463 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
465 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
468 # Check for "rental" videos
469 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
470 self._downloader.trouble(u'ERROR: "rental" videos not supported')
473 # Start extracting information
474 self.report_information_extraction(video_id)
# uploader (required field)
477 if 'author' not in video_info:
478 self._downloader.trouble(u'ERROR: unable to extract uploader name')
480 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader id/nickname (optional; scraped from the watch page markup)
483 video_uploader_id = None
484 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
486 video_uploader_id = mobj.group(1)
488 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
# title (required field)
491 if 'title' not in video_info:
492 self._downloader.trouble(u'ERROR: unable to extract video title')
494 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
497 if 'thumbnail_url' not in video_info:
498 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
500 else: # don't panic if we can't find it
501 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the page, normalized to YYYYMMDD by trying
# several textual date formats in order.
505 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
507 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
508 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
509 for expression in format_expressions:
511 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
516 video_description = get_element_by_id("eow-description", video_webpage)
517 if video_description:
518 video_description = clean_html(video_description)
520 video_description = ''
# subtitles (only when --write-srt was requested)
523 video_subtitles = None
524 if self._downloader.params.get('writesubtitles', False):
525 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
527 self._downloader.trouble(srt_error)
529 if 'length_seconds' not in video_info:
530 self._downloader.trouble(u'WARNING: unable to extract video duration')
533 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
536 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
538 # Decide which formats to download
539 req_format = self._downloader.params.get('format', None)
541 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
542 self.report_rtmp_download()
543 video_url_list = [(None, video_info['conn'][0])]
544 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
545 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
546 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
547 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
# NOTE(review): ud['sig'] is read here but the filter above only checks
# 'itag' and 'url' — an entry without 'sig' would raise KeyError; verify.
548 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
550 format_limit = self._downloader.params.get('format_limit', None)
551 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
552 if format_limit is not None and format_limit in available_formats:
553 format_list = available_formats[available_formats.index(format_limit):]
555 format_list = available_formats
556 existing_formats = [x for x in format_list if x in url_map]
557 if len(existing_formats) == 0:
558 self._downloader.trouble(u'ERROR: no known formats available for video')
560 if self._downloader.params.get('listformats', None):
561 self._print_formats(existing_formats)
563 if req_format is None or req_format == 'best':
564 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
565 elif req_format == 'worst':
566 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
567 elif req_format in ('-1', 'all'):
568 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
570 # Specific formats. We pick the first in a slash-delimited sequence.
571 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
572 req_formats = req_format.split('/')
573 video_url_list = None
574 for rf in req_formats:
576 video_url_list = [(rf, url_map[rf])]
578 if video_url_list is None:
579 self._downloader.trouble(u'ERROR: requested format not available')
582 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected (itag, url) pair.
586 for format_param, video_real_url in video_url_list:
588 video_extension = self._video_extensions.get(format_param, 'flv')
590 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
591 self._video_dimensions.get(format_param, '???'))
595 'url': video_real_url,
596 'uploader': video_uploader,
597 'uploader_id': video_uploader_id,
598 'upload_date': upload_date,
599 'title': video_title,
600 'ext': video_extension,
601 'format': video_format,
602 'thumbnail': video_thumbnail,
603 'description': video_description,
604 'player_url': player_url,
605 'subtitles': video_subtitles,
606 'duration': video_duration
611 class MetacafeIE(InfoExtractor):
612 """Information Extractor for metacafe.com."""
614 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
615 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
616 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
617 IE_NAME = u'metacafe'
619 def __init__(self, downloader=None):
620 InfoExtractor.__init__(self, downloader)
622 def report_disclaimer(self):
623 """Report disclaimer retrieval."""
624 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
626 def report_age_confirmation(self):
627 """Report attempt to confirm age."""
628 self._downloader.to_screen(u'[metacafe] Confirming age')
630 def report_download_webpage(self, video_id):
631 """Report webpage download."""
632 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
634 def report_extraction(self, video_id):
635 """Report information extraction."""
636 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the family-filter disclaimer page, then POSTs the confirmation
# form so subsequent requests are not blocked by the age filter.
638 def _real_initialize(self):
639 # Retrieve disclaimer
640 request = compat_urllib_request.Request(self._DISCLAIMER)
642 self.report_disclaimer()
643 disclaimer = compat_urllib_request.urlopen(request).read()
644 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
645 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
651 'submit': "Continue - I'm over 18",
653 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
655 self.report_age_confirmation()
656 disclaimer = compat_urllib_request.urlopen(request).read()
657 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
658 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
661 def _real_extract(self, url):
662 # Extract id and simplified title from URL
663 mobj = re.match(self._VALID_URL, url)
665 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
668 video_id = mobj.group(1)
670 # Check if video comes from YouTube
# Ids of the form "yt-XXXX" are mirrored YouTube videos: hand the
# corresponding watch URL back to the downloader instead of scraping here.
671 mobj2 = re.match(r'^yt-(.*)$', video_id)
672 if mobj2 is not None:
673 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
676 # Retrieve video webpage to extract further information
677 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
679 self.report_download_webpage(video_id)
680 webpage = compat_urllib_request.urlopen(request).read()
681 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): the error string reads "unable retrieve" — a grammar slip
# in a runtime message; left untouched here since it is program output.
682 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
685 # Extract URL, uploader and title from webpage
686 self.report_extraction(video_id)
687 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
689 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
690 video_extension = mediaURL[-3:]
692 # Extract gdaKey if available
693 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
697 gdaKey = mobj.group(1)
698 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars attribute and pull the media URL
# and key out of the embedded mediaData JSON-ish blob.
700 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
702 self._downloader.trouble(u'ERROR: unable to extract media URL')
704 vardict = compat_parse_qs(mobj.group(1))
705 if 'mediaData' not in vardict:
706 self._downloader.trouble(u'ERROR: unable to extract media URL')
708 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
710 self._downloader.trouble(u'ERROR: unable to extract media URL')
712 mediaURL = mobj.group(1).replace('\\/', '/')
713 video_extension = mediaURL[-3:]
714 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
716 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
718 self._downloader.trouble(u'ERROR: unable to extract title')
# .decode('utf-8') on these values: the webpage is read as bytes here
# (no prior decode), so the matches are byte strings — Python-2-era code.
720 video_title = mobj.group(1).decode('utf-8')
722 mobj = re.search(r'submitter=(.*?);', webpage)
724 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
726 video_uploader = mobj.group(1)
729 'id': video_id.decode('utf-8'),
730 'url': video_url.decode('utf-8'),
731 'uploader': video_uploader.decode('utf-8'),
733 'title': video_title,
734 'ext': video_extension.decode('utf-8'),
738 class DailymotionIE(InfoExtractor):
739 """Information Extractor for Dailymotion"""
741 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
742 IE_NAME = u'dailymotion'
745 def __init__(self, downloader=None):
746 InfoExtractor.__init__(self, downloader)
748 def report_extraction(self, video_id):
749 """Report information extraction."""
750 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
752 def _real_extract(self, url):
753 # Extract id and simplified title from URL
754 mobj = re.match(self._VALID_URL, url)
756 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Strip the slug ("_title") and any query string from the captured id.
759 video_id = mobj.group(1).split('_')[0].split('?')[0]
761 video_extension = 'mp4'
763 # Retrieve video webpage to extract further information
764 request = compat_urllib_request.Request(url)
# Disable the family filter so restricted videos are served.
765 request.add_header('Cookie', 'family_filter=off')
766 webpage = self._download_webpage(request, video_id)
768 # Extract URL, uploader and title from webpage
769 self.report_extraction(video_id)
770 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
772 self._downloader.trouble(u'ERROR: unable to extract media URL')
774 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Probe the quality keys from best to worst; the first one present
# in flashvars is used.
776 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
779 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
782 self._downloader.trouble(u'ERROR: unable to extract video URL')
785 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
787 self._downloader.trouble(u'ERROR: unable to extract video URL')
790 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
792 # TODO: support choosing qualities
794 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
796 self._downloader.trouble(u'ERROR: unable to extract title')
798 video_title = unescapeHTML(mobj.group('title'))
800 video_uploader = None
801 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
803 # looking for the official user
804 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
805 if mobj_official is None:
806 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
808 video_uploader = mobj_official.group(1)
810 video_uploader = mobj.group(1)
# Upload date is shown DD-MM-YYYY on the page; reorder to YYYYMMDD.
812 video_upload_date = None
813 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
815 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
820 'uploader': video_uploader,
821 'upload_date': video_upload_date,
822 'title': video_title,
823 'ext': video_extension,
827 class PhotobucketIE(InfoExtractor):
828 """Information extractor for photobucket.com."""
# Only FLV media reachable via a "?current=...flv" query are matched.
830 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
831 IE_NAME = u'photobucket'
833 def __init__(self, downloader=None):
834 InfoExtractor.__init__(self, downloader)
836 def report_download_webpage(self, video_id):
837 """Report webpage download."""
838 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
840 def report_extraction(self, video_id):
841 """Report information extraction."""
842 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
844 def _real_extract(self, url):
845 # Extract id from URL
846 mobj = re.match(self._VALID_URL, url)
848 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# The id is the full .flv filename captured from the "current=" param.
851 video_id = mobj.group(1)
853 video_extension = 'flv'
855 # Retrieve video webpage to extract further information
856 request = compat_urllib_request.Request(url)
858 self.report_download_webpage(video_id)
859 webpage = compat_urllib_request.urlopen(request).read()
860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
861 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
864 # Extract URL, uploader, and title from webpage
865 self.report_extraction(video_id)
866 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
868 self._downloader.trouble(u'ERROR: unable to extract media URL')
870 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# Title and uploader both come from the <title> tag in one pass.
874 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
876 self._downloader.trouble(u'ERROR: unable to extract title')
878 video_title = mobj.group(1).decode('utf-8')
880 video_uploader = mobj.group(2).decode('utf-8')
883 'id': video_id.decode('utf-8'),
884 'url': video_url.decode('utf-8'),
885 'uploader': video_uploader,
887 'title': video_title,
888 'ext': video_extension.decode('utf-8'),
892 class YahooIE(InfoExtractor):
893 """Information extractor for video.yahoo.com."""
896 # _VALID_URL matches all Yahoo! Video URLs
897 # _VPAGE_URL matches only the extractable '/watch/' URLs
898 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
899 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
900 IE_NAME = u'video.yahoo'
902 def __init__(self, downloader=None):
903 InfoExtractor.__init__(self, downloader)
905 def report_download_webpage(self, video_id):
906 """Report webpage download."""
907 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
909 def report_extraction(self, video_id):
910 """Report information extraction."""
911 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
913 def _real_extract(self, url, new_video=True):
914 # Extract ID from URL
915 mobj = re.match(self._VALID_URL, url)
917 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
920 video_id = mobj.group(2)
921 video_extension = 'flv'
923 # Rewrite valid but non-extractable URLs as
924 # extractable English language /watch/ URLs
925 if re.match(self._VPAGE_URL, url) is None:
926 request = compat_urllib_request.Request(url)
928 webpage = compat_urllib_request.urlopen(request).read()
929 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
930 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
933 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
935 self._downloader.trouble(u'ERROR: Unable to extract id field')
937 yahoo_id = mobj.group(1)
939 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
941 self._downloader.trouble(u'ERROR: Unable to extract vid field')
943 yahoo_vid = mobj.group(1)
945 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
946 return self._real_extract(url, new_video=False)
948 # Retrieve video webpage to extract further information
949 request = compat_urllib_request.Request(url)
951 self.report_download_webpage(video_id)
952 webpage = compat_urllib_request.urlopen(request).read()
953 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
954 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
957 # Extract uploader and title from webpage
958 self.report_extraction(video_id)
959 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
961 self._downloader.trouble(u'ERROR: unable to extract video title')
963 video_title = mobj.group(1).decode('utf-8')
965 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
967 self._downloader.trouble(u'ERROR: unable to extract video uploader')
969 video_uploader = mobj.group(1).decode('utf-8')
971 # Extract video thumbnail
972 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
974 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
976 video_thumbnail = mobj.group(1).decode('utf-8')
978 # Extract video description
979 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
981 self._downloader.trouble(u'ERROR: unable to extract video description')
983 video_description = mobj.group(1).decode('utf-8')
984 if not video_description:
985 video_description = 'No description available.'
987 # Extract video height and width
988 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
990 self._downloader.trouble(u'ERROR: unable to extract video height')
992 yv_video_height = mobj.group(1)
994 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
996 self._downloader.trouble(u'ERROR: unable to extract video width')
998 yv_video_width = mobj.group(1)
1000 # Retrieve video playlist to extract media URL
1001 # I'm not completely sure what all these options are, but we
1002 # seem to need most of them, otherwise the server sends a 401.
1003 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1004 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1005 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1006 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1007 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1009 self.report_download_webpage(video_id)
1010 webpage = compat_urllib_request.urlopen(request).read()
1011 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1012 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1015 # Extract media URL from playlist XML
1016 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1018 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1020 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1021 video_url = unescapeHTML(video_url)
1024 'id': video_id.decode('utf-8'),
1026 'uploader': video_uploader,
1027 'upload_date': None,
1028 'title': video_title,
1029 'ext': video_extension.decode('utf-8'),
1030 'thumbnail': video_thumbnail.decode('utf-8'),
1031 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the page-embedded config JSON to find title, uploader,
    thumbnail and the signed play_redirect URL.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize player/HLS redirect links to the canonical page URL
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the page body, or None on error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and match regex; return {key: group} per matchTuples.

        matchTuples is a list of (group_index, key, error_message) triples;
        a missing group aborts with the given error message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Language is the 4th path component of the live URL
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or stored,
        # so live streams are effectively not downloadable here — confirm
        # against upstream history before "fixing".
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Language is the 3rd path component of the /videos/ URL
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our own opener so both helper handlers take part
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems caps the loop at the number of results available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL above is a verbose regex, so the base-class match
        # (which does not pass re.VERBOSE) must be overridden.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
                return
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        url_results = [self.url_result(url) for url in videos]
        return [self.playlist_result(url_results)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # The "Next »" marker disappears on the last page
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_results)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): _PAGE_SIZE is referenced below; its value (12) is inferred
    # from the "limited to 12 videos" comment — confirm against upstream.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the 'Free download' button on the (English-locale)
    file page, then scrapes the real download URL and title out of the
    resulting HTML.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode once here: the original kept bytes and later called
            # str.decode() on regex groups, which breaks on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the restriction message onto one line.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (via --username/--password or .netrc) in
    _real_initialize, then parses the video URL and metadata out of inline
    JavaScript on the video page.

    NOTE(review): this excerpt elides several original lines (early
    returns, the login_form construction, try: headers); dangling
    fragments below are reproduced as found.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint; the `next` parameter redirects back to m.facebook.com.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in ~/.netrc for stored credentials.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        # Login is best-effort: failures only emit warnings, never abort.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the enclosing try: for the except below is elided.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # No credentials available: skip login entirely.
        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): the try: for the except below is elided.
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf parameter/variable assignments bracket a JSON array with
        # the video data; extract the text between these two literals.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON holding the stream URLs.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, falling back to SD (fallback branch elided).
        video_url = params['hd_src']
        video_url = params['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # NOTE(review): the surrounding info-dict literal is elided.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': params['thumbnail_src'],
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles /play/ redirect URLs, direct-download responses (Content-Type
    video/*), and the normal JSON metadata API (skin=json).

    NOTE(review): this excerpt elides several original lines (guards,
    try: headers, parts of dict literals); dangling fragments are
    reproduced as found.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment contains the real
        # file id; resolve it and recurse with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Ask the API for JSON metadata (cchar selection elided: '&' vs '?').
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data to iTunes user agents.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # NOTE(review): the try: for the except below is elided.
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            # assumes py2 byte strings here (str.decode) — py3 would fail
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # NOTE(review): rest of the direct-download info dict is elided.
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # NOTE(review): the try: for the except below is elided.
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # The payload is either the post itself or wrapped under 'Post'.
            if 'Post' in json_data:
                data = json_data['Post']
            # Timestamps come as e.g. '12-31-12 11:45PM'; normalise to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # NOTE(review): enclosing dict literal partially elided.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            # Must match the UA used for the metadata request above.
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Scrapes the watch page for the thumbnail URL (which doubles as the
    media base URL) and the page title, and derives a direct .flv link.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: the original called self._download.trouble, which
            # raised AttributeError (the attribute is _downloader).
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image_src link carries the media base URL; appending
        # /<id>.flv yields the actual video file.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Resolves shortname/episode/clip URLs, follows redirects to a concrete
    episode, locates the mtvnservices media URI embedded in the page,
    downloads the MRSS show index, then per-part fetches the mediagen
    config and rewrites the RTMP URL into a plain HTTP one.

    NOTE(review): this excerpt elides several original lines (guards,
    try: headers, loop headers, literal bodies); dangling fragments are
    reproduced as found.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # Verbose regex: matched with re.VERBOSE everywhere below.
    # NOTE(review): the closing triple-quote of this pattern is elided.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, lowest to highest; the last entry is preferred.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the bodies of these two dict literals are elided.
    _video_extensions = {

    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header is elided.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortname aliases map onto the show's full-episodes index page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
        else:
            # Empty 'episode' group means the newest episode is wanted.
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        # NOTE(review): the try: for the except below is elided.
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # The index page redirects to the newest episode; re-match on the
        # final URL to recover a concrete episode title.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # NOTE(review): the try: for the except below is elided.
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # Each MRSS <item> is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like mgid:...:<showId>.com:<shortMediaId>
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            # NOTE(review): the try: for the except below is elided.
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            # Collect (bitrate, rtmp_url) pairs from the mediagen config.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # NOTE(review): the `for f, v in turls:` loop header is elided.
            format, rtmp_video_url = f, v

            # The RTMP URL's path maps 1:1 onto a plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # NOTE(review): the enclosing info-dict literal is elided.
            'upload_date': officialDate,
            'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads og:* meta tags from the show page, then downloads the player's
    JavaScript config (advertised via og:video) to find the media URL.

    NOTE(review): this excerpt elides some original lines (guards and
    parts of the final info dict); dangling fragments are reproduced
    as found.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset from the Content-Type header, default utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Pull metadata from the page's <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries a URL-quoted `config=` query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        # (single-quoted strings), so normalise the quotes before parsing.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 1 of the playlist is the actual episode.
        videoUrl = playlist[1]['url']

        # NOTE(review): the enclosing info-dict literal is elided.
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Two-step extraction: the moogaloop metadata XML yields title/
    description/thumbnail and an Adobe HDS (f4m) manifest URL; the
    manifest then yields the fragment URL the downloader fetches.

    NOTE(review): this excerpt elides some original lines (guards,
    try: headers, dict literals); dangling fragments are reproduced
    as found.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): the enclosing `info = {` literal is elided.
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # NOTE(review): the try: for the except below is elided.
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # NOTE(review): the try: for the IndexError handler is elided.
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # hdcore parameter is required for the HDS manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # NOTE(review): the try: for the except below is elided.
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m namespace-qualified lookups for the first media entry.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Build the fragment URL from the manifest host + ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Scrapes the flv URL (URL-quoted in the page), the <title> and the
    thumbnail straight out of the watch-page HTML.

    NOTE(review): this excerpt elides the `if mobj is None:` guards and
    returns around the error calls, and the final info-dict literal;
    dangling fragments are reproduced as found.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (URL-quoted in the flv_url query fragment).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title (strip the trailing " - XVIDEOS..." suffix).
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(0)

        # NOTE(review): the enclosing info-dict literal is elided.
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid

    NOTE(review): this excerpt elides some original lines (guards,
    try: headers, the final info-dict wrapper); dangling fragments
    are reproduced as found.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's JSON metadata via the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        # NOTE(review): the try: for the except below is elided.
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint maps the numeric track id to stream URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        # NOTE(review): the try: for the except below is elided.
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # NOTE(review): the enclosing info-dict literal is elided.
        # NOTE(review): 'created_at' is an ISO-ish API timestamp, not the
        # YYYYMMDD format the class docs prescribe — verify downstream.
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The real media id is base64-encoded in the page's `jsclassref`
    JavaScript variable; the final URL is an rtmpe stream.

    NOTE(review): this excerpt elides the `if mobj is None:` guards and
    returns around the error calls, and the final info-dict wrapper;
    dangling fragments are reproduced as found.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds a base64+URL-quoted media id.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # NOTE(review): the enclosing info-dict literal is elided.
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Fetches the cloudcast JSON, picks a format/bitrate from its
    'audio_formats' section, and probes candidate URLs for the first
    live one.

    NOTE(review): marked _WORKING = False; it also calls .decode() on
    native strings, which only works on Python 2. Some original lines
    (guards, try: headers, loop/else structure) are elided in this
    excerpt; dangling fragments are reproduced as found.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): the try: for the TypeError handler is elided.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # NOTE(review): the try:/return-url lines are elided here.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        # (Python 2 only: str.decode does not exist on py3 str.)
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Probe formats until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the enclosing `return [{` wrapper is elided; the
        # .decode() calls below are likewise Python 2 only.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
# NOTE(review): numbered listing with elided lines (e.g. 2977, 2983,
# 2992 `try:`, the `info = {` openers and `return` statements) — verify
# against the full file before changing anything below.
2961 class StanfordOpenClassroomIE(InfoExtractor):
2962 """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes, dispatching on the regexp groups:
    a specific video (course+video), a course page (course only),
    and the site root (neither).  Course/root pages recurse via
    self.extract() over the links they contain.
    """
2964 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2965 IE_NAME = u'stanfordoc'
2967 def report_download_webpage(self, objid):
2968 """Report information extraction."""
2969 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2971 def report_extraction(self, video_id):
2972 """Report information extraction."""
2973 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2975 def _real_extract(self, url):
2976 mobj = re.match(self._VALID_URL, url)
2978 raise ExtractorError(u'Invalid URL: %s' % url)
# --- case 1: a single video page ---
2980 if mobj.group('course') and mobj.group('video'): # A specific video
2981 course = mobj.group('course')
2982 video = mobj.group('video')
2984 'id': course + '_' + video,
2986 'upload_date': None,
2989 self.report_extraction(info['id'])
2990 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2991 xmlUrl = baseUrl + video + '.xml'
2993 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2994 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2995 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# the per-video .xml sidecar carries the title and the video file name
2997 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2999 info['title'] = mdoc.findall('./title')[0].text
3000 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3002 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3004 info['ext'] = info['url'].rpartition('.')[2]
# --- case 2: a course page: collect its VideoPage links and recurse ---
3006 elif mobj.group('course'): # A course page
3007 course = mobj.group('course')
3012 'upload_date': None,
3015 coursepage = self._download_webpage(url, info['id'],
3016 note='Downloading course info page',
3017 errnote='Unable to download course info page')
3019 m = re.search('<h1>([^<]+)</h1>', coursepage)
3021 info['title'] = unescapeHTML(m.group(1))
3023 info['title'] = info['id']
3025 m = re.search('<description>([^<]+)</description>', coursepage)
3027 info['description'] = unescapeHTML(m.group(1))
3029 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3032 'type': 'reference',
3033 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3037 for entry in info['list']:
3038 assert entry['type'] == 'reference'
3039 results += self.extract(entry['url'])
# --- case 3: the site root: collect CoursePage links and recurse ---
3043 'id': 'Stanford OpenClassroom',
3046 'upload_date': None,
3049 self.report_download_webpage(info['id'])
3050 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3052 rootpage = compat_urllib_request.urlopen(rootURL).read()
3053 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3054 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3057 info['title'] = info['id']
3059 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3062 'type': 'reference',
3063 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3068 for entry in info['list']:
3069 assert entry['type'] == 'reference'
3070 results += self.extract(entry['url'])
# NOTE(review): numbered listing with elided lines (guards such as
# `if mobj is None:`, `try:` openers, `return` and the info-dict opener
# are missing from this view) — confirm against the full file.
3073 class MTVIE(InfoExtractor):
3074 """Information extractor for MTV.com.

    Scrapes mtv_vt/mtv_an/mtvn_uri meta tags and the default playlist id
    from the video page, then queries the mediaGen XML service for the
    rendition list.
    """
3076 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3079 def report_extraction(self, video_id):
3080 """Report information extraction."""
3081 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3083 def _real_extract(self, url):
3084 mobj = re.match(self._VALID_URL, url)
3086 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# the URL regexp makes the scheme optional; normalize to http://
3088 if not mobj.group('proto'):
3089 url = 'http://' + url
3090 video_id = mobj.group('videoid')
3092 webpage = self._download_webpage(url, video_id)
# song name, performer and internal URI come from <meta> tags
3094 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3096 self._downloader.trouble(u'ERROR: unable to extract song name')
3098 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3099 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3101 self._downloader.trouble(u'ERROR: unable to extract performer')
3103 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3104 video_title = performer + ' - ' + song_name
3106 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3108 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3110 mtvn_uri = mobj.group(1)
3112 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3114 self._downloader.trouble(u'ERROR: unable to extract content id')
3116 content_id = mobj.group(1)
# mediaGen service returns the rendition XML for this uri/content/video
3118 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3119 self.report_extraction(video_id)
3120 request = compat_urllib_request.Request(videogen_url)
3122 metadataXml = compat_urllib_request.urlopen(request).read()
3123 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3124 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3127 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3128 renditions = mdoc.findall('.//rendition')
3130 # For now, always pick the highest quality.
3131 rendition = renditions[-1]
# format string is e.g. "mp4-640x480_1000" (ext-WxH_bitrate)
3134 _,_,ext = rendition.attrib['type'].partition('/')
3135 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3136 video_url = rendition.find('./src').text
3138 self._downloader.trouble('Invalid rendition field.')
3144 'uploader': performer,
3145 'upload_date': None,
3146 'title': video_title,
# NOTE(review): numbered listing with elided lines (method headers such
# as `_gen_sid`, loop bodies, `try:` openers and returns are partially
# missing from this view) — confirm against the full file.
3154 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com.  Videos are split into
# segments; the real file ids are derived from a seed via a small
# shuffle algorithm (_get_file_ID_mix_string / _get_file_id).
3155 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3157 def report_download_webpage(self, file_id):
3158 """Report webpage download."""
3159 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3161 def report_extraction(self, file_id):
3162 """Report information extraction."""
3163 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# session id: millisecond timestamp + two random components
# (the `def _gen_sid(self):` line is among the elided lines)
3166 nowTime = int(time.time() * 1000)
3167 random1 = random.randint(1000,1998)
3168 random2 = random.randint(1000,9999)
3170 return "%d%d%d" %(nowTime,random1,random2)
3172 def _get_file_ID_mix_string(self, seed):
# deterministic shuffle of the source alphabet driven by the seed
# (a linear-congruential style update mod 65536)
3174 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3176 for i in range(len(source)):
3177 seed = (seed * 211 + 30031 ) % 65536
3178 index = math.floor(seed / 65536 * len(source) )
3179 mixed.append(source[int(index)])
3180 source.remove(source[int(index)])
3181 #return ''.join(mixed)
3184 def _get_file_id(self, fileId, seed):
# map the '*'-separated digit ids through the shuffled alphabet
3185 mixed = self._get_file_ID_mix_string(seed)
3186 ids = fileId.split('*')
3190 realId.append(mixed[int(ch)])
3191 return ''.join(realId)
3193 def _real_extract(self, url):
3194 mobj = re.match(self._VALID_URL, url)
3196 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3198 video_id = mobj.group('ID')
3200 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3202 request = compat_urllib_request.Request(info_url, None, std_headers)
3204 self.report_download_webpage(video_id)
3205 jsondata = compat_urllib_request.urlopen(request).read()
3206 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3207 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3210 self.report_extraction(video_id)
3212 jsonstr = jsondata.decode('utf-8')
3213 config = json.loads(jsonstr)
3215 video_title = config['data'][0]['title']
3216 seed = config['data'][0]['seed']
# format selection: 'best' prefers hd2 when available; 'worst' picks
# the lowest (branch bodies partially elided in this listing)
3218 format = self._downloader.params.get('format', None)
3219 supported_format = list(config['data'][0]['streamfileids'].keys())
3221 if format is None or format == 'best':
3222 if 'hd2' in supported_format:
3227 elif format == 'worst':
3235 fileid = config['data'][0]['streamfileids'][format]
3236 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3237 except (UnicodeDecodeError, ValueError, KeyError):
3238 self._downloader.trouble(u'ERROR: unable to extract info section')
3242 sid = self._gen_sid()
3243 fileid = self._get_file_id(fileid, seed)
3245 #column 8,9 of fileid represent the segment number
3246 #fileid[7:9] should be changed
# one info dict per segment; segment number is spliced into the fileid
# as a two-digit hex value
3247 for index, key in enumerate(keys):
3249 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3250 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3253 'id': '%s_part%02d' % (video_id, index),
3254 'url': download_url,
3256 'upload_date': None,
3257 'title': video_title,
3260 files_info.append(info)
# NOTE(review): numbered listing with elided lines (`if mobj is None:`,
# `try:`, `if result is None:` guards and the info-dict opener/return
# are missing from this view) — confirm against the full file.
3265 class XNXXIE(InfoExtractor):
3266 """Information extractor for xnxx.com.

    Pulls flv_url, title and thumbnail straight out of the video page
    with the three regexps below.
    """
3268 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# page-scraping patterns: stream URL, <title>, and big-thumb URL
3270 VIDEO_URL_RE = r'flv_url=(.*?)&'
3271 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3272 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3274 def report_webpage(self, video_id):
3275 """Report information extraction"""
3276 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3278 def report_extraction(self, video_id):
3279 """Report information extraction"""
3280 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3282 def _real_extract(self, url):
3283 mobj = re.match(self._VALID_URL, url)
3285 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3287 video_id = mobj.group(1)
3289 self.report_webpage(video_id)
3291 # Get webpage content
3293 webpage_bytes = compat_urllib_request.urlopen(url).read()
3294 webpage = webpage_bytes.decode('utf-8')
3295 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3296 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# the flv URL is percent-encoded in the page, hence unquote()
3299 result = re.search(self.VIDEO_URL_RE, webpage)
3301 self._downloader.trouble(u'ERROR: unable to extract video url')
3303 video_url = compat_urllib_parse.unquote(result.group(1))
3305 result = re.search(self.VIDEO_TITLE_RE, webpage)
3307 self._downloader.trouble(u'ERROR: unable to extract video title')
3309 video_title = result.group(1)
3311 result = re.search(self.VIDEO_THUMB_RE, webpage)
3313 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3315 video_thumbnail = result.group(1)
3321 'upload_date': None,
3322 'title': video_title,
3324 'thumbnail': video_thumbnail,
3325 'description': None,
# NOTE(review): numbered listing with elided lines (`if mobj is None:`
# guards, `try:` openers and the final info-dict opener/return are
# missing from this view) — confirm against the full file.
3329 class GooglePlusIE(InfoExtractor):
3330 """Information extractor for plus.google.com.

    Two-step extraction: (1) scrape the post page for date, uploader,
    title and the photo/video page URL; (2) fetch that page and pick
    the highest-resolution googlevideo redirector link.
    """
3332 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3333 IE_NAME = u'plus.google'
3335 def __init__(self, downloader=None):
3336 InfoExtractor.__init__(self, downloader)
3338 def report_extract_entry(self, url):
3339 """Report downloading extry"""
3340 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3342 def report_date(self, upload_date):
3343 """Report downloading extry"""
3344 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3346 def report_uploader(self, uploader):
3347 """Report downloading extry"""
3348 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3350 def report_title(self, video_title):
3351 """Report downloading extry"""
3352 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3354 def report_extract_vid_page(self, video_page):
3355 """Report information extraction."""
3356 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3358 def _real_extract(self, url):
3359 # Extract id from URL
3360 mobj = re.match(self._VALID_URL, url)
3362 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3365 post_url = mobj.group(0)
3366 video_id = mobj.group(1)
3368 video_extension = 'flv'
3370 # Step 1, Retrieve post webpage to extract further information
3371 self.report_extract_entry(post_url)
3372 request = compat_urllib_request.Request(post_url)
3374 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3375 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3376 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3379 # Extract update date
3381 pattern = 'title="Timestamp">(.*?)</a>'
3382 mobj = re.search(pattern, webpage)
3384 upload_date = mobj.group(1)
3385 # Convert timestring to a format suitable for filename
3386 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3387 upload_date = upload_date.strftime('%Y%m%d')
3388 self.report_date(upload_date)
# extract uploader from the rel="author" anchor
3392 pattern = r'rel\="author".*?>(.*?)</a>'
3393 mobj = re.search(pattern, webpage)
3395 uploader = mobj.group(1)
3396 self.report_uploader(uploader)
3399 # Get the first line for title
3401 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3402 mobj = re.search(pattern, webpage)
3404 video_title = mobj.group(1)
3405 self.report_title(video_title)
3407 # Step 2, Stimulate clicking the image box to launch video
3408 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3409 mobj = re.search(pattern, webpage)
3411 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3413 video_page = mobj.group(1)
3414 request = compat_urllib_request.Request(video_page)
3416 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3417 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3418 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3420 self.report_extract_vid_page(video_page)
3423 # Extract video links on video page
3424 """Extract video links of all sizes"""
3425 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3426 mobj = re.findall(pattern, webpage)
3428 self._downloader.trouble(u'ERROR: unable to extract video links')
3430 # Sort in resolution
3431 links = sorted(mobj)
3433 # Choose the lowest of the sort, i.e. highest resolution
3434 video_url = links[-1]
3435 # Only get the url. The resolution part in the tuple has no use anymore
3436 video_url = video_url[-1]
3437 # Treat escaped \u0026 style hex
# py2 strings have .decode; py3 raises AttributeError, handled below
3439 video_url = video_url.decode("unicode_escape")
3440 except AttributeError: # Python 3
3441 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3447 'uploader': uploader,
3448 'upload_date': upload_date,
3449 'title': video_title,
3450 'ext': video_extension,
# NOTE(review): numbered listing with elided lines (the `if mobj is
# None:` guard, `_findProp`'s else-branch and the info-dict opener/
# return are missing from this view) — confirm against the full file.
3453 class NBAIE(InfoExtractor):
# Information extractor for nba.com: derives the CDN mp4 URL directly
# from the page path and scrapes title/date/description via _findProp.
3454 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3457 def _real_extract(self, url):
3458 mobj = re.match(self._VALID_URL, url)
3460 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3463 video_id = mobj.group(1)
3464 if video_id.endswith('/index.html'):
3465 video_id = video_id[:-len('/index.html')]
3467 webpage = self._download_webpage(url, video_id)
# the stream URL is fully determined by the page path
3469 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# helper: first regexp group from the page, unescaped, else `default`
3470 def _findProp(rexp, default=None):
3471 m = re.search(rexp, webpage)
3473 return unescapeHTML(m.group(1))
3477 shortened_video_id = video_id.rpartition('/')[2]
3478 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3480 'id': shortened_video_id,
# NOTE(review): key is 'uploader_date' here, not 'upload_date' — looks
# like a typo in the original; verify before relying on it
3484 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3485 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): numbered listing with elided lines (`try:` openers,
# `if mobj is None:`, the paged/offset loop header and returns are
# missing from this view) — confirm against the full file.
3489 class JustinTVIE(InfoExtractor):
3490 """Information extractor for justin.tv and twitch.tv"""
3491 # TODO: One broadcast may be split into multiple videos. The key
3492 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3493 # starts at 1 and increases. Can we treat all parts as one video?
3495 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3496 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3497 _JUSTIN_PAGE_LIMIT = 100
3498 IE_NAME = u'justin.tv'
3500 def report_extraction(self, file_id):
3501 """Report information extraction."""
3502 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3504 def report_download_page(self, channel, offset):
3505 """Report attempt to download a single page of videos."""
3506 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3507 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3509 # Return count of items, list of *valid* items
3510 def _parse_page(self, url):
3512 urlh = compat_urllib_request.urlopen(url)
3513 webpage_bytes = urlh.read()
3514 webpage = webpage_bytes.decode('utf-8', 'ignore')
3515 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3516 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# the API returns a JSON list of clips; anything else is an error object
3519 response = json.loads(webpage)
3520 if type(response) != list:
3521 error_text = response.get('error', 'unknown error')
3522 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3525 for clip in response:
3526 video_url = clip['video_file_url']
3528 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD...' -> 'YYYYMMDD'
3529 video_date = re.sub('-', '', clip['start_time'][:10])
3530 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3531 video_id = clip['id']
3532 video_title = clip.get('title', video_id)
3536 'title': video_title,
3537 'uploader': clip.get('channel_name', video_uploader_id),
3538 'uploader_id': video_uploader_id,
3539 'upload_date': video_date,
3540 'ext': video_extension,
3542 return (len(response), info)
3544 def _real_extract(self, url):
3545 mobj = re.match(self._VALID_URL, url)
3547 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# group 1 = channel name, group 2 (optional) = single broadcast id;
# lastindex tells which URL shape matched
3550 api = 'http://api.justin.tv'
3551 video_id = mobj.group(mobj.lastindex)
3553 if mobj.lastindex == 1:
3555 api += '/channel/archives/%s.json'
3557 api += '/broadcast/by_archive/%s.json'
3558 api = api % (video_id,)
3560 self.report_extraction(video_id)
# page through the archive API until a short page signals the end
3564 limit = self._JUSTIN_PAGE_LIMIT
3567 self.report_download_page(video_id, offset)
3568 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3569 page_count, page_info = self._parse_page(page_url)
3570 info.extend(page_info)
3571 if not paged or page_count != limit:
# NOTE(review): numbered listing with elided lines (`if m is None:`
# guards, the description fallback and the info-dict opener/return are
# missing from this view) — confirm against the full file.
3576 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com: scrapes the <video>/<source>
# tag for the stream URL and the player page for title/description.
3577 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3579 def _real_extract(self, url):
3580 mobj = re.match(self._VALID_URL, url)
3582 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3585 video_id = mobj.group('id')
3586 webpage = self._download_webpage(url, video_id)
3588 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3590 self._downloader.trouble(u'ERROR: unable to find video information')
3591 video_url = unescapeHTML(m.group('url'))
3593 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3595 self._downloader.trouble(u'Cannot find video title')
3596 title = unescapeHTML(m.group('title'))
3598 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3600 desc = unescapeHTML(m.group('desc'))
3609 'description': desc,
# NOTE(review): numbered listing with elided lines (parts of the
# verbose URL regexp, the videos list/append and return are missing
# from this view) — confirm against the full file.
3613 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com: collects every
# movie on a game's video page (URL, title, thumbnail in lockstep).
3614 _VALID_URL = r"""http://store.steampowered.com/
3615 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3617 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is a verbose (re.X) regexp
3621 def suitable(cls, url):
3622 """Receives a URL and returns True if suitable for this IE."""
3623 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3625 def _real_extract(self, url):
3626 m = re.match(self._VALID_URL, url, re.VERBOSE)
3627 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3628 gameID = m.group('gameID')
3629 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3630 webpage = self._download_webpage(videourl, gameID)
# three parallel scans over the page, zipped together per video
3631 mweb = re.finditer(urlRE, webpage)
3632 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3633 titles = re.finditer(namesRE, webpage)
3634 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3635 thumbs = re.finditer(thumbsRE, webpage)
3637 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3638 video_id = vid.group('videoID')
3639 title = vtitle.group('videoName')
3640 video_url = vid.group('videoURL')
3641 video_thumb = thumb.group('thumbnail')
3643 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3648 'title': unescapeHTML(title),
3649 'thumbnail': video_thumb
# NOTE(review): numbered listing with elided lines (the info-dict
# opener with id/url/ext/title keys and the return are missing from
# this view) — confirm against the full file.
3654 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos: the stream URL
# is derived from the video id; title/uploader come from page scraping.
3655 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3656 IE_NAME = u'ustream'
3658 def _real_extract(self, url):
3659 m = re.match(self._VALID_URL, url)
3660 video_id = m.group('videoID')
# CDN URL is fully determined by the numeric id
3661 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3662 webpage = self._download_webpage(url, video_id)
3663 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3664 title = m.group('title')
3665 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3666 uploader = m.group('uploader')
3672 'uploader': uploader
# NOTE(review): numbered listing with elided lines (the `try:` opener
# for json.loads and the info-dict opener/return are missing from this
# view) — confirm against the full file.
3676 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows: metadata is embedded
# as JSON in an inline <script> (`gon.show=...`).
3677 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3679 def _real_extract(self, url):
3680 m = re.match(self._VALID_URL, url)
3681 video_id = m.group('videoID')
3683 webpage = self._download_webpage(url, video_id)
3684 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3686 raise ExtractorError(u'Cannot find metadata')
3687 json_data = m.group(1)
3690 data = json.loads(json_data)
3691 except ValueError as e:
3692 raise ExtractorError(u'Invalid JSON: ' + str(e))
# the Akamai URL takes the bitrate as a query parameter; pin 256 kbps
3694 video_url = data['akamai_url'] + '&cbr=256'
3695 url_parts = compat_urllib_parse_urlparse(video_url)
3696 video_ext = url_parts.path.rpartition('.')[2]
3701 'title': data['title'],
3702 'description': data.get('teaser_text'),
3703 'location': data.get('country_of_origin'),
3704 'uploader': data.get('host', {}).get('name'),
3705 'uploader_id': data.get('host', {}).get('slug'),
3706 'thumbnail': data.get('image', {}).get('large_url_2x'),
3707 'duration': data.get('duration'),
# NOTE(review): numbered listing with elided lines (`if result is
# None:` guards, the formats list construction, size/bitrate unpacking
# and several returns are missing from this view) — confirm against
# the full file.
3712 class YouPornIE(InfoExtractor):
3713 """Information extractor for youporn.com."""
3714 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3716 def _print_formats(self, formats):
3717 """Print all available formats"""
3718 print(u'Available formats:')
3719 print(u'ext\t\tformat')
3720 print(u'---------------------------------')
3721 for format in formats:
3722 print(u'%s\t\t%s' % (format['ext'], format['format']))
# helper: pick the single entry whose 'format' equals req_format
# (def line and return are among the elided lines)
3724 def _specific(self, req_format, formats):
3726 if(x["format"]==req_format):
3730 def _real_extract(self, url):
3731 mobj = re.match(self._VALID_URL, url)
3733 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3736 video_id = mobj.group('videoid')
# age gate is bypassed with a cookie before fetching the page
3738 req = compat_urllib_request.Request(url)
3739 req.add_header('Cookie', 'age_verified=1')
3740 webpage = self._download_webpage(req, video_id)
3742 # Get the video title
3743 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3745 raise ExtractorError(u'Unable to extract video title')
3746 video_title = result.group('title').strip()
3748 # Get the video date
3749 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3751 self._downloader.report_warning(u'unable to extract video date')
3754 upload_date = result.group('date').strip()
3756 # Get the video uploader
3757 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3759 self._downloader.report_warning(u'unable to extract uploader')
3760 video_uploader = None
3762 video_uploader = result.group('uploader').strip()
3763 video_uploader = clean_html( video_uploader )
3765 # Get all of the formats available
3766 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3767 result = re.search(DOWNLOAD_LIST_RE, webpage)
3769 raise ExtractorError(u'Unable to extract download list')
3770 download_list_html = result.group('download_list').strip()
3772 # Get all of the links from the page
3773 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3774 links = re.findall(LINK_RE, download_list_html)
3775 if(len(links) == 0):
3776 raise ExtractorError(u'ERROR: no known formats available for video')
3778 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
# per-link: derive extension from the path and a "size_bitrate" format
# string from the 5th path component
3783 # A link looks like this:
3784 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3785 # A path looks like this:
3786 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3787 video_url = unescapeHTML( link )
3788 path = compat_urllib_parse_urlparse( video_url ).path
3789 extension = os.path.splitext( path )[1][1:]
3790 format = path.split('/')[4].split('_')[:2]
3793 format = "-".join( format )
3794 title = u'%s-%s-%s' % (video_title, size, bitrate)
3799 'uploader': video_uploader,
3800 'upload_date': upload_date,
3805 'description': None,
3809 if self._downloader.params.get('listformats', None):
3810 self._print_formats(formats)
3813 req_format = self._downloader.params.get('format', None)
3814 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# format selection: list is assumed ordered best-first
3816 if req_format is None or req_format == 'best':
3818 elif req_format == 'worst':
3819 return [formats[-1]]
3820 elif req_format in ('-1', 'all'):
3823 format = self._specific( req_format, formats )
3825 self._downloader.trouble(u'ERROR: requested format not available')
# NOTE(review): numbered listing with elided lines (`if mobj is None:`
# / `if result is None:` guards, some info-dict keys and the return
# are missing from this view) — confirm against the full file.
3831 class PornotubeIE(InfoExtractor):
3832 """Information extractor for pornotube.com."""
3833 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3835 def _real_extract(self, url):
3836 mobj = re.match(self._VALID_URL, url)
3838 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# id and title both come from the URL itself
3841 video_id = mobj.group('videoid')
3842 video_title = mobj.group('title')
3844 # Get webpage content
3845 webpage = self._download_webpage(url, video_id)
3848 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3849 result = re.search(VIDEO_URL_RE, webpage)
3851 self._downloader.trouble(u'ERROR: unable to extract video url')
3853 video_url = compat_urllib_parse.unquote(result.group('url'))
3855 #Get the uploaded date
3856 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3857 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error text says "title" but this branch handles the
# upload date — looks like a copy/paste slip in the original message
3859 self._downloader.trouble(u'ERROR: unable to extract video title')
3861 upload_date = result.group('date')
3863 info = {'id': video_id,
3866 'upload_date': upload_date,
3867 'title': video_title,
# NOTE(review): numbered listing with elided lines (`if result is
# None:` guards, some info-dict keys and the return are missing from
# this view) — confirm against the full file.
3873 class YouJizzIE(InfoExtractor):
3874 """Information extractor for youjizz.com."""
3875 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3877 def _real_extract(self, url):
3878 mobj = re.match(self._VALID_URL, url)
3880 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3883 video_id = mobj.group('videoid')
3885 # Get webpage content
3886 webpage = self._download_webpage(url, video_id)
3888 # Get the video title
3889 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3891 raise ExtractorError(u'ERROR: unable to extract video title')
3892 video_title = result.group('title').strip()
3894 # Get the embed page
# the watch page only references the embed page; the flash variable on
# the embed page carries the real stream URL
3895 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3897 raise ExtractorError(u'ERROR: unable to extract embed page')
3899 embed_page_url = result.group(0).strip()
3900 video_id = result.group('videoid')
3902 webpage = self._download_webpage(embed_page_url, video_id)
3905 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3907 raise ExtractorError(u'ERROR: unable to extract video url')
3908 video_url = result.group('source')
3910 info = {'id': video_id,
3912 'title': video_title,
3915 'player_url': embed_page_url}
# NOTE(review): numbered listing with elided lines (mix_id assignment,
# the res list/append, `break` and return are missing from this view)
# — confirm against the full file.
3919 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes: reads the embedded
# PAGE.mix JSON, then walks the play/next API one track at a time.
3921 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3923 def _real_extract(self, url):
3924 mobj = re.match(self._VALID_URL, url)
3926 raise ExtractorError(u'Invalid URL: %s' % url)
3927 playlist_id = mobj.group('id')
3929 webpage = self._download_webpage(url, playlist_id)
3931 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3933 raise ExtractorError(u'Cannot find trax information')
3934 json_like = m.group(1)
3935 data = json.loads(json_like)
# the play API requires a client-chosen numeric session id
3937 session = str(random.randint(0, 1000000000))
3939 track_count = data['tracks_count']
3940 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3941 next_url = first_url
# one API round-trip per track; 'at_last_track' ends the walk
3943 for i in itertools.count():
3944 api_json = self._download_webpage(next_url, playlist_id,
3945 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3946 errnote=u'Failed to download song information')
3947 api_data = json.loads(api_json)
3948 track_data = api_data[u'set']['track']
3950 'id': track_data['id'],
3951 'url': track_data['track_file_stream_url'],
3952 'title': track_data['performer'] + u' - ' + track_data['name'],
3953 'raw_title': track_data['name'],
3954 'uploader_id': data['user']['login'],
3958 if api_data['set']['at_last_track']:
3960 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# NOTE(review): numbered listing with elided lines (IE_NAME, the
# info-dict opener with id/url/ext/title keys and the return are
# missing from this view) — confirm against the full file.
3963 class KeekIE(InfoExtractor):
# Information extractor for keek.com: video and thumbnail URLs are
# derived from the id; title/uploader are scraped from the page.
3964 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3967 def _real_extract(self, url):
3968 m = re.match(self._VALID_URL, url)
3969 video_id = m.group('videoID')
# CDN URLs are fully determined by the video id
3970 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3971 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3972 webpage = self._download_webpage(url, video_id)
3973 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3974 title = unescapeHTML(m.group('title'))
3975 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3976 uploader = unescapeHTML(m.group('uploader'))
3982 'thumbnail': thumbnail,
3983 'uploader': uploader
# NOTE(review): numbered listing with elided lines (parts of the
# verbose regexps, the `info=[]` initializer, returns and the final
# info dict are missing from this view) — confirm against the full
# file.
3987 class TEDIE(InfoExtractor):
# Information extractor for ted.com: handles both single talks and
# playlists (a playlist recurses into _talk_info per talk).
3988 _VALID_URL=r'''http://www.ted.com/
3990 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3992 ((?P<type_talk>talks)) # We have a simple talk
3994 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because _VALID_URL is a verbose (re.X) regexp
3998 def suitable(cls, url):
3999 """Receives a URL and returns True if suitable for this IE."""
4000 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4002 def _real_extract(self, url):
4003 m=re.match(self._VALID_URL, url, re.VERBOSE)
4004 if m.group('type_talk'):
4005 return [self._talk_info(url)]
4007 playlist_id=m.group('playlist_id')
4008 name=m.group('name')
4009 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4010 return self._playlist_videos_info(url,name,playlist_id)
4012 def _talk_video_link(self,mediaSlug):
4013 '''Returns the video link for that mediaSlug'''
4014 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4016 def _playlist_videos_info(self,url,name,playlist_id=0):
4017 '''Returns the videos of the playlist'''
# two parallel scans: talk <li> entries and their title anchors
4019 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4020 ([.\s]*?)data-playlist_item_id="(\d+)"
4021 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4023 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4024 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4025 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4026 m_names=re.finditer(video_name_RE,webpage)
4028 for m_video, m_name in zip(m_videos,m_names):
4029 video_id=m_video.group('video_id')
4030 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4031 info.append(self._talk_info(talk_url,video_id))
4034 def _talk_info(self, url, video_id=0):
4035 """Return the video for the talk in the url"""
4036 m=re.match(self._VALID_URL, url,re.VERBOSE)
4037 videoName=m.group('name')
4038 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4039 # If the url includes the language we get the title translated
4040 title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
4041 title=re.search(title_RE, webpage).group('title')
# talkDetails inline JS carries the numeric id and the mediaSlug that
# _talk_video_link turns into a download URL
4042 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4043 "id":(?P<videoID>[\d]+).*?
4044 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4045 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4046 thumb_match=re.search(thumb_RE,webpage)
4047 info_match=re.search(info_RE,webpage,re.VERBOSE)
4048 video_id=info_match.group('videoID')
4049 mediaSlug=info_match.group('mediaSlug')
4050 video_url=self._talk_video_link(mediaSlug)
4056 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de. The site exposes an XML metadata endpoint keyed
# on the video id; all fields (flv URL, title, format, description, preview
# image) come from that XML document rather than the HTML page.
4060 class MySpassIE(InfoExtractor):
4061 _VALID_URL = r'http://www.myspass.de/.*'
4063 def _real_extract(self, url):
4064 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4066 # video id is the last path element of the URL
4067 # usually there is a trailing slash, so also try the second but last
4068 url_path = compat_urllib_parse_urlparse(url).path
4069 url_parent_path, video_id = os.path.split(url_path)
# NOTE(review): the guard that triggers this fallback (original line 4070,
# presumably "if not video_id:") is elided from this excerpt.
4071 _, video_id = os.path.split(url_parent_path)
4074 metadata_url = META_DATA_URL_TEMPLATE % video_id
4075 metadata_text = self._download_webpage(metadata_url, video_id)
# Re-encode to bytes before XML parsing; fromstring expects encoded input
# here since _download_webpage returns a decoded unicode string.
4076 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4078 # extract values from metadata
4079 url_flv_el = metadata.find('url_flv')
4080 if url_flv_el is None:
# NOTE(review): trouble() reports the error; the 'return' that should follow
# it (original line 4082) is elided here -- without it execution would fall
# through to the .text access below.
4081 self._downloader.trouble(u'ERROR: unable to extract download url')
4083 video_url = url_flv_el.text
# File extension is derived from the flv URL's suffix (leading dot stripped).
4084 extension = os.path.splitext(video_url)[1][1:]
4085 title_el = metadata.find('title')
4086 if title_el is None:
4087 self._downloader.trouble(u'ERROR: unable to extract title')
4089 title = title_el.text
4090 format_id_el = metadata.find('format_id')
4091 if format_id_el is None:
4094 format = format_id_el.text
# Optional fields: description and thumbnail are only set when present.
# NOTE(review): the 'else' branches that would default them (original lines
# ~4098-4099, ~4103-4104) are elided from this excerpt.
4095 description_el = metadata.find('description')
4096 if description_el is not None:
4097 description = description_el.text
4100 imagePreview_el = metadata.find('imagePreview')
4101 if imagePreview_el is not None:
4102 thumbnail = imagePreview_el.text
# NOTE(review): the head of the returned info dict (original lines 4105-4110)
# is elided; only the trailing entries are visible.
4111 'thumbnail': thumbnail,
4112 'description': description
4116 def gen_extractors():
4117 """ Return a list of an instance of every supported extractor.
4118 The order does matter; the first extractor matched is the one handling the URL.
4121 YoutubePlaylistIE(),
4145 StanfordOpenClassroomIE(),