2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready tracks whether _real_initialize() has already run;
    # _downloader is the FileDownloader this IE reports to.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        # note=False suppresses the progress message entirely.
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour an explicit charset in the Content-Type header; otherwise
        # fall back to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex; group 1 is the whole URL prefix (may be absent for a
    # naked ID), group 2 is the video ID itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension.
    # NOTE(review): table entries reconstructed from the era's known itag
    # tables — verify against upstream history.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" dimensions string used by _print_formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
    if YoutubePlaylistIE.suitable(url): return False
    # _VALID_URL is written with re.VERBOSE formatting.
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_video_subtitles_download(self, video_id):
    """Report attempt to check for available video subtitles."""
    self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report attempt to download a subtitle track."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report available subtitles."""
    sub_lang = ",".join(list(sub_lang_list.keys()))
    self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    """Report that a requested format is not available."""
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
def _get_available_subtitles(self, video_id):
    """Return {lang_code: lang_name} of available subtitle tracks.

    On failure returns an (error_message, None) tuple instead — callers
    distinguish the two cases with isinstance(..., tuple).
    """
    self.report_video_subtitles_download(video_id)
    request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
    try:
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None)
    sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    # Map lang_code -> human-readable name.
    sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
    if not sub_lang_list:
        return (u'video doesn\'t have subtitles', None)
    return sub_lang_list
272 def _list_available_subtitles(self, video_id):
273 sub_lang_list = self._get_available_subtitles(video_id)
274 self.report_video_subtitles_available(video_id, sub_lang_list)
def _request_subtitle(self, sub_lang, sub_name, video_id, format):
    """
    Return tuple:
    (error_message, sub_lang, sub)
    """
    self.report_video_subtitles_request(video_id, sub_lang, format)
    params = compat_urllib_parse.urlencode({
        'lang': sub_lang,
        'name': sub_name,
        'v': video_id,
        'fmt': format,
    })
    url = 'http://www.youtube.com/api/timedtext?' + params
    try:
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
    # An empty body means the track exists in the listing but has no data.
    if not sub:
        return (u'Did not fetch video subtitles', None, None)
    return (None, sub_lang, sub)
297 def _extract_subtitle(self, video_id):
299 Return a list with a tuple:
300 [(error_message, sub_lang, sub)]
302 sub_lang_list = self._get_available_subtitles(video_id)
303 sub_format = self._downloader.params.get('subtitlesformat')
304 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
305 return [(sub_lang_list[0], None, None)]
306 if self._downloader.params.get('subtitleslang', False):
307 sub_lang = self._downloader.params.get('subtitleslang')
308 elif 'en' in sub_lang_list:
311 sub_lang = list(sub_lang_list.keys())[0]
312 if not sub_lang in sub_lang_list:
313 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
315 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
318 def _extract_all_subtitles(self, video_id):
319 sub_lang_list = self._get_available_subtitles(video_id)
320 sub_format = self._downloader.params.get('subtitlesformat')
321 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
322 return [(sub_lang_list[0], None, None)]
324 for sub_lang in sub_lang_list:
325 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
326 subtitles.append(subtitle)
329 def _print_formats(self, formats):
330 print('Available formats:')
332 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
def _real_initialize(self):
    """Set language, optionally log in, and pass the age gate.

    Best-effort: every network failure is reported as a warning (or an
    error for the age gate) and aborts initialization without raising.
    NOTE(review): elided lines reconstructed from context — verify
    against upstream history.
    """
    if self._downloader is None:
        return

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
            return

    # Set language
    request = compat_urllib_request.Request(self._LANG_URL)
    try:
        self.report_lang()
        compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
        return

    # No authentication to be performed
    if username is None:
        return

    request = compat_urllib_request.Request(self._LOGIN_URL)
    try:
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
        return

    # Scrape the anti-forgery tokens out of the login form.
    galx = None
    dsh = None
    match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
    if match:
        galx = match.group(1)
    match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
    if match:
        dsh = match.group(1)

    # Log in
    login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
    }
    # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
    # chokes on unicode
    login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
    login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
    request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
    try:
        self.report_login()
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
        return

    # Confirm age
    age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
            }
    request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
    try:
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
        return
438 def _extract_id(self, url):
439 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
441 self._downloader.report_error(u'invalid URL: %s' % url)
443 video_id = mobj.group(2)
def _real_extract(self, url):
    """Extract metadata and download URLs for one YouTube video.

    Returns a list of info dicts (one per selected format), or None after
    reporting an error. NOTE(review): elided control-flow lines
    reconstructed from context — verify against upstream history.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
        return

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info: try several &el= variants until one yields a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                                note=False,
                                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        if 'token' in video_info:
            break
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
        else:
            self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
        return

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        self._downloader.report_error(u'"rental" videos not supported')
        return

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        self._downloader.report_error(u'unable to extract uploader name')
        return
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        self._downloader.report_error(u'unable to extract video title')
        return
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    if 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = ''
    else: # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            try:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
            except:
                pass

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        video_description = ''

    # subtitles
    video_subtitles = None

    if self._downloader.params.get('writesubtitles', False):
        video_subtitles = self._extract_subtitle(video_id)
        if video_subtitles:
            (sub_error, sub_lang, sub) = video_subtitles[0]
            if sub_error:
                self._downloader.report_error(sub_error)

    if self._downloader.params.get('allsubtitles', False):
        video_subtitles = self._extract_all_subtitles(video_id)
        for video_subtitle in video_subtitles:
            (sub_error, sub_lang, sub) = video_subtitle
            if sub_error:
                self._downloader.report_error(sub_error)

    if self._downloader.params.get('listsubtitles', False):
        sub_lang_list = self._list_available_subtitles(video_id)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # token
    video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [compat_parse_qs(uds) for uds in url_data_strs]
        url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.report_error(u'no known formats available for video')
            return
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    video_url_list = [(rf, url_map[rf])]
                    break
            if video_url_list is None:
                self._downloader.report_error(u'requested format not available')
                return
    else:
        self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
        return

    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'))

        results.append({
            'id':       video_id,
            'url':      video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'format':   video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
        })
    return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def report_disclaimer(self):
    """Report disclaimer retrieval."""
    self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[metacafe] Confirming age')

def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
def _real_initialize(self):
    """Fetch the disclaimer page, then POST past the family filter.

    Best-effort: failures are reported and abort initialization.
    """
    # Retrieve disclaimer
    request = compat_urllib_request.Request(self._DISCLAIMER)
    try:
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
        return

    # Confirm age
    disclaimer_form = {
        'filters': '0',
        'submit': "Continue - I'm over 18",
        }
    request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
    try:
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
        return
def _real_extract(self, url):
    """Extract the media URL, title and uploader for a metacafe video.

    yt-prefixed IDs are delegated to the YouTube extractor via the
    downloader. NOTE(review): elided guard lines reconstructed — verify
    against upstream history. The .decode('utf-8') calls are Python-2-era
    bytes handling.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        self._downloader.report_error(u'invalid URL: %s' % url)
        return

    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
        return

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
    try:
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
        return

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
    if mobj is not None:
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        if mobj is None:
            video_url = mediaURL
        else:
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
    else:
        # Fall back to the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract title')
        return
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'submitter=(.*?);', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract uploader nickname')
        return
    video_uploader = mobj.group(1)

    return [{
        'id':       video_id.decode('utf-8'),
        'url':      video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': None,
        'title':    video_title,
        'ext':      video_extension.decode('utf-8'),
    }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
def _real_extract(self, url):
    """Extract the best-quality MP4 URL, title and uploader for a video.

    NOTE(review): elided guard lines reconstructed — verify against
    upstream history.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        self._downloader.report_error(u'invalid URL: %s' % url)
        return

    # Strip optional "_slug" and query-string suffixes from the ID.
    video_id = mobj.group(1).split('_')[0].split('?')[0]

    video_extension = 'mp4'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    request.add_header('Cookie', 'family_filter=off')
    webpage = self._download_webpage(request, video_id)

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'\s*var flashvars = (.*)', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract media URL')
        return
    flashvars = compat_urllib_parse.unquote(mobj.group(1))

    # Pick the first (i.e. highest) quality key present in flashvars.
    for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
        if key in flashvars:
            max_quality = key
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            break
    else:
        self._downloader.report_error(u'unable to extract video URL')
        return

    mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
    if mobj is None:
        self._downloader.report_error(u'unable to extract video URL')
        return

    video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

    # TODO: support choosing qualities

    mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract title')
        return
    video_title = unescapeHTML(mobj.group('title'))

    video_uploader = None
    mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
    if mobj is None:
        # lookin for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.report_warning(u'unable to extract uploader nickname')
        else:
            video_uploader = mobj_official.group(1)
    else:
        video_uploader = mobj.group(1)

    video_upload_date = None
    mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
    if mobj is not None:
        # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

    return [{
        'id':       video_id,
        'url':      video_url,
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title':    video_title,
        'ext':      video_extension,
    }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
def _real_extract(self, url):
    """Extract the FLV URL, title and uploader for a Photobucket video.

    NOTE(review): elided guard lines reconstructed — verify against
    upstream history. The .decode('utf-8') calls are Python-2-era bytes
    handling.
    """
    # Extract id from URL
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        self._downloader.report_error(u'Invalid URL: %s' % url)
        return

    video_id = mobj.group(1)

    video_extension = 'flv'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    try:
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        return

    # Extract URL, uploader, and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract media URL')
        return
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))

    video_url = mediaURL

    mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract title')
        return
    video_title = mobj.group(1).decode('utf-8')

    video_uploader = mobj.group(2).decode('utf-8')

    return [{
        'id':       video_id.decode('utf-8'),
        'url':      video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title':    video_title,
        'ext':      video_extension.decode('utf-8'),
    }]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
949 def _real_extract(self, url, new_video=True):
950 # Extract ID from URL
951 mobj = re.match(self._VALID_URL, url)
953 self._downloader.report_error(u'Invalid URL: %s' % url)
956 video_id = mobj.group(2)
957 video_extension = 'flv'
959 # Rewrite valid but non-extractable URLs as
960 # extractable English language /watch/ URLs
961 if re.match(self._VPAGE_URL, url) is None:
962 request = compat_urllib_request.Request(url)
964 webpage = compat_urllib_request.urlopen(request).read()
965 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
966 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
969 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
971 self._downloader.report_error(u'Unable to extract id field')
973 yahoo_id = mobj.group(1)
975 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
977 self._downloader.report_error(u'Unable to extract vid field')
979 yahoo_vid = mobj.group(1)
981 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
982 return self._real_extract(url, new_video=False)
984 # Retrieve video webpage to extract further information
985 request = compat_urllib_request.Request(url)
987 self.report_download_webpage(video_id)
988 webpage = compat_urllib_request.urlopen(request).read()
989 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
990 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
993 # Extract uploader and title from webpage
994 self.report_extraction(video_id)
995 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
997 self._downloader.report_error(u'unable to extract video title')
999 video_title = mobj.group(1).decode('utf-8')
1001 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1003 self._downloader.report_error(u'unable to extract video uploader')
1005 video_uploader = mobj.group(1).decode('utf-8')
1007 # Extract video thumbnail
1008 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1010 self._downloader.report_error(u'unable to extract video thumbnail')
1012 video_thumbnail = mobj.group(1).decode('utf-8')
1014 # Extract video description
1015 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1017 self._downloader.report_error(u'unable to extract video description')
1019 video_description = mobj.group(1).decode('utf-8')
1020 if not video_description:
1021 video_description = 'No description available.'
1023 # Extract video height and width
1024 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1026 self._downloader.report_error(u'unable to extract video height')
1028 yv_video_height = mobj.group(1)
1030 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1032 self._downloader.report_error(u'unable to extract video width')
1034 yv_video_width = mobj.group(1)
1036 # Retrieve video playlist to extract media URL
1037 # I'm not completely sure what all these options are, but we
1038 # seem to need most of them, otherwise the server sends a 401.
1039 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1040 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1041 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1042 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1043 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1045 self.report_download_webpage(video_id)
1046 webpage = compat_urllib_request.urlopen(request).read()
1047 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1048 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1051 # Extract media URL from playlist XML
1052 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1054 self._downloader.report_error(u'Unable to extract media URL')
1056 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1057 video_url = unescapeHTML(video_url)
1060 'id': video_id.decode('utf-8'),
1062 'uploader': video_uploader,
1063 'upload_date': None,
1064 'title': video_title,
1065 'ext': video_extension.decode('utf-8'),
1066 'thumbnail': video_thumbnail.decode('utf-8'),
1067 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo URL via the page's config JSON."""
        # NOTE(review): several original lines ("if mobj is None:", "try:",
        # "return"-paths, loop else-branches, result-dict delimiters) are
        # missing from this excerpt; surviving statements kept verbatim.

        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached on a failed match — guard line missing.
        self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize: force https scheme and canonicalize direct-link URLs.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's inline script.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.report_error(u'unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        # uploader_id is the last path segment of the owner's profile URL.
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (from the page markup, not the JSON)
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date as YYYYMMDD from the ISO dateCreated meta tag.
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # "else:" line absent from this excerpt.
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket, preferring hd > sd > other.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # Reached when no codec matched (loop-else/break lines missing here).
        self._downloader.report_error(u'no known codec found')

        # Build the signed play_redirect URL for the chosen codec/quality.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Result dictionary (opening/closing lines absent from this excerpt).
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<n>.html; matched against the URL tail.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its raw page body.

        NOTE(review): the "try:" and "return webpage" lines are absent from
        this excerpt; statements kept verbatim.
        """
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and collect named groups.

        matchTuples is a list of (group_index, key, error_message); each
        matched group is stored under key in the info dict.
        NOTE(review): the "info = {}" initialization and "return info" lines
        are absent from this excerpt.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # Presumably only reached when the search failed — guard line missing.
        self._downloader.report_error(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            # "else:" line absent from this excerpt.
                info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve an arte.tv live-stream URL to its rtmp video_url."""
        # Language code is the 4th-from-last path segment of live URLs.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # First hop: locate the videothek JS that carries the stream info.
            r'src="(.*?/videothek_js.*?\.js)',
            # (flags / closing arguments absent from this excerpt)
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        # (call-closing lines absent from this excerpt)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # Second hop: pull the geo-restricted stream path, player swf, url.
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            # (remainder of the pattern absent from this excerpt)
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        # (call-closing lines absent from this excerpt)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte.tv +7 (catch-up) URL to its info dict."""
        # Language code is the 3rd-from-last path segment of +7 URLs.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # First hop: videorefFileUrl parameter of the flash movie.
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        # (call-closing lines absent from this excerpt)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            # Second hop: the language-specific <video> ref.
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        # (call-closing lines absent from this excerpt)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # Final hop: id, title, date and the hd-quality URL from the XML.
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)
        # Result dictionary (opening/closing lines absent from this excerpt).
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 extractor based on the URL tail."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        # "else:" / "return" lines absent from this excerpt.
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download (with a fallback warning outside tests)."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # NOTE(review): several original lines (method-name returns, "try:",
        # status checks, final "return" values) are absent from this excerpt;
        # surviving statements kept verbatim.

        class HeadRequest(compat_urllib_request.Request):
            # Forces HEAD instead of GET for the probe request.
            def get_method(self):
                # ("return 'HEAD'" line absent from this excerpt)

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # (headers argument line absent from this excerpt)
                                       origin_req_host=req.get_origin_req_host(),
                                       # (unverifiable argument line absent from this excerpt)
                # "else:" line absent from this excerpt.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # (fp.read()/fp.close() lines absent from this excerpt)
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      # (headers argument line absent from this excerpt)
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      # (unverifiable argument line absent from this excerpt)

        # Build a private opener: HEAD probe with our redirect/405 handling.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # ("if url == new_url: return False" lines absent from this excerpt)
        self.report_following_redirect(new_url)
        # Restart the extraction chain on the resolved URL.
        self._downloader.download([new_url])
        # ("return True" line absent from this excerpt)

    def _real_extract(self, url):
        """Best-effort extraction: follow redirects, then scrape common
        JW-Player/SWFObject patterns out of the page."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        # Enclosing "try:" line absent from this excerpt.
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # ("if mobj is None:" guards between fallbacks absent from excerpt)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.report_error(u'unable to extract title')
        video_uploader = mobj.group(1)

        # Result dictionary (opening/closing lines absent from this excerpt).
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Matches "ytsearch:", "ytsearchN:" and "ytsearchall:" pseudo-URLs.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested number of results.

        NOTE(review): "if mobj is None:", "try:", "return" and the
        "if prefix == '':" branch lines are absent from this excerpt.
        """
        mobj = re.match(self._VALID_URL, query)
        # Presumably only reached on a failed match — guard line missing.
        self._downloader.report_error(u'invalid search query "%s"' % query)

        # Split pseudo-URL into prefix (count) and the actual search terms.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix → single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # "else:" branch parsing prefix as an int (n) absent from excerpt.
        self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            # Clamp to the API's effective maximum.
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialization lines (video_ids, pagenum, limit) are
        # absent from this excerpt.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # GData paging is 1-based; 50 results per page.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # Enclosing "try:" line absent from this excerpt.
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API reports as available.
            limit = min(n, api_response['totalItems'])
            # (pagenum increment line absent from this excerpt)

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Matches "gvsearch:", "gvsearchN:" and "gvsearchall:" pseudo-URLs.
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next page" control in the result HTML.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested number of results.

        NOTE(review): "if mobj is None:", "try:", "return" and the
        "if prefix == '':" branch lines are absent from this excerpt.
        """
        mobj = re.match(self._VALID_URL, query)
        # Presumably only reached on a failed match — guard line missing.
        self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix → single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # "else:" branch parsing prefix as an int (n) absent from excerpt.
        self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # Clamp to the site's effective maximum.
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialization lines (video_ids, pagenum) and the
        # outer "while True:" line are absent from this excerpt.
        self.report_download_page(query, pagenum)
        # Result pages advance in steps of 10 via the start= parameter.
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        # Enclosing "try:" line absent from this excerpt.
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                    # ("return" line absent from this excerpt)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: flush whatever was collected.
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
            # ("return" line absent from this excerpt)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Matches "yvsearch:", "yvsearchN:" and "yvsearchall:" pseudo-URLs.
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Presence of the "Next" pager link in the result HTML.
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested number of results.

        NOTE(review): "if mobj is None:", "try:", "return" and the
        "if prefix == '':" branch lines are absent from this excerpt.
        """
        mobj = re.match(self._VALID_URL, query)
        # Presumably only reached on a failed match — guard line missing.
        self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix → single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # "else:" branch parsing prefix as an int (n) absent from excerpt.
        self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # Clamp to the site's effective maximum.
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialization lines (video_ids, pagenum) and the
        # outer "while True:" line are absent from this excerpt.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # Enclosing "try:" line absent from this excerpt.
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # ("return" line absent from this excerpt)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: flush whatever was collected.
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # ("return" line absent from this excerpt)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex; matched with re.VERBOSE (see suitable()).
    # NOTE(review): several alternation/closing lines of this pattern are
    # absent from this excerpt.
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    # ("@classmethod" decorator line absent from this excerpt)
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Download all playlist pages via the GData API and enqueue videos.

        NOTE(review): "if mobj is None:", "try:", "return", loop headers and
        initialization lines are absent from this excerpt.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # Presumably only reached on a failed match — guard line missing.
        self._downloader.report_error(u'invalid url: %s' % url)

        # Download playlist videos from API
        # One of the two capture groups holds the id, depending on URL form.
        playlist_id = mobj.group(1) or mobj.group(2)

        # (videos/page_num initialization and loop header absent from excerpt)
        self.report_download_page(playlist_id, page_num)

        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = compat_urllib_request.urlopen(url).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        response = json.loads(page)
        except ValueError as err:
            self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))

        if not 'feed' in response or not 'entry' in response['feed']:
            self._downloader.report_error(u'Got a malformed response from YouTube API')
        # Keep (position, src) so the final list can be position-sorted.
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        # A short page means this was the last one.
        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # ("break" line absent from this excerpt)

        videos = [v[1] for v in sorted(videos)]
        # ("total = len(videos)" line absent from this excerpt)

        # Honor --playlist-start / --playlist-end slicing.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        # "else:" line absent from this excerpt.
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        # "else:" line absent from this excerpt.
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Pager marker ("Next »") whose absence signals the last page.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk the channel's /videos pages and enqueue every video found.

        NOTE(review): "if mobj is None:", "try:", "return", loop headers and
        initialization lines are absent from this excerpt.
        """
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached on a failed match — guard line missing.
        self._downloader.report_error(u'invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)

        # (video_ids/pagenum initialization and loop header absent)
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        # Enclosing "try:" line absent from this excerpt.
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers (deduplicated per page)
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # No "Next" pager → this was the last page.
        if self._MORE_PAGES_INDICATOR not in page:
            # ("break" line absent from this excerpt)
        pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # Matches user pages and the "ytuser:" pseudo-URL scheme.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps a single query at 50 results, hence the paging below.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                                   (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Page through a user's uploads feed and enqueue every video.

        NOTE(review): "if mobj is None:", "try:", "return", loop headers and
        initialization lines are absent from this excerpt.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached on a failed match — guard line missing.
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # (continuation of comment and loop header absent from excerpt)
        # GData start-index is 1-based.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        # Enclosing "try:" line absent from this excerpt.
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers (deduplicated per page)
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # (continuation of comment absent from excerpt)
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # ("break" line absent from this excerpt)

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end slicing.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # "else:" line absent from this excerpt.
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                                   (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Collects the video ids of every video a user has posted (via the
    mobile-site AJAX listing, paged _PAGE_SIZE at a time) and queues each
    one for download through the regular blip.tv extractor.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Result size of one AJAX listing query; a short page means the last page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The numeric users_id needed by the listing API is embedded in
            # the user's HTML page.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fix: use compat_str(err) for consistency with every other
                # network-error handler in this file (was a bare str(err)).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based, -1 == open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the 'Free download' button and scrapes the resulting
    page for the real file URL and title.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string for the regex (the old '\s+' relied on the
                # implicit escape surviving, which is deprecated).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        # Fix: the old code called .decode('utf-8') on these values, which
        # raises AttributeError on Python 3 where str has no .decode().
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials from --username/--password or .netrc)
    so that restricted videos become visible, then scrapes the embedded
    SWF parameters for the direct video URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Login is optional: without credentials we simply proceed anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the login did not succeed.
            # (Fixed typo in the message: "exceded" -> "exceeded".)
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two literal JS fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD source, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Requests the page with skin=json to get machine-readable metadata;
    also handles /play/ redirect pages and direct video responses.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; rewrite to the canonical /a/a-<id> URL and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # Fix: dropped title.decode('UTF-8') -- on Python 3 str has
                # no .decode() and this raised AttributeError.
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: was self._download.report_error -- there is no
            # `_download` attribute, which raised AttributeError instead of
            # reporting the bad URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server path; the flv lives
        # next to it under the same hash directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # The pattern above is written with re.VERBOSE, so the default
        # suitable() from the base class cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut forms like ":tds" expand to the newest full episode page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode -> follow the redirect to the newest one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One MRSS <item> per act/part of the episode.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v

            # The RTMP urls are not usable directly; rewrite the storage
            # path onto the known HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads the og: meta tags for description/thumbnail/player, then fetches
    the player's JS config to obtain the actual media URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset announced in the response headers,
            # defaulting to utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        # Fix: these matches were used unchecked; a page-layout change made
        # the extractor die with AttributeError instead of a clean error.
        if descMatch is None or imgMatch is None or playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract info from webpage')
            return
        description = unescapeHTML(descMatch.group(1))
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract info from webpage')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: the moogaloop metadata XML, which carries the
        # title/description/thumbnail and a pointer to the f4m manifest.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Second request: the Adobe HDS (f4m) manifest for the media id.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Assemble the first-segment URL from the manifest location and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded inside the flashvars).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink into the numeric track id via the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the streams endpoint for the actual mp3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): this IE_NAME duplicates SoundcloudIE's; presumably it
    # should be distinct (e.g. u'soundcloud:set') -- left unchanged because
    # callers may match on it.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: replaced the deprecated self._downloader.trouble(u'ERROR: ...')
            # calls with report_error, consistent with the other extractors.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the set permalink into its track list via the API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the RTMP path is base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # The media id and extension come from the final path component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2956 class MixcloudIE(InfoExtractor):
# NOTE(review): gaps in the embedded source-line numbers within this class
# show statements (try: headers, return statements, the info-dict opener)
# were dropped from this listing; not runnable as shown, code left unchanged.
2957 """Information extractor for www.mixcloud.com"""
2959 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2960 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2961 IE_NAME = u'mixcloud'
2963 def __init__(self, downloader=None):
2964 InfoExtractor.__init__(self, downloader)
2966 def report_download_json(self, file_id):
2967 """Report JSON download."""
2968 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2970 def report_extraction(self, file_id):
2971 """Report information extraction."""
2972 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2974 def get_urls(self, jsonData, fmt, bitrate='best'):
2975 """Get urls from 'audio_formats' section in json"""
2978 bitrate_list = jsonData[fmt]
2979 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2980 bitrate = max(bitrate_list) # select highest
2982 url_list = jsonData[fmt][bitrate]
2983 except TypeError: # we have no bitrate info.
2984 url_list = jsonData[fmt]
2987 def check_urls(self, url_list):
2988 """Returns 1st active url from list"""
2989 for url in url_list:
2991 compat_urllib_request.urlopen(url)
2993 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2998 def _print_formats(self, formats):
# Dump the format table to stdout (used for --list-formats style output).
2999 print('Available formats:')
3000 for fmt in formats.keys():
3001 for b in formats[fmt]:
3003 ext = formats[fmt][b][0]
3004 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3005 except TypeError: # we have no bitrate info
3006 ext = formats[fmt][0]
3007 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3010 def _real_extract(self, url):
3011 mobj = re.match(self._VALID_URL, url)
3013 self._downloader.report_error(u'invalid URL: %s' % url)
3015 # extract uploader & filename from url
3016 uploader = mobj.group(1).decode('utf-8')
3017 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3019 # construct API request
3020 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3021 # retrieve .json file with links to files
3022 request = compat_urllib_request.Request(file_url)
3024 self.report_download_json(file_url)
3025 jsonData = compat_urllib_request.urlopen(request).read()
3026 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3027 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3031 json_data = json.loads(jsonData)
3032 player_url = json_data['player_swf_url']
3033 formats = dict(json_data['audio_formats'])
3035 req_format = self._downloader.params.get('format', None)
3038 if self._downloader.params.get('listformats', None):
3039 self._print_formats(formats)
3042 if req_format is None or req_format == 'best':
3043 for format_param in formats.keys():
3044 url_list = self.get_urls(formats, format_param)
3046 file_url = self.check_urls(url_list)
3047 if file_url is not None:
3050 if req_format not in formats:
3051 self._downloader.report_error(u'format is not available')
3054 url_list = self.get_urls(formats, req_format)
3055 file_url = self.check_urls(url_list)
3056 format_param = req_format
# NOTE(review): the .decode() calls below imply this path was written for
# Python 2 byte strings -- confirm against the compat layer before porting.
3059 'id': file_id.decode('utf-8'),
3060 'url': file_url.decode('utf-8'),
3061 'uploader': uploader.decode('utf-8'),
3062 'upload_date': None,
3063 'title': json_data['name'],
3064 'ext': file_url.split('.')[-1].decode('utf-8'),
3065 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3066 'thumbnail': json_data['thumbnail_url'],
3067 'description': json_data['description'],
3068 'player_url': player_url.decode('utf-8'),
3071 class StanfordOpenClassroomIE(InfoExtractor):
# NOTE(review): gaps in the embedded source-line numbers within this class
# show statements (try: headers, info-dict openers, returns) were dropped
# from this listing; not runnable as shown, code left byte-identical.
3072 """Information extractor for Stanford's Open ClassRoom"""
3074 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3075 IE_NAME = u'stanfordoc'
3077 def report_download_webpage(self, objid):
3078 """Report information extraction."""
3079 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3081 def report_extraction(self, video_id):
3082 """Report information extraction."""
3083 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3085 def _real_extract(self, url):
# Three-way dispatch on the URL: a single video, a course page (list of
# videos), or the site root (list of courses). The two list cases recurse
# through self.extract() on each referenced page.
3086 mobj = re.match(self._VALID_URL, url)
3088 raise ExtractorError(u'Invalid URL: %s' % url)
3090 if mobj.group('course') and mobj.group('video'): # A specific video
3091 course = mobj.group('course')
3092 video = mobj.group('video')
3094 'id': course + '_' + video,
3096 'upload_date': None,
3099 self.report_extraction(info['id'])
3100 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3101 xmlUrl = baseUrl + video + '.xml'
3103 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3104 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3105 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3107 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3109 info['title'] = mdoc.findall('./title')[0].text
3110 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3112 self._downloader.report_error(u'Invalid metadata XML file')
3114 info['ext'] = info['url'].rpartition('.')[2]
3116 elif mobj.group('course'): # A course page
3117 course = mobj.group('course')
3122 'upload_date': None,
3125 coursepage = self._download_webpage(url, info['id'],
3126 note='Downloading course info page',
3127 errnote='Unable to download course info page')
3129 m = re.search('<h1>([^<]+)</h1>', coursepage)
3131 info['title'] = unescapeHTML(m.group(1))
3133 info['title'] = info['id']
3135 m = re.search('<description>([^<]+)</description>', coursepage)
3137 info['description'] = unescapeHTML(m.group(1))
3139 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3142 'type': 'reference',
3143 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3147 for entry in info['list']:
3148 assert entry['type'] == 'reference'
3149 results += self.extract(entry['url'])
3153 'id': 'Stanford OpenClassroom',
3156 'upload_date': None,
3159 self.report_download_webpage(info['id'])
3160 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3162 rootpage = compat_urllib_request.urlopen(rootURL).read()
3163 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3164 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3167 info['title'] = info['id']
3169 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3172 'type': 'reference',
3173 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3178 for entry in info['list']:
3179 assert entry['type'] == 'reference'
3180 results += self.extract(entry['url'])
3183 class MTVIE(InfoExtractor):
# NOTE(review): gaps in the embedded source-line numbers within this class
# show guard lines ("if mobj is None:", "return") were dropped from this
# listing; not runnable as shown, code left byte-identical.
3184 """Information extractor for MTV.com"""
3186 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3189 def report_extraction(self, video_id):
3190 """Report information extraction."""
3191 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3193 def _real_extract(self, url):
3194 mobj = re.match(self._VALID_URL, url)
3196 self._downloader.report_error(u'invalid URL: %s' % url)
3198 if not mobj.group('proto'):
3199 url = 'http://' + url
3200 video_id = mobj.group('videoid')
3202 webpage = self._download_webpage(url, video_id)
# Song/performer metadata lives in <meta name="mtv_*"> tags; the page is
# declared iso-8859-1, hence the explicit decodes.
3204 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3206 self._downloader.report_error(u'unable to extract song name')
3208 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3209 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3211 self._downloader.report_error(u'unable to extract performer')
3213 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3214 video_title = performer + ' - ' + song_name
3216 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3218 self._downloader.report_error(u'unable to mtvn_uri')
3220 mtvn_uri = mobj.group(1)
3222 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3224 self._downloader.report_error(u'unable to extract content id')
3226 content_id = mobj.group(1)
3228 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3229 self.report_extraction(video_id)
3230 request = compat_urllib_request.Request(videogen_url)
3232 metadataXml = compat_urllib_request.urlopen(request).read()
3233 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3234 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3237 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3238 renditions = mdoc.findall('.//rendition')
3240 # For now, always pick the highest quality.
3241 rendition = renditions[-1]
3244 _,_,ext = rendition.attrib['type'].partition('/')
3245 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3246 video_url = rendition.find('./src').text
3248 self._downloader.trouble('Invalid rendition field.')
3254 'uploader': performer,
3255 'upload_date': None,
3256 'title': video_title,
3264 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com. Reconstructs segment file ids from
# a seed-driven character shuffle, then downloads each segment separately.
# NOTE(review): gaps in the embedded source-line numbers within this class
# show lines (loop bodies, return statements, the format-selection branches)
# were dropped from this listing; not runnable as shown, code left unchanged.
3265 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3267 def report_download_webpage(self, file_id):
3268 """Report webpage download."""
3269 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3271 def report_extraction(self, file_id):
3272 """Report information extraction."""
3273 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: current time in ms concatenated with two random components.
3276 nowTime = int(time.time() * 1000)
3277 random1 = random.randint(1000,1998)
3278 random2 = random.randint(1000,9999)
3280 return "%d%d%d" %(nowTime,random1,random2)
3282 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the character alphabet, driven by
# the server-provided seed (linear congruential step: seed*211+30031 mod 65536).
3284 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3286 for i in range(len(source)):
3287 seed = (seed * 211 + 30031 ) % 65536
3288 index = math.floor(seed / 65536 * len(source) )
3289 mixed.append(source[int(index)])
3290 source.remove(source[int(index)])
3291 #return ''.join(mixed)
3294 def _get_file_id(self, fileId, seed):
# Map each '*'-separated index in fileId through the shuffled alphabet.
3295 mixed = self._get_file_ID_mix_string(seed)
3296 ids = fileId.split('*')
3300 realId.append(mixed[int(ch)])
3301 return ''.join(realId)
3303 def _real_extract(self, url):
3304 mobj = re.match(self._VALID_URL, url)
3306 self._downloader.report_error(u'invalid URL: %s' % url)
3308 video_id = mobj.group('ID')
3310 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3312 request = compat_urllib_request.Request(info_url, None, std_headers)
3314 self.report_download_webpage(video_id)
3315 jsondata = compat_urllib_request.urlopen(request).read()
3316 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3317 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3320 self.report_extraction(video_id)
3322 jsonstr = jsondata.decode('utf-8')
3323 config = json.loads(jsonstr)
3325 video_title = config['data'][0]['title']
3326 seed = config['data'][0]['seed']
3328 format = self._downloader.params.get('format', None)
3329 supported_format = list(config['data'][0]['streamfileids'].keys())
3331 if format is None or format == 'best':
3332 if 'hd2' in supported_format:
3337 elif format == 'worst':
3345 fileid = config['data'][0]['streamfileids'][format]
3346 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3347 except (UnicodeDecodeError, ValueError, KeyError):
3348 self._downloader.report_error(u'unable to extract info section')
3352 sid = self._gen_sid()
3353 fileid = self._get_file_id(fileid, seed)
3355 #column 8,9 of fileid represent the segment number
3356 #fileid[7:9] should be changed
3357 for index, key in enumerate(keys):
3359 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3360 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3363 'id': '%s_part%02d' % (video_id, index),
3364 'url': download_url,
3366 'upload_date': None,
3367 'title': video_title,
3370 files_info.append(info)
3375 class XNXXIE(InfoExtractor):
# NOTE(review): gaps in the embedded source-line numbers within this class
# show guard lines ("if ... is None:", "return", the info-dict opener) were
# dropped from this listing; not runnable as shown, code left byte-identical.
3376 """Information extractor for xnxx.com"""
3378 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3380 VIDEO_URL_RE = r'flv_url=(.*?)&'
3381 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3382 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3384 def report_webpage(self, video_id):
3385 """Report information extraction"""
3386 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3388 def report_extraction(self, video_id):
3389 """Report information extraction"""
3390 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3392 def _real_extract(self, url):
3393 mobj = re.match(self._VALID_URL, url)
3395 self._downloader.report_error(u'invalid URL: %s' % url)
3397 video_id = mobj.group(1)
3399 self.report_webpage(video_id)
3401 # Get webpage content
3403 webpage_bytes = compat_urllib_request.urlopen(url).read()
3404 webpage = webpage_bytes.decode('utf-8')
3405 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3406 self._downloader.report_error(u'unable to download video webpage: %s' % err)
3409 result = re.search(self.VIDEO_URL_RE, webpage)
3411 self._downloader.report_error(u'unable to extract video url')
3413 video_url = compat_urllib_parse.unquote(result.group(1))
3415 result = re.search(self.VIDEO_TITLE_RE, webpage)
3417 self._downloader.report_error(u'unable to extract video title')
3419 video_title = result.group(1)
3421 result = re.search(self.VIDEO_THUMB_RE, webpage)
3423 self._downloader.report_error(u'unable to extract video thumbnail')
3425 video_thumbnail = result.group(1)
3431 'upload_date': None,
3432 'title': video_title,
3434 'thumbnail': video_thumbnail,
3435 'description': None,
3439 class GooglePlusIE(InfoExtractor):
# NOTE(review): gaps in the embedded source-line numbers within this class
# show lines (try: headers, "if mobj is None:" guards, returns) were dropped
# from this listing; not runnable as shown, code left byte-identical.
3440 """Information extractor for plus.google.com."""
3442 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3443 IE_NAME = u'plus.google'
3445 def __init__(self, downloader=None):
3446 InfoExtractor.__init__(self, downloader)
3448 def report_extract_entry(self, url):
3449 """Report downloading extry"""
3450 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3452 def report_date(self, upload_date):
3453 """Report downloading extry"""
3454 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3456 def report_uploader(self, uploader):
3457 """Report downloading extry"""
3458 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3460 def report_title(self, video_title):
3461 """Report downloading extry"""
3462 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3464 def report_extract_vid_page(self, video_page):
3465 """Report information extraction."""
3466 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3468 def _real_extract(self, url):
3469 # Extract id from URL
3470 mobj = re.match(self._VALID_URL, url)
3472 self._downloader.report_error(u'Invalid URL: %s' % url)
3475 post_url = mobj.group(0)
3476 video_id = mobj.group(1)
3478 video_extension = 'flv'
3480 # Step 1, Retrieve post webpage to extract further information
3481 self.report_extract_entry(post_url)
3482 request = compat_urllib_request.Request(post_url)
3484 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3485 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3486 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3489 # Extract update date
3491 pattern = 'title="Timestamp">(.*?)</a>'
3492 mobj = re.search(pattern, webpage)
3494 upload_date = mobj.group(1)
3495 # Convert timestring to a format suitable for filename
3496 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3497 upload_date = upload_date.strftime('%Y%m%d')
3498 self.report_date(upload_date)
3502 pattern = r'rel\="author".*?>(.*?)</a>'
3503 mobj = re.search(pattern, webpage)
3505 uploader = mobj.group(1)
3506 self.report_uploader(uploader)
3509 # Get the first line for title
3511 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3512 mobj = re.search(pattern, webpage)
3514 video_title = mobj.group(1)
3515 self.report_title(video_title)
3517 # Step 2, Stimulate clicking the image box to launch video
3518 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3519 mobj = re.search(pattern, webpage)
3521 self._downloader.report_error(u'unable to extract video page URL')
3523 video_page = mobj.group(1)
3524 request = compat_urllib_request.Request(video_page)
3526 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3527 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3528 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3530 self.report_extract_vid_page(video_page)
3533 # Extract video links on video page
3534 """Extract video links of all sizes"""
3535 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3536 mobj = re.findall(pattern, webpage)
3538 self._downloader.report_error(u'unable to extract video links')
3540 # Sort in resolution
3541 links = sorted(mobj)
3543 # Choose the lowest of the sort, i.e. highest resolution
3544 video_url = links[-1]
3545 # Only get the url. The resolution part in the tuple has no use anymore
3546 video_url = video_url[-1]
3547 # Treat escaped \u0026 style hex
3549 video_url = video_url.decode("unicode_escape")
3550 except AttributeError: # Python 3
3551 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3557 'uploader': uploader,
3558 'upload_date': upload_date,
3559 'title': video_title,
3560 'ext': video_extension,
3563 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages; builds the CDN mp4 URL
# directly from the path component of the page URL.
# NOTE(review): gaps in the embedded source-line numbers within this class
# show guard lines and parts of the info dict were dropped from this listing;
# not runnable as shown, code left byte-identical.
3564 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3567 def _real_extract(self, url):
3568 mobj = re.match(self._VALID_URL, url)
3570 self._downloader.report_error(u'invalid URL: %s' % url)
3573 video_id = mobj.group(1)
3574 if video_id.endswith('/index.html'):
3575 video_id = video_id[:-len('/index.html')]
3577 webpage = self._download_webpage(url, video_id)
3579 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3580 def _findProp(rexp, default=None):
3581 m = re.search(rexp, webpage)
3583 return unescapeHTML(m.group(1))
3587 shortened_video_id = video_id.rpartition('/')[2]
3588 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3590 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' (the key
# documented on InfoExtractor) -- confirm against the info-dict contract.
3594 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3595 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3599 class JustinTVIE(InfoExtractor):
# NOTE(review): gaps in the embedded source-line numbers within this class
# show lines (try: headers, returns, the paged-loop header) were dropped
# from this listing; not runnable as shown, code left byte-identical.
3600 """Information extractor for justin.tv and twitch.tv"""
3601 # TODO: One broadcast may be split into multiple videos. The key
3602 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3603 # starts at 1 and increases. Can we treat all parts as one video?
3605 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3606 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3607 _JUSTIN_PAGE_LIMIT = 100
3608 IE_NAME = u'justin.tv'
3610 def report_extraction(self, file_id):
3611 """Report information extraction."""
3612 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3614 def report_download_page(self, channel, offset):
3615 """Report attempt to download a single page of videos."""
3616 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3617 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3619 # Return count of items, list of *valid* items
3620 def _parse_page(self, url):
3622 urlh = compat_urllib_request.urlopen(url)
3623 webpage_bytes = urlh.read()
3624 webpage = webpage_bytes.decode('utf-8', 'ignore')
3625 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3626 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3629 response = json.loads(webpage)
3630 if type(response) != list:
3631 error_text = response.get('error', 'unknown error')
3632 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3635 for clip in response:
3636 video_url = clip['video_file_url']
3638 video_extension = os.path.splitext(video_url)[1][1:]
3639 video_date = re.sub('-', '', clip['start_time'][:10])
3640 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3641 video_id = clip['id']
3642 video_title = clip.get('title', video_id)
3646 'title': video_title,
3647 'uploader': clip.get('channel_name', video_uploader_id),
3648 'uploader_id': video_uploader_id,
3649 'upload_date': video_date,
3650 'ext': video_extension,
3652 return (len(response), info)
3654 def _real_extract(self, url):
3655 mobj = re.match(self._VALID_URL, url)
3657 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 1 = channel name, group 2 (optional) = archived-broadcast id; pick
# the API endpoint accordingly and page through results in _JUSTIN_PAGE_LIMIT
# chunks until a short page signals the end.
3660 api = 'http://api.justin.tv'
3661 video_id = mobj.group(mobj.lastindex)
3663 if mobj.lastindex == 1:
3665 api += '/channel/archives/%s.json'
3667 api += '/broadcast/by_archive/%s.json'
3668 api = api % (video_id,)
3670 self.report_extraction(video_id)
3674 limit = self._JUSTIN_PAGE_LIMIT
3677 self.report_download_page(video_id, offset)
3678 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3679 page_count, page_info = self._parse_page(page_url)
3680 info.extend(page_info)
3681 if not paged or page_count != limit:
3686 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com videos.
# NOTE(review): gaps in the embedded source-line numbers within this class
# show guard lines and the final info dict were dropped from this listing;
# not runnable as shown, code left byte-identical.
3687 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3689 def _real_extract(self, url):
3690 mobj = re.match(self._VALID_URL, url)
3692 self._downloader.report_error(u'invalid URL: %s' % url)
3695 video_id = mobj.group('id')
3696 webpage = self._download_webpage(url, video_id)
3698 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3700 self._downloader.report_error(u'unable to find video information')
3701 video_url = unescapeHTML(m.group('url'))
3703 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3705 self._downloader.trouble(u'Cannot find video title')
3706 title = clean_html(m.group('title'))
3708 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3710 desc = unescapeHTML(m.group('desc'))
3719 'description': desc,
3723 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com trailer pages; a single
# game page can yield several videos, matched up with titles and thumbnails.
# NOTE(review): gaps in the embedded source-line numbers within this class
# show lines (part of _VALID_URL, the per-video info dict, the return) were
# dropped from this listing; not runnable as shown, code left byte-identical.
3724 _VALID_URL = r"""http://store.steampowered.com/
3725 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3727 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3731 def suitable(cls, url):
3732 """Receives a URL and returns True if suitable for this IE."""
3733 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3735 def _real_extract(self, url):
3736 m = re.match(self._VALID_URL, url, re.VERBOSE)
3737 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3738 gameID = m.group('gameID')
3739 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3740 webpage = self._download_webpage(videourl, gameID)
3741 mweb = re.finditer(urlRE, webpage)
3742 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3743 titles = re.finditer(namesRE, webpage)
3744 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3745 thumbs = re.finditer(thumbsRE, webpage)
3747 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3748 video_id = vid.group('videoID')
3749 title = vtitle.group('videoName')
3750 video_url = vid.group('videoURL')
3751 video_thumb = thumb.group('thumbnail')
3753 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3758 'title': unescapeHTML(title),
3759 'thumbnail': video_thumb
3764 class UstreamIE(InfoExtractor):
# Information extractor for www.ustream.tv recorded videos; the download URL
# is derived directly from the numeric video id.
# NOTE(review): gaps in the embedded source-line numbers within this class
# show the final info dict and return were dropped from this listing;
# not runnable as shown, code left byte-identical.
3765 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3766 IE_NAME = u'ustream'
3768 def _real_extract(self, url):
3769 m = re.match(self._VALID_URL, url)
3770 video_id = m.group('videoID')
3771 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3772 webpage = self._download_webpage(url, video_id)
3773 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3774 title = m.group('title')
3775 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3776 uploader = m.group('uploader')
3782 'uploader': uploader
3786 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com video
# pages; scrapes the media URL straight out of the page source.
# NOTE(review): gaps in the embedded source-line numbers within this class
# show lines (try: headers, ext assignment, the final info dict) were dropped
# from this listing; not runnable as shown, code left byte-identical.
3787 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3788 IE_NAME = u'WorldStarHipHop'
3790 def _real_extract(self, url):
3791 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3793 webpage_src = compat_urllib_request.urlopen(url).read()
3794 webpage_src = webpage_src.decode('utf-8')
3796 mobj = re.search(_src_url, webpage_src)
3798 m = re.match(self._VALID_URL, url)
3799 video_id = m.group('id')
3801 if mobj is not None:
3802 video_url = mobj.group()
3803 if 'mp4' in video_url:
3808 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3811 _title = r"""<title>(.*)</title>"""
3813 mobj = re.search(_title, webpage_src)
3815 if mobj is not None:
3816 title = mobj.group(1)
3818 title = 'World Start Hip Hop - %s' % time.ctime()
3820 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3821 mobj = re.search(_thumbnail, webpage_src)
3823 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3824 if mobj is not None:
3825 thumbnail = mobj.group(1)
3827 _title = r"""candytitles.*>(.*)</span>"""
3828 mobj = re.search(_title, webpage_src)
3829 if mobj is not None:
3830 title = mobj.group(1)
3837 'thumbnail' : thumbnail,
3842 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows; metadata comes from a JSON
# blob embedded in an inline <script> tag.
# NOTE(review): gaps in the embedded source-line numbers within this class
# show lines (try: header, the info-dict opener, the return) were dropped
# from this listing; not runnable as shown, code left byte-identical.
3843 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3845 def _real_extract(self, url):
3846 m = re.match(self._VALID_URL, url)
3847 video_id = m.group('videoID')
3849 webpage = self._download_webpage(url, video_id)
3850 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3852 raise ExtractorError(u'Cannot find metadata')
3853 json_data = m.group(1)
3856 data = json.loads(json_data)
3857 except ValueError as e:
3858 raise ExtractorError(u'Invalid JSON: ' + str(e))
3860 video_url = data['akamai_url'] + '&cbr=256'
3861 url_parts = compat_urllib_parse_urlparse(video_url)
3862 video_ext = url_parts.path.rpartition('.')[2]
3867 'title': data['title'],
3868 'description': data.get('teaser_text'),
3869 'location': data.get('country_of_origin'),
3870 'uploader': data.get('host', {}).get('name'),
3871 'uploader_id': data.get('host', {}).get('slug'),
3872 'thumbnail': data.get('image', {}).get('large_url_2x'),
3873 'duration': data.get('duration'),
3878 class YouPornIE(InfoExtractor):
# NOTE(review): gaps in the embedded source-line numbers within this class
# show lines (loop bodies, the per-format info dict opener, returns, the
# size/bitrate unpacking) were dropped from this listing; not runnable as
# shown, code left byte-identical.
3879 """Information extractor for youporn.com."""
3880 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3882 def _print_formats(self, formats):
3883 """Print all available formats"""
3884 print(u'Available formats:')
3885 print(u'ext\t\tformat')
3886 print(u'---------------------------------')
3887 for format in formats:
3888 print(u'%s\t\t%s' % (format['ext'], format['format']))
3890 def _specific(self, req_format, formats):
# Select the single format entry whose 'format' field matches req_format.
3892 if(x["format"]==req_format):
3896 def _real_extract(self, url):
3897 mobj = re.match(self._VALID_URL, url)
3899 self._downloader.report_error(u'invalid URL: %s' % url)
3902 video_id = mobj.group('videoid')
# Age gate is bypassed by presenting the age_verified cookie up front.
3904 req = compat_urllib_request.Request(url)
3905 req.add_header('Cookie', 'age_verified=1')
3906 webpage = self._download_webpage(req, video_id)
3908 # Get the video title
3909 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3911 raise ExtractorError(u'Unable to extract video title')
3912 video_title = result.group('title').strip()
3914 # Get the video date
3915 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3917 self._downloader.report_warning(u'unable to extract video date')
3920 upload_date = result.group('date').strip()
3922 # Get the video uploader
3923 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3925 self._downloader.report_warning(u'unable to extract uploader')
3926 video_uploader = None
3928 video_uploader = result.group('uploader').strip()
3929 video_uploader = clean_html( video_uploader )
3931 # Get all of the formats available
3932 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3933 result = re.search(DOWNLOAD_LIST_RE, webpage)
3935 raise ExtractorError(u'Unable to extract download list')
3936 download_list_html = result.group('download_list').strip()
3938 # Get all of the links from the page
3939 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3940 links = re.findall(LINK_RE, download_list_html)
3941 if(len(links) == 0):
3942 raise ExtractorError(u'ERROR: no known formats available for video')
3944 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3949 # A link looks like this:
3950 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3951 # A path looks like this:
3952 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3953 video_url = unescapeHTML( link )
3954 path = compat_urllib_parse_urlparse( video_url ).path
3955 extension = os.path.splitext( path )[1][1:]
3956 format = path.split('/')[4].split('_')[:2]
3959 format = "-".join( format )
3960 title = u'%s-%s-%s' % (video_title, size, bitrate)
3965 'uploader': video_uploader,
3966 'upload_date': upload_date,
3971 'description': None,
3975 if self._downloader.params.get('listformats', None):
3976 self._print_formats(formats)
3979 req_format = self._downloader.params.get('format', None)
3980 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Format selection: best = first entry, worst = last, '-1'/'all' = every
# format, otherwise look up the exact requested format string.
3982 if req_format is None or req_format == 'best':
3984 elif req_format == 'worst':
3985 return [formats[-1]]
3986 elif req_format in ('-1', 'all'):
3989 format = self._specific( req_format, formats )
3991 self._downloader.report_error(u'requested format not available')
3997 class PornotubeIE(InfoExtractor):
# NOTE(review): gaps in the embedded source-line numbers within this class
# show guard lines and part of the final info dict were dropped from this
# listing; not runnable as shown, code left byte-identical.
3998 """Information extractor for pornotube.com."""
3999 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
4001 def _real_extract(self, url):
4002 mobj = re.match(self._VALID_URL, url)
4004 self._downloader.report_error(u'invalid URL: %s' % url)
4007 video_id = mobj.group('videoid')
4008 video_title = mobj.group('title')
4010 # Get webpage content
4011 webpage = self._download_webpage(url, video_id)
4014 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4015 result = re.search(VIDEO_URL_RE, webpage)
4017 self._downloader.report_error(u'unable to extract video url')
4019 video_url = compat_urllib_parse.unquote(result.group('url'))
4021 #Get the uploaded date
4022 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4023 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): the error message below says 'video title' but this branch
# handles a failed upload-date match -- looks like a copy-paste slip; confirm
# before changing the user-visible string.
4025 self._downloader.report_error(u'unable to extract video title')
4027 upload_date = result.group('date')
4029 info = {'id': video_id,
4032 'upload_date': upload_date,
4033 'title': video_title,
4039 class YouJizzIE(InfoExtractor):
# NOTE(review): gaps in the embedded source-line numbers within this class
# show guard lines and parts of the final info dict were dropped from this
# listing; not runnable as shown, code left byte-identical.
4040 """Information extractor for youjizz.com."""
4041 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4043 def _real_extract(self, url):
4044 mobj = re.match(self._VALID_URL, url)
4046 self._downloader.report_error(u'invalid URL: %s' % url)
4049 video_id = mobj.group('videoid')
4051 # Get webpage content
4052 webpage = self._download_webpage(url, video_id)
4054 # Get the video title
4055 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4057 raise ExtractorError(u'ERROR: unable to extract video title')
4058 video_title = result.group('title').strip()
4060 # Get the embed page
4061 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4063 raise ExtractorError(u'ERROR: unable to extract embed page')
4065 embed_page_url = result.group(0).strip()
4066 video_id = result.group('videoid')
4068 webpage = self._download_webpage(embed_page_url, video_id)
# The actual flv URL is set on the embed page via a flash addVariable call.
4071 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4073 raise ExtractorError(u'ERROR: unable to extract video url')
4074 video_url = result.group('source')
4076 info = {'id': video_id,
4078 'title': video_title,
4081 'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of tracks)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play API, returning one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment on the page
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # The play API requires a (random) session token
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']  # numeric mix id from the page JSON (was undefined in the buggy version)
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        res = []
        # The API only hands out one track at a time; iterate until it
        # reports the last track of the set.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build CDN URLs from the video id and scrape title/uploader."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Both the video and its thumbnail live on keek's CDN, addressed by id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader'))
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex: matched with re.VERBOSE everywhere, so it must be
    # kept as a properly terminated triple-quoted raw string.
    _VALID_URL = r'''http://www.ted.com/
                     (
                          ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                          |
                          ((?P<type_talk>talks)) # We have a simple talk
                     )
                     (/lang/(.*?))? # The url may contain the language
                     /(?P<name>\w+) # Here goes the name and then ".html"
                     '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
            return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)
        info = []
        # Pair each talk entry with its title link; zip stops at the shorter
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url, video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
               }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the site's XML metadata endpoint and read url/title/format."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # fall back to the file extension when no explicit format id is given
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Scrape the title from the page and stream data from the flash XML."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # Per-video XML descriptor keyed by the numeric id
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry is the highest-quality variant
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the file URL and og: metadata from a LiveLeak view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # use report_error for consistency (trouble() is the deprecated spelling)
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            return
        # Strip the site's own branding prefix from the og:title
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional fields
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek (ardmediathek.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Pick the best-quality stream registered on the page's media collection."""
        # determine video id from url: prefer an explicit documentId query
        # parameter, otherwise fall back to the last path component
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # no streams registered: expected only for age-restricted ("fsk") pages
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4405 def gen_extractors():
4406 """ Return a list of an instance of every supported extractor.
4407 The order does matter; the first extractor matched is the one handling the URL.
4410 YoutubePlaylistIE(),
4435 StanfordOpenClassroomIE(),
4445 WorldStarHipHopIE(),