2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title: Video title, unescaped.
    ext: Video filename extension.

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    uploader: Full name of the video uploader.
    upload_date: Video upload date (YYYYMMDD).
    uploader_id: Nickname or id of the video uploader.
    location: Physical location of the video.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The subtitle file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
               like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): first parameter is the class, so this is presumably
        # decorated with @classmethod in the full source — confirm.
        return re.match(cls._VALID_URL, url) is not None

        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # Strips the last two characters from the class name, i.e. the
        # trailing "IE" suffix (e.g. YoutubeIE -> "Youtube").
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
            note = u'Downloading video webpage'
        if note is not False:
            # Passing note=False suppresses the status line entirely.
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Wrap all network-level failures in a single ExtractorError,
            # forwarding the original traceback via sys.exc_info()[2].
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Sniff the declared charset out of the Content-Type header,
        # e.g. "text/html; charset=utf-8".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            # Debug aid: dump the full page, base64-encoded, to the screen.
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on malformed byte sequences.
        return webpage_bytes.decode(encoding, 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

                     (?:https?://)? # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                     tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                         )? # optional -> youtube.com/xxxx is OK
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag -> container/extension used for the output filename.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # Maps itag -> human-readable resolution string (used by --list-formats).
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs can also match _VALID_URL; defer them to the
        # playlist extractor first.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        # On success the callers receive a {lang_code: lang_name} dict;
        # on failure an (error_message, None) tuple (see the isinstance
        # checks in _extract_subtitle/_extract_all_subtitles).
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # findall yields (name, lang_code) pairs; index the dict by code.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)

    def _list_available_subtitles(self, video_id):
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        (error_message, sub_lang, sub)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            return (u'Did not fetch video subtitles', None, None)
        # Success: no error message, plus language and subtitle contents.
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            # An explicitly requested language takes priority.
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)

    def _extract_all_subtitles(self, video_id):
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)

    def _print_formats(self, formats):
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal; continue unauthenticated.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force an English-language session so page scraping is stable.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the GALX and dsh hidden inputs from the login page;
        # presumably they must be echoed back in the login POST — confirm.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle unicode values.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age by POSTing the verification form.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Extract the bare video id from any accepted URL form."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.report_error(u'invalid URL: %s' % url)
        # group(2) is the video-id group of _VALID_URL (group 1 wraps the
        # optional scheme/host/path prefix).
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page (has_verified=1 skips the age gate).
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JavaScript string (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several get_video_info variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (nickname scraped from the watch page; optional)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize "/", "," and "-" separators to single spaces.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a urlencoded query string
            # describing one format (itag, url, sig, ...).
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Only consider formats at or below the limit (the list is
                # ordered best-first).
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one information dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age: POST the family-filter opt-out form.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # "yt-XXXX" ids are YouTube videos; hand them to the downloader
            # so the YouTube extractor processes them.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: pull the media URL out of the flashvars parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.report_error(u'unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.report_error(u'unable to extract media URL')
            # Unescape the JSON-style URL (\/ -> /) and append the key.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # The id is the part before any "_<slug>" suffix or query string.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe the available qualities from best to worst.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for the official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # The page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        # group(1) is the ".flv" filename captured from the current= param.
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # The title tag carries both the title and the uploader name.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
# Information extractor for video.yahoo.com (historic youtube-dl IE).
# Flow: normalize a /network/ or non-English URL into a canonical /watch/
# URL (recursing once with new_video=False), scrape title/uploader/
# thumbnail/description/dimensions from the watch page, then fetch the
# playlist XML from cosmos.bcst.yahoo.com to obtain the real media URL.
# NOTE(review): this chunk looks truncated — the 'try:' paired with each
# 'except', the 'if mobj is None:' guard before each report_error call,
# and the trailing 'return [...]' structure appear to be missing, and each
# line carries a stray leading number from a numbered paste. Confirm
# against upstream InfoExtractors.py before changing behavior.
928 class YahooIE(InfoExtractor):
929 """Information extractor for video.yahoo.com."""
932 # _VALID_URL matches all Yahoo! Video URLs
933 # _VPAGE_URL matches only the extractable '/watch/' URLs
934 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
935 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
936 IE_NAME = u'video.yahoo'
938 def __init__(self, downloader=None):
939 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers (write status lines via the FileDownloader).
941 def report_download_webpage(self, video_id):
942 """Report webpage download."""
943 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
945 def report_extraction(self, video_id):
946 """Report information extraction."""
947 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# new_video=False marks the recursive call made after URL rewriting, so
# the rewrite step is attempted at most once.
949 def _real_extract(self, url, new_video=True):
950 # Extract ID from URL
951 mobj = re.match(self._VALID_URL, url)
# NOTE(review): guard ('if mobj is None:') presumably preceded this —
# truncated here.
953 self._downloader.report_error(u'Invalid URL: %s' % url)
956 video_id = mobj.group(2)
957 video_extension = 'flv'
959 # Rewrite valid but non-extractable URLs as
960 # extractable English language /watch/ URLs
961 if re.match(self._VPAGE_URL, url) is None:
962 request = compat_urllib_request.Request(url)
# NOTE(review): the 'try:' for the following 'except' is missing here.
964 webpage = compat_urllib_request.urlopen(request).read()
965 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
966 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
# Pull the canonical id/vid pair out of the page's JS and recurse once.
969 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
971 self._downloader.report_error(u'Unable to extract id field')
973 yahoo_id = mobj.group(1)
975 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
977 self._downloader.report_error(u'Unable to extract vid field')
979 yahoo_vid = mobj.group(1)
981 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
982 return self._real_extract(url, new_video=False)
984 # Retrieve video webpage to extract further information
985 request = compat_urllib_request.Request(url)
987 self.report_download_webpage(video_id)
988 webpage = compat_urllib_request.urlopen(request).read()
989 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
990 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
993 # Extract uploader and title from webpage
994 self.report_extraction(video_id)
995 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
997 self._downloader.report_error(u'unable to extract video title')
999 video_title = mobj.group(1).decode('utf-8')
1001 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1003 self._downloader.report_error(u'unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' alternation, not the
# uploader name in group(2) — looks like a latent bug; confirm upstream.
1005 video_uploader = mobj.group(1).decode('utf-8')
1007 # Extract video thumbnail
1008 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1010 self._downloader.report_error(u'unable to extract video thumbnail')
1012 video_thumbnail = mobj.group(1).decode('utf-8')
1014 # Extract video description
1015 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1017 self._downloader.report_error(u'unable to extract video description')
1019 video_description = mobj.group(1).decode('utf-8')
1020 if not video_description:
1021 video_description = 'No description available.'
1023 # Extract video height and width
1024 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1026 self._downloader.report_error(u'unable to extract video height')
1028 yv_video_height = mobj.group(1)
1030 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1032 self._downloader.report_error(u'unable to extract video width')
1034 yv_video_width = mobj.group(1)
1036 # Retrieve video playlist to extract media URL
1037 # I'm not completely sure what all these options are, but we
1038 # seem to need most of them, otherwise the server sends a 401.
1039 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1040 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1041 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1042 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1043 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1045 self.report_download_webpage(video_id)
1046 webpage = compat_urllib_request.urlopen(request).read()
1047 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1048 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1051 # Extract media URL from playlist XML
1052 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1054 self._downloader.report_error(u'Unable to extract media URL')
# APP + FULLPATH concatenate to the full stream URL; it is URL-quoted and
# HTML-escaped in the XML, hence unquote + unescapeHTML.
1056 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1057 video_url = unescapeHTML(video_url)
# Result dictionary (fields per the InfoExtractor contract documented on
# the base class). NOTE(review): the 'return [{' opener and 'url' entry
# appear to be missing — truncated here.
1060 'id': video_id.decode('utf-8'),
1062 'uploader': video_uploader,
1063 'upload_date': None,
1064 'title': video_title,
1065 'ext': video_extension.decode('utf-8'),
1066 'thumbnail': video_thumbnail.decode('utf-8'),
1067 'description': video_description,
# Information extractor for vimeo.com.
# Flow: normalize the URL (force https, resolve play_redirect_hls direct
# links), download the page, parse the embedded player config JSON, then
# pick a file variant by quality (hd > sd > other) and codec preference
# (h264 > vp8 > vp6) and build the play_redirect URL with the request
# signature/timestamp from the config.
# NOTE(review): chunk looks truncated — 'try:' lines, 'if mobj is None:'
# guards, 'return' statements and the final 'return [{...}]' opener are
# missing, and each line carries a stray leading number from a numbered
# paste. Confirm against upstream before changing behavior.
1071 class VimeoIE(InfoExtractor):
1072 """Information extractor for vimeo.com."""
1074 # _VALID_URL matches Vimeo URLs
1075 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1078 def __init__(self, downloader=None):
1079 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers.
1081 def report_download_webpage(self, video_id):
1082 """Report webpage download."""
1083 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1085 def report_extraction(self, video_id):
1086 """Report information extraction."""
1087 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1089 def _real_extract(self, url, new_video=True):
1090 # Extract ID from URL
1091 mobj = re.match(self._VALID_URL, url)
1093 self._downloader.report_error(u'Invalid URL: %s' % url)
1096 video_id = mobj.group('id')
# Scheme-less URLs get https; direct player links are rewritten to the
# canonical watch page so the config JSON is present.
1097 if not mobj.group('proto'):
1098 url = 'https://' + url
1099 if mobj.group('direct_link'):
1100 url = 'https://vimeo.com/' + video_id
1102 # Retrieve video webpage to extract further information
1103 request = compat_urllib_request.Request(url, None, std_headers)
1105 self.report_download_webpage(video_id)
1106 webpage_bytes = compat_urllib_request.urlopen(request).read()
1107 webpage = webpage_bytes.decode('utf-8')
1108 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1109 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1112 # Now we begin extracting as much information as we can from what we
1113 # retrieved. First we extract the information common to all extractors,
1114 # and latter we extract those that are Vimeo specific.
1115 self.report_extraction(video_id)
1117 # Extract the config JSON
# Brittle string-split parse of the inline player JS: grabs the object
# between ' = {config:' and ',assets:'. NOTE(review): the 'try:' for this
# and the success-path continuation are missing — truncated here.
1119 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1120 config = json.loads(config)
1122 self._downloader.report_error(u'unable to extract info section')
1126 video_title = config["video"]["title"]
1128 # Extract uploader and uploader_id
# uploader_id is the last path segment of the owner's profile URL.
1129 video_uploader = config["video"]["owner"]["name"]
1130 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1132 # Extract video thumbnail
1133 video_thumbnail = config["video"]["thumbnail"]
1135 # Extract video description
1136 video_description = get_element_by_attribute("itemprop", "description", webpage)
1137 if video_description: video_description = clean_html(video_description)
1138 else: video_description = u''
1140 # Extract upload date
# Converted from ISO 'YYYY-MM-DDT...' to the YYYYMMDD form the
# InfoExtractor contract expects.
1141 video_upload_date = None
1142 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1143 if mobj is not None:
1144 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1146 # Vimeo specific: extract request signature and timestamp
1147 sig = config['request']['signature']
1148 timestamp = config['request']['timestamp']
1150 # Vimeo specific: extract video codec and quality information
1151 # First consider quality, then codecs, then take everything
1152 # TODO bind to format param
1153 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1154 files = { 'hd': [], 'sd': [], 'other': []}
1155 for codec_name, codec_extension in codecs:
1156 if codec_name in config["video"]["files"]:
1157 if 'hd' in config["video"]["files"][codec_name]:
1158 files['hd'].append((codec_name, codec_extension, 'hd'))
1159 elif 'sd' in config["video"]["files"][codec_name]:
1160 files['sd'].append((codec_name, codec_extension, 'sd'))
# NOTE(review): the 'else:' introducing this fallback branch is missing —
# truncated here.
1162 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# First non-empty quality bucket wins; codecs list order breaks ties.
1164 for quality in ('hd', 'sd', 'other'):
1165 if len(files[quality]) > 0:
1166 video_quality = files[quality][0][2]
1167 video_codec = files[quality][0][0]
1168 video_extension = files[quality][0][1]
1169 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
# NOTE(review): the 'break' / 'else:' around this error path is missing —
# truncated here.
1172 self._downloader.report_error(u'no known codec found')
1175 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1176 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dictionary entries (the 'return [{' opener, 'id' and 'url'
# entries appear to be missing — truncated here).
1181 'uploader': video_uploader,
1182 'uploader_id': video_uploader_id,
1183 'upload_date': video_upload_date,
1184 'title': video_title,
1185 'ext': video_extension,
1186 'thumbnail': video_thumbnail,
1187 'description': video_description,
# arte.tv information extractor.
# Two extraction paths chosen in _real_extract: live streams (URL tail
# matches _LIVE_URL) via extractLiveStream, everything else via
# extractPlus7Stream, which follows a chain of XML references
# (videoref -> language <video> ref -> HD <url>) using the generic
# grep_webpage helper.
# NOTE(review): chunk looks truncated — 'try:'/'if ... is None:' lines,
# some 'return info' statements and closing parentheses of grep_webpage
# calls are missing, and each line carries a stray leading number from a
# numbered paste. Confirm against upstream before changing behavior.
1191 class ArteTvIE(InfoExtractor):
1192 """arte.tv information extractor."""
1194 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1195 _LIVE_URL = r'index-[0-9]+\.html$'
1197 IE_NAME = u'arte.tv'
1199 def __init__(self, downloader=None):
1200 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers.
1202 def report_download_webpage(self, video_id):
1203 """Report webpage download."""
1204 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1206 def report_extraction(self, video_id):
1207 """Report information extraction."""
1208 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Download a URL and return the page body; reports (rather than raises)
# on network or URL errors.
1210 def fetch_webpage(self, url):
1211 request = compat_urllib_request.Request(url)
1213 self.report_download_webpage(url)
1214 webpage = compat_urllib_request.urlopen(request).read()
1215 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1216 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1218 except ValueError as err:
1219 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch url, apply regex, and map match groups to a dict:
# matchTuples is a list of (group_index, dict_key, error_message).
1223 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1224 page = self.fetch_webpage(url)
1225 mobj = re.search(regex, page, regexFlags)
1229 self._downloader.report_error(u'Invalid URL: %s' % url)
# NOTE(review): 'info = {}' initializer appears to be missing — truncated.
1232 for (i, key, err) in matchTuples:
1233 if mobj.group(i) is None:
# trouble() is the older error-reporting entry point; inconsistent with
# the report_error calls used elsewhere in this class.
1234 self._downloader.trouble(err)
1237 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then the geo-specific
# rtmp path / SWF player / base URL triple for the viewer's language.
1241 def extractLiveStream(self, url):
1242 video_lang = url.split('/')[-4]
1243 info = self.grep_webpage(
1245 r'src="(.*?/videothek_js.*?\.js)',
1248 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1251 http_host = url.split('/')[2]
1252 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1253 info = self.grep_webpage(
1255 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1256 '(http://.*?\.swf).*?' +
1260 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1261 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1262 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1265 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Plus7 (catch-up) path: videoref XML -> per-language <video> ref ->
# final metadata (id/name/date + HD url).
1267 def extractPlus7Stream(self, url):
1268 video_lang = url.split('/')[-3]
1269 info = self.grep_webpage(
1271 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1274 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1277 next_url = compat_urllib_parse.unquote(info.get('url'))
1278 info = self.grep_webpage(
1280 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1283 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1286 next_url = compat_urllib_parse.unquote(info.get('url'))
1288 info = self.grep_webpage(
1290 r'<video id="(.*?)".*?>.*?' +
1291 '<name>(.*?)</name>.*?' +
1292 '<dateVideo>(.*?)</dateVideo>.*?' +
1293 '<url quality="hd">(.*?)</url>',
1296 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1297 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1298 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1299 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Result dictionary entries (the 'return {' opener appears to be
# missing — truncated here).
1304 'id': info.get('id'),
1305 'url': compat_urllib_parse.unquote(info.get('url')),
1306 'uploader': u'arte.tv',
1307 'upload_date': info.get('date'),
1308 'title': info.get('title').decode('utf-8'),
# Dispatch on URL shape: live index pages vs. regular Plus7 videos.
1314 def _real_extract(self, url):
1315 video_id = url.split('/')[-1]
1316 self.report_extraction(video_id)
1318 if re.search(self._LIVE_URL, video_id) is not None:
# NOTE(review): live result is discarded here (no return) — presumably
# live extraction was unfinished upstream; confirm.
1319 self.extractLiveStream(url)
1322 info = self.extractPlus7Stream(url)
# Generic last-resort information extractor.
# First resolves URL-shortener style redirects via a HEAD request
# (_test_redirect) and restarts the extractor chain on the final URL;
# otherwise scrapes the page for a JW-Player-style file=/source= video
# URL, using the <title> tag as the title and the domain as uploader.
# NOTE(review): chunk looks truncated — 'try:' lines, 'if mobj is None:'
# guards, 'return' statements and the final result-dict opener are
# missing, and each line carries a stray leading number from a numbered
# paste. Confirm against upstream before changing behavior.
1327 class GenericIE(InfoExtractor):
1328 """Generic last-resort information extractor."""
1331 IE_NAME = u'generic'
1333 def __init__(self, downloader=None):
1334 InfoExtractor.__init__(self, downloader)
1336 def report_download_webpage(self, video_id):
1337 """Report webpage download."""
# The fallback warning is suppressed in test mode to keep test output
# clean.
1338 if not self._downloader.params.get('test', False):
1339 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1340 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1342 def report_extraction(self, video_id):
1343 """Report information extraction."""
1344 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1346 def report_following_redirect(self, new_url):
1347 """Report information extraction."""
1348 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1350 def _test_redirect(self, url):
1351 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass that forces the HEAD method so only headers are
# fetched while chasing redirects.
1352 class HeadRequest(compat_urllib_request.Request):
1353 def get_method(self):
# NOTE(review): the "return 'HEAD'" body appears to be missing —
# truncated here.
1356 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1358 Subclass the HTTPRedirectHandler to make it use our
1359 HeadRequest also on the redirected URL
1361 def redirect_request(self, req, fp, code, msg, headers, newurl):
1362 if code in (301, 302, 303, 307):
1363 newurl = newurl.replace(' ', '%20')
# Content-Length/Content-Type belong to the original request body and
# must not be forwarded on the redirected HEAD request.
1364 newheaders = dict((k,v) for k,v in req.headers.items()
1365 if k.lower() not in ("content-length", "content-type"))
1366 return HeadRequest(newurl,
1368 origin_req_host=req.get_origin_req_host(),
# NOTE(review): 'headers=newheaders', 'unverifiable=True' and the 'else:'
# before the raise appear to be missing — truncated here.
1371 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1373 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1375 Fallback to GET if HEAD is not allowed (405 HTTP error)
1377 def http_error_405(self, req, fp, code, msg, headers):
# NOTE(review): fp.read()/fp.close() cleanup lines appear to be missing —
# truncated here.
1381 newheaders = dict((k,v) for k,v in req.headers.items()
1382 if k.lower() not in ("content-length", "content-type"))
1383 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1385 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with just the handlers needed for the HEAD probe
# (no cookies, no auth) so redirect behavior is fully controlled.
1389 opener = compat_urllib_request.OpenerDirector()
1390 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1391 HTTPMethodFallback, HEADRedirectHandler,
1392 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1393 opener.add_handler(handler())
1395 response = opener.open(HeadRequest(url))
1396 new_url = response.geturl()
# NOTE(review): the 'if url == new_url: return False' short-circuit
# appears to be missing — truncated here.
1401 self.report_following_redirect(new_url)
# Restart the whole extractor chain on the resolved URL.
1402 self._downloader.download([new_url])
1405 def _real_extract(self, url):
1406 if self._test_redirect(url): return
1408 video_id = url.split('/')[-1]
# NOTE(review): the 'try:' for the following 'except' is missing.
1410 webpage = self._download_webpage(url, video_id)
1411 except ValueError as err:
1412 # since this is the last-resort InfoExtractor, if
1413 # this error is thrown, it'll be thrown here
1414 self._downloader.report_error(u'Invalid URL: %s' % url)
1417 self.report_extraction(video_id)
1418 # Start with something easy: JW Player in SWFObject
# Three progressively broader regexes; each fallback presumably ran under
# an 'if mobj is None:' guard (truncated here).
1419 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1421 # Broaden the search a little bit
1422 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1424 # Broaden the search a little bit: JWPlayer JS loader
1425 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1427 self._downloader.report_error(u'Invalid URL: %s' % url)
1430 # It's possible that one of the regexes
1431 # matched, but returned an empty group:
1432 if mobj.group(1) is None:
1433 self._downloader.report_error(u'Invalid URL: %s' % url)
1436 video_url = compat_urllib_parse.unquote(mobj.group(1))
1437 video_id = os.path.basename(video_url)
1439 # here's a fun little line of code for you:
1440 video_extension = os.path.splitext(video_id)[1][1:]
1441 video_id = os.path.splitext(video_id)[0]
1443 # it's tempting to parse this further, but you would
1444 # have to take into account all the variations like
1445 # Video Title - Site Name
1446 # Site Name | Video Title
1447 # Video Title - Tagline | Site Name
1448 # and so on and so forth; it's just not practical
1449 mobj = re.search(r'<title>(.*)</title>', webpage)
1451 self._downloader.report_error(u'unable to extract title')
1453 video_title = mobj.group(1)
1455 # video uploader is domain name
1456 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says 'title' on the uploader path —
# looks like a copy/paste slip upstream.
1458 self._downloader.report_error(u'unable to extract title')
1460 video_uploader = mobj.group(1)
# Result dictionary entries (the 'return [{' opener and id/url entries
# appear to be missing — truncated here).
1465 'uploader': video_uploader,
1466 'upload_date': None,
1467 'title': video_title,
1468 'ext': video_extension,
# Information Extractor for 'ytsearchN:query' pseudo-URLs.
# Parses the prefix (empty -> 1 result, 'all' -> _max_youtube_results,
# integer -> that many, capped) and pages through the GData JSON API
# 50 results at a time, handing each video URL back to the downloader.
# NOTE(review): chunk looks truncated — 'try:' lines, 'if mobj is None:'
# guards, 'return' statements and loop-state initializers
# (video_ids/pagenum/limit) are missing, and each line carries a stray
# leading number from a numbered paste. Confirm against upstream.
1472 class YoutubeSearchIE(InfoExtractor):
1473 """Information Extractor for YouTube search queries."""
1474 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1475 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1476 _max_youtube_results = 1000
1477 IE_NAME = u'youtube:search'
1479 def __init__(self, downloader=None):
1480 InfoExtractor.__init__(self, downloader)
1482 def report_download_page(self, query, pagenum):
1483 """Report attempt to download search page with given number."""
# query arrives as bytes (encoded below); decode for display.
1484 query = query.decode(preferredencoding())
1485 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# The "url" for a search IE is the whole 'ytsearchN:terms' string.
1487 def _real_extract(self, query):
1488 mobj = re.match(self._VALID_URL, query)
1490 self._downloader.report_error(u'invalid search query "%s"' % query)
1493 prefix, query = query.split(':')
1495 query = query.encode('utf-8')
# Prefix dispatch: '' -> single result, 'all' -> max, digits -> that many
# (the 'if prefix == "":' and 'n = int(prefix)' lines are truncated here).
1497 self._download_n_results(query, 1)
1499 elif prefix == 'all':
1500 self._download_n_results(query, self._max_youtube_results)
1506 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1508 elif n > self._max_youtube_results:
1509 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1510 n = self._max_youtube_results
1511 self._download_n_results(query, n)
1513 except ValueError: # parsing prefix as integer fails
1514 self._download_n_results(query, 1)
1517 def _download_n_results(self, query, n):
1518 """Downloads a specified number of results for a query"""
# Page loop: GData caps pages at 50 items; 'limit' shrinks to the API's
# totalItems so we stop when results run out.
1524 while (50 * pagenum) < limit:
1525 self.report_download_page(query, pagenum+1)
1526 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1527 request = compat_urllib_request.Request(result_url)
1529 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1530 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1531 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1533 api_response = json.loads(data)['data']
1535 if not 'items' in api_response:
1536 self._downloader.trouble(u'[youtube] No video results')
1539 new_ids = list(video['id'] for video in api_response['items'])
1540 video_ids += new_ids
1542 limit = min(n, api_response['totalItems'])
1545 if len(video_ids) > n:
1546 video_ids = video_ids[:n]
1547 for id in video_ids:
1548 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Information Extractor for 'gvsearchN:query' pseudo-URLs (Google Video).
# Same prefix parsing as YoutubeSearchIE, but scrapes the HTML result
# pages (10 per page) with _VIDEO_INDICATOR and stops when
# _MORE_PAGES_INDICATOR no longer appears.
# NOTE(review): chunk looks truncated — 'try:' lines, 'if mobj is None:'
# guards, 'return' statements and loop-state initializers are missing,
# and each line carries a stray leading number from a numbered paste.
1552 class GoogleSearchIE(InfoExtractor):
1553 """Information Extractor for Google Video search queries."""
1554 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1555 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1556 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1557 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1558 _max_google_results = 1000
1559 IE_NAME = u'video.google:search'
1561 def __init__(self, downloader=None):
1562 InfoExtractor.__init__(self, downloader)
1564 def report_download_page(self, query, pagenum):
1565 """Report attempt to download playlist page with given number."""
1566 query = query.decode(preferredencoding())
1567 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1569 def _real_extract(self, query):
1570 mobj = re.match(self._VALID_URL, query)
1572 self._downloader.report_error(u'invalid search query "%s"' % query)
1575 prefix, query = query.split(':')
1577 query = query.encode('utf-8')
# Prefix dispatch mirrors YoutubeSearchIE ('if prefix == "":' and
# 'n = int(prefix)' lines are truncated here).
1579 self._download_n_results(query, 1)
1581 elif prefix == 'all':
1582 self._download_n_results(query, self._max_google_results)
1588 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1590 elif n > self._max_google_results:
1591 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1592 n = self._max_google_results
1593 self._download_n_results(query, n)
1595 except ValueError: # parsing prefix as integer fails
1596 self._download_n_results(query, 1)
1599 def _download_n_results(self, query, n):
1600 """Downloads a specified number of results for a query"""
# Page loop ('video_ids = []', 'pagenum = 0', 'while True:' initializers
# are truncated here); pages are offset by pagenum*10.
1606 self.report_download_page(query, pagenum)
1607 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1608 request = compat_urllib_request.Request(result_url)
1610 page = compat_urllib_request.urlopen(request).read()
1611 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1612 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1615 # Extract video identifiers
1616 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1617 video_id = mobj.group(1)
1618 if video_id not in video_ids:
1619 video_ids.append(video_id)
1620 if len(video_ids) == n:
1621 # Specified n videos reached
1622 for id in video_ids:
1623 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" link means the last page: flush everything collected so far.
1626 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1627 for id in video_ids:
1628 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1631 pagenum = pagenum + 1
# Information Extractor for 'yvsearchN:query' pseudo-URLs (Yahoo! Video).
# Structurally parallel to GoogleSearchIE: prefix parsing, HTML page
# scraping via _VIDEO_INDICATOR, pagination until _MORE_PAGES_INDICATOR
# disappears; additionally de-duplicates ids with an 'already_seen' set.
# NOTE(review): chunk looks truncated — 'try:' lines, 'if mobj is None:'
# guards, 'return' statements and loop-state initializers are missing,
# and each line carries a stray leading number from a numbered paste.
1634 class YahooSearchIE(InfoExtractor):
1635 """Information Extractor for Yahoo! Video search queries."""
1638 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1639 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1640 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1641 _MORE_PAGES_INDICATOR = r'\s*Next'
1642 _max_yahoo_results = 1000
1643 IE_NAME = u'video.yahoo:search'
1645 def __init__(self, downloader=None):
1646 InfoExtractor.__init__(self, downloader)
1648 def report_download_page(self, query, pagenum):
1649 """Report attempt to download playlist page with given number."""
1650 query = query.decode(preferredencoding())
1651 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1653 def _real_extract(self, query):
1654 mobj = re.match(self._VALID_URL, query)
1656 self._downloader.report_error(u'invalid search query "%s"' % query)
1659 prefix, query = query.split(':')
1661 query = query.encode('utf-8')
# Prefix dispatch mirrors the other search IEs ('if prefix == "":' and
# 'n = int(prefix)' lines are truncated here).
1663 self._download_n_results(query, 1)
1665 elif prefix == 'all':
1666 self._download_n_results(query, self._max_yahoo_results)
1672 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1674 elif n > self._max_yahoo_results:
1675 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1676 n = self._max_yahoo_results
1677 self._download_n_results(query, n)
1679 except ValueError: # parsing prefix as integer fails
1680 self._download_n_results(query, 1)
1683 def _download_n_results(self, query, n):
1684 """Downloads a specified number of results for a query"""
# already_seen prevents duplicate ids across pages; other loop-state
# initializers ('video_ids = []', 'pagenum = 1') are truncated here.
1687 already_seen = set()
1691 self.report_download_page(query, pagenum)
1692 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1693 request = compat_urllib_request.Request(result_url)
1695 page = compat_urllib_request.urlopen(request).read()
1696 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1697 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1700 # Extract video identifiers
1701 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1702 video_id = mobj.group(1)
1703 if video_id not in already_seen:
1704 video_ids.append(video_id)
1705 already_seen.add(video_id)
1706 if len(video_ids) == n:
1707 # Specified n videos reached
1708 for id in video_ids:
1709 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "next" link means the last page: flush everything collected so far.
1712 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1713 for id in video_ids:
1714 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1717 pagenum = pagenum + 1
# Information Extractor for YouTube playlists (also courses, artist pages
# and uploads lists — see the verbose _VALID_URL alternation).
# Pages through the GData JSON playlist feed collecting
# (position, video_url) pairs, sorts by position, applies the
# playliststart/playlistend window, then queues each video URL.
# NOTE(review): chunk looks truncated — 'try:' lines, 'if mobj is None:'
# guards, '_MAX_RESULTS' constant, loop headers and 'break'/'return'
# statements are missing, and each line carries a stray leading number
# from a numbered paste. Confirm against upstream before changing.
1720 class YoutubePlaylistIE(InfoExtractor):
1721 """Information Extractor for YouTube playlists."""
# Verbose (re.VERBOSE) pattern; group(1)/group(2) capture the playlist id
# (PL/EC/UU-prefixed or bare) from the two URL shapes.
1723 _VALID_URL = r"""(?:
1728 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1729 \? (?:.*?&)*? (?:p|a|list)=
1732 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1735 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1737 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1739 IE_NAME = u'youtube:playlist'
1741 def __init__(self, downloader=None):
1742 InfoExtractor.__init__(self, downloader)
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
1745 def suitable(cls, url):
1746 """Receives a URL and returns True if suitable for this IE."""
1747 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1749 def report_download_page(self, playlist_id, pagenum):
1750 """Report attempt to download playlist page with given number."""
1751 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1753 def _real_extract(self, url):
1754 # Extract playlist id
1755 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1757 self._downloader.report_error(u'invalid url: %s' % url)
1760 # Download playlist videos from API
1761 playlist_id = mobj.group(1) or mobj.group(2)
# Page loop ('videos = []', 'page_num = 1', 'while True:' initializers
# are truncated here); start-index is 1-based in the GData API.
1766 self.report_download_page(playlist_id, page_num)
1768 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1770 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1771 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1772 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1776 response = json.loads(page)
1777 except ValueError as err:
1778 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1781 if 'feed' not in response:
1782 self._downloader.report_error(u'Got a malformed response from YouTube API')
1784 if 'entry' not in response['feed']:
1785 # Number of videos is a multiple of self._MAX_RESULTS
# Entries without 'content' (deleted/private videos) are skipped.
1788 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1789 for entry in response['feed']['entry']
1790 if 'content' in entry ]
# A short page means this was the last one ('break' truncated here).
1792 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1796 videos = [v[1] for v in sorted(videos)]
# Apply the user's --playlist-start/--playlist-end window (1-based
# options converted to 0-based slicing; -1 end means "to the end").
1799 playliststart = self._downloader.params.get('playliststart', 1) - 1
1800 playlistend = self._downloader.params.get('playlistend', -1)
1801 if playlistend == -1:
1802 videos = videos[playliststart:]
1804 videos = videos[playliststart:playlistend]
1806 if len(videos) == total:
1807 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1809 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1811 for video in videos:
1812 self._downloader.download([video])
# Information Extractor for YouTube channel pages (/channel/<id>).
# Scrapes the paginated channel video list HTML for watch?v= ids until
# the "Next »" indicator disappears, then queues each watch URL.
# NOTE(review): chunk looks truncated — 'try:' lines, the
# 'if mobj is None:' guard, loop initializers and the 'break' are
# missing, and each line carries a stray leading number from a numbered
# paste. Confirm against upstream before changing behavior.
1816 class YoutubeChannelIE(InfoExtractor):
1817 """Information Extractor for YouTube channels."""
1819 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1820 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# The literal "Next »" link text marks that more pages exist.
1821 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1822 IE_NAME = u'youtube:channel'
1824 def report_download_page(self, channel_id, pagenum):
1825 """Report attempt to download channel page with given number."""
1826 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1828 def _real_extract(self, url):
1829 # Extract channel id
1830 mobj = re.match(self._VALID_URL, url)
1832 self._downloader.report_error(u'invalid url: %s' % url)
1835 # Download channel pages
1836 channel_id = mobj.group(1)
# Page loop ('video_ids = []', 'pagenum = 1', 'while True:' initializers
# are truncated here).
1841 self.report_download_page(channel_id, pagenum)
1842 url = self._TEMPLATE_URL % (channel_id, pagenum)
1843 request = compat_urllib_request.Request(url)
1845 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1846 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1847 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1850 # Extract video identifiers
# ids_in_page de-duplicates within a page ('ids_in_page = []'
# initializer truncated here).
1852 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1853 if mobj.group(1) not in ids_in_page:
1854 ids_in_page.append(mobj.group(1))
1855 video_ids.extend(ids_in_page)
1857 if self._MORE_PAGES_INDICATOR not in page:
1859 pagenum = pagenum + 1
1861 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1863 for id in video_ids:
1864 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Information Extractor for YouTube user upload pages
# (youtube.com/user/<name> or the 'ytuser:<name>' shorthand).
# Pages through the GData uploads feed _GDATA_PAGE_SIZE ids at a time,
# stopping early when a page comes back short, then applies the
# playliststart/playlistend window and queues each watch URL.
# NOTE(review): chunk looks truncated — 'try:' lines, the
# 'if mobj is None:' guard, loop initializers and 'break'/'else' lines
# are missing, and each line carries a stray leading number from a
# numbered paste. Confirm against upstream before changing behavior.
1868 class YoutubeUserIE(InfoExtractor):
1869 """Information Extractor for YouTube users."""
1871 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1872 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1873 _GDATA_PAGE_SIZE = 50
1874 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1875 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1876 IE_NAME = u'youtube:user'
1878 def __init__(self, downloader=None):
1879 InfoExtractor.__init__(self, downloader)
1881 def report_download_page(self, username, start_index):
1882 """Report attempt to download user page."""
1883 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1884 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1886 def _real_extract(self, url):
1888 mobj = re.match(self._VALID_URL, url)
1890 self._downloader.report_error(u'invalid url: %s' % url)
1893 username = mobj.group(1)
1895 # Download video ids using YouTube Data API. Result size per
1896 # query is limited (currently to 50 videos) so we need to query
1897 # page by page until there are no video ids - it means we got
# ('all of them.' continuation plus 'video_ids = []'/'pagenum = 0'/
# 'while True:' initializers are truncated here.)
# GData start-index is 1-based.
1904 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1905 self.report_download_page(username, start_index)
1907 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1910 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1911 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1912 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1915 # Extract video identifiers
# ids_in_page de-duplicates within a page ('ids_in_page = []'
# initializer truncated here).
1918 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1919 if mobj.group(1) not in ids_in_page:
1920 ids_in_page.append(mobj.group(1))
1922 video_ids.extend(ids_in_page)
1924 # A little optimization - if current page is not
1925 # "full", ie. does not contain PAGE_SIZE video ids then
1926 # we can assume that this page is the last one - there
1927 # are no more ids on further pages - no need to query
# ('again.' continuation and the 'break'/'pagenum += 1' lines are
# truncated here.)
1930 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window (1-based
# options converted to 0-based slicing; -1 end means "to the end").
1935 all_ids_count = len(video_ids)
1936 playliststart = self._downloader.params.get('playliststart', 1) - 1
1937 playlistend = self._downloader.params.get('playlistend', -1)
1939 if playlistend == -1:
1940 video_ids = video_ids[playliststart:]
1942 video_ids = video_ids[playliststart:playlistend]
1944 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1945 (username, all_ids_count, len(video_ids)))
1947 for video_id in video_ids:
1948 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # Matches a blip.tv user page URL or the internal "bliptvuser:NAME"
    # shorthand; group(1) is the user name.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # NOTE(review): several control-flow lines (try:/if/return/loop
        # headers) are elided from this excerpt; orphan statements below
        # reflect only the visible source.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # Mobile endpoint returning a user's full episode list.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # The numeric user id is embedded in the profile page markup.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        self.report_download_page(username, pagenum)
        url = page_base + "&page=" + str(pagenum)
        request = compat_urllib_request.Request( url )
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:

        all_ids_count = len(video_ids)
        # playliststart is 1-based in the options; convert to 0-based index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        # Hand each collected video back to the downloader one at a time.
        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # (?#locale) is a regex comment; the "../" before it matches the
    # two-letter locale path segment (e.g. /en/).
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        # NOTE(review): some try:/else:/return lines are elided from this
        # excerpt; orphan statements below reflect only the visible source.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction message to the user.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            self._downloader.report_error(u'unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # splitext gives '.ext'; [1:] drops the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.report_error(u'unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the enclosing "return [{...}]" lines are not
        # visible in this excerpt; these are the info-dict entries.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in the user's .netrc for stored credentials.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Optionally log in to Facebook before extraction starts.

        NOTE(review): several lines (return/else:/try:/login_form build)
        are elided from this excerpt; orphan statements below reflect
        only the visible source.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymous.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means auth failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Canonicalize to the video.php form before fetching.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON between these two
        # javascript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; the SD fallback is on the next line.
        video_url = video_data.get('hd_src')
        video_url = video_data['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # Entries of the returned info dict (enclosing literal elided).
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        # NOTE(review): several lines (try:/return/dict literals) are
        # elided from this excerpt; orphans reflect the visible source.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose URL fragment carries the
        # real file id; recurse with the canonical /a/a-<id> URL instead.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Request the JSON representation of the page (cchar is the
        # query separator chosen above, elided here); blip.tv serves
        # this endpoint to the iTunes user agent.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # The URL already points at the media file: derive title and
            # extension from the basename instead of parsing JSON.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']

            # Parse blip.tv's "MM-DD-YY HH:MM(AM|PM)" datestamp into
            # YYYYMMDD. NOTE(review): %H with %p looks suspect (%I is
            # the 12-hour code) — confirm against live datestamps.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # Entries of the returned info dict (enclosing literal elided).
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        # NOTE(review): some lines (if mobj is None:/return) are elided
        # from this excerpt.
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): typo — `self._download` should be
        # `self._downloader`; this line raises AttributeError if reached.
        self._download.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media host/path; the flv URL is
        # derived from it below. (Continuation argument line elided.)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
        self._downloader.report_error(u'unable to extract media URL')
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        self._downloader.report_error(u'unable to extract title')

        video_title = mobj.group(1)

        # Entries of the returned info dict (enclosing literal elided).
        'upload_date': None,
        'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing triple-quote of this verbose regex (and
    # the bodies of the two dict literals below) are elided from this
    # excerpt.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Bitrates the site offers, lowest preference first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose regex; the base
        # class's suitable() does not pass re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # List every known format with its extension and dimensions.
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        # NOTE(review): many control-flow lines (if/else:/try:/return)
        # are elided from this excerpt; orphans reflect the visible source.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # Expand :tds / :colbert style shortcuts to the newest-episode URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode title from whichever URL form matched.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            epTitle = mobj.group('cntitle')
        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
        # The "newest episode" page redirects; re-match against the
        # final URL to get a concrete episode.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))

        # The MRSS index lists one <item> per episode part.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            # Collect (bitrate, rtmp url) pairs from the config XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, rtmp_video_url = f, v

            # The rtmp(e) stream is mirrored over plain HTTP; rewrite the
            # URL into the direct mp4 equivalent.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # Entries of the per-part info dict (enclosing literal elided).
            'upload_date': officialDate,
            'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        # NOTE(review): some try:/return lines are elided from this
        # excerpt; orphan `except` clauses reflect the visible source.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Honour the charset declared in the Content-Type header,
        # defaulting to UTF-8 when none is declared.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))

        # Metadata lives in <meta> tags; the og:video player URL carries
        # the urlencoded config URL in its query string.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        # playlist[1] holds the actual episode entry.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        # Entries of the returned info dict (enclosing literal elided).
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): some try:/return/dict-literal lines are elided
        # from this excerpt; orphans reflect the visible source.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Entry of the `info` dict (enclosing literal elided).
        'upload_date': None,

        self.report_extraction(video_id)
        # Per-video metadata endpoint (XML).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.report_error(u'Invalid metadata XML file')

        # Query arg required for the Adobe HDS (f4m) manifest.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # Manifest elements live in the Adobe f4m XML namespace.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        # Build the direct segment URL from the manifest host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): some `if mobj is None:`/return/dict-literal lines
        # are elided from this excerpt; orphans reflect the visible source.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (urlencoded in the flv_url flash variable).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the <title> tag, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.report_error(u'unable to extract video thumbnail')
        video_thumbnail = mobj.group(0)

        # Entries of the returned info dict (enclosing literal elided).
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): some try:/return lines are elided from this
        # excerpt; orphan `except` clauses reflect the visible source.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's API metadata via the public
        # resolve endpoint (client_id is an application API key).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track stream list; http_mp3_128_url is the
        # direct 128kbps mp3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # Entries of the returned info dict (enclosing literal elided).
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # Same as SoundcloudIE but with a /sets/ path segment.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): duplicate IE_NAME with SoundcloudIE (u'soundcloud') —
    # probably intended to be u'soundcloud:set'; confirm before changing.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): some try:/return lines are elided from this
        # excerpt. Also this class still calls the legacy
        # self._downloader.trouble(...) while the sibling SoundcloudIE
        # uses report_error — worth unifying.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the set page URL to its API metadata (client_id is an
        # application API key).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        # The API reports per-request errors inline in the JSON payload.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))

        # One result per track in the set.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # Entries of the per-track info dict (enclosing literal elided).
            'uploader': track['user']['username'],
            'upload_date': track['created_at'],
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): some `if mobj is None:`/return/dict-literal lines
        # are elided from this excerpt; orphans reflect the visible source.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real media id is base64-encoded in the
        # jsclassref attribute and appended to the RTMP base URL.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.report_error(u'unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title from the inline javascript variable.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # Entries of the returned info dict (enclosing literal elided).
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
# NOTE(review): MixcloudIE — legacy extractor for www.mixcloud.com, marked
# broken via _WORKING = False. Chunk is elided (per-line numbers jump:
# try/except headers, returns and dict braces are missing); code kept verbatim.
2962 class MixcloudIE(InfoExtractor):
2963 """Information extractor for www.mixcloud.com"""
2965 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2966 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2967 IE_NAME = u'mixcloud'
2969 def __init__(self, downloader=None):
2970 InfoExtractor.__init__(self, downloader)
2972 def report_download_json(self, file_id):
2973 """Report JSON download."""
2974 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2976 def report_extraction(self, file_id):
2977 """Report information extraction."""
2978 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2980 def get_urls(self, jsonData, fmt, bitrate='best'):
2981 """Get urls from 'audio_formats' section in json"""
2984 bitrate_list = jsonData[fmt]
2985 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2986 bitrate = max(bitrate_list) # select highest
2988 url_list = jsonData[fmt][bitrate]
2989 except TypeError: # we have no bitrate info.
2990 url_list = jsonData[fmt]
2993 def check_urls(self, url_list):
2994 """Returns 1st active url from list"""
2995 for url in url_list:
2997 compat_urllib_request.urlopen(url)
2999 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3004 def _print_formats(self, formats):
3005 print('Available formats:')
3006 for fmt in formats.keys():
3007 for b in formats[fmt]:
3009 ext = formats[fmt][b][0]
3010 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3011 except TypeError: # we have no bitrate info
3012 ext = formats[fmt][0]
3013 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3016 def _real_extract(self, url):
3017 mobj = re.match(self._VALID_URL, url)
3019 self._downloader.report_error(u'invalid URL: %s' % url)
3021 # extract uploader & filename from url
# NOTE(review): `.decode('utf-8')` on re match groups (here and in the info
# dict below) is a Python 2 leftover — under Python 3 groups are already str
# and have no .decode; confirm before enabling this IE (_WORKING is False).
3022 uploader = mobj.group(1).decode('utf-8')
3023 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3025 # construct API request
3026 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3027 # retrieve .json file with links to files
3028 request = compat_urllib_request.Request(file_url)
3030 self.report_download_json(file_url)
3031 jsonData = compat_urllib_request.urlopen(request).read()
3032 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3033 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3037 json_data = json.loads(jsonData)
3038 player_url = json_data['player_swf_url']
3039 formats = dict(json_data['audio_formats'])
3041 req_format = self._downloader.params.get('format', None)
3044 if self._downloader.params.get('listformats', None):
3045 self._print_formats(formats)
# Default/best: probe each advertised format until an active URL is found.
3048 if req_format is None or req_format == 'best':
3049 for format_param in formats.keys():
3050 url_list = self.get_urls(formats, format_param)
3052 file_url = self.check_urls(url_list)
3053 if file_url is not None:
3056 if req_format not in formats:
3057 self._downloader.report_error(u'format is not available')
3060 url_list = self.get_urls(formats, req_format)
3061 file_url = self.check_urls(url_list)
3062 format_param = req_format
3065 'id': file_id.decode('utf-8'),
3066 'url': file_url.decode('utf-8'),
3067 'uploader': uploader.decode('utf-8'),
3068 'upload_date': None,
3069 'title': json_data['name'],
3070 'ext': file_url.split('.')[-1].decode('utf-8'),
3071 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3072 'thumbnail': json_data['thumbnail_url'],
3073 'description': json_data['description'],
3074 'player_url': player_url.decode('utf-8'),
# NOTE(review): StanfordOpenClassroomIE — handles three URL shapes: a specific
# video (course+video), a course page (course only), and the site root; the
# latter two recurse via self.extract() over discovered reference links.
# Chunk is elided (per-line numbers jump); code kept byte-identical.
3077 class StanfordOpenClassroomIE(InfoExtractor):
3078 """Information extractor for Stanford's Open ClassRoom"""
3080 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3081 IE_NAME = u'stanfordoc'
3083 def report_download_webpage(self, objid):
3084 """Report webpage download."""
3085 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3087 def report_extraction(self, video_id):
3088 """Report information extraction."""
3089 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3091 def _real_extract(self, url):
3092 mobj = re.match(self._VALID_URL, url)
3094 raise ExtractorError(u'Invalid URL: %s' % url)
3096 if mobj.group('course') and mobj.group('video'): # A specific video
3097 course = mobj.group('course')
3098 video = mobj.group('video')
3100 'id': course + '_' + video,
3102 'upload_date': None,
3105 self.report_extraction(info['id'])
# Video metadata lives in a per-video XML file alongside the media.
3106 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3107 xmlUrl = baseUrl + video + '.xml'
3109 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3110 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3111 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3113 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3115 info['title'] = mdoc.findall('./title')[0].text
3116 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3118 self._downloader.report_error(u'Invalid metadata XML file')
3120 info['ext'] = info['url'].rpartition('.')[2]
3122 elif mobj.group('course'): # A course page
3123 course = mobj.group('course')
3128 'upload_date': None,
3131 coursepage = self._download_webpage(url, info['id'],
3132 note='Downloading course info page',
3133 errnote='Unable to download course info page')
3135 m = re.search('<h1>([^<]+)</h1>', coursepage)
3137 info['title'] = unescapeHTML(m.group(1))
3139 info['title'] = info['id']
3141 m = re.search('<description>([^<]+)</description>', coursepage)
3143 info['description'] = unescapeHTML(m.group(1))
# Collect every VideoPage link and re-dispatch each through self.extract().
3145 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3148 'type': 'reference',
3149 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3153 for entry in info['list']:
3154 assert entry['type'] == 'reference'
3155 results += self.extract(entry['url'])
# Root page: enumerate all CoursePage links and recurse the same way.
3159 'id': 'Stanford OpenClassroom',
3162 'upload_date': None,
3165 self.report_download_webpage(info['id'])
3166 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3168 rootpage = compat_urllib_request.urlopen(rootURL).read()
3169 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3170 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3173 info['title'] = info['id']
3175 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3178 'type': 'reference',
3179 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3184 for entry in info['list']:
3185 assert entry['type'] == 'reference'
3186 results += self.extract(entry['url'])
# NOTE(review): MTVIE — scrapes mtv_vt/mtv_an/mtvn_uri meta tags, then fetches
# a mediaGen XML playlist and picks the last <rendition>. Chunk is elided
# (guards/returns missing from view); code kept byte-identical.
3189 class MTVIE(InfoExtractor):
3190 """Information extractor for MTV.com"""
3192 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3195 def report_extraction(self, video_id):
3196 """Report information extraction."""
3197 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3199 def _real_extract(self, url):
3200 mobj = re.match(self._VALID_URL, url)
3202 self._downloader.report_error(u'invalid URL: %s' % url)
3204 if not mobj.group('proto'):
3205 url = 'http://' + url
3206 video_id = mobj.group('videoid')
3208 webpage = self._download_webpage(url, video_id)
3210 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3212 self._downloader.report_error(u'unable to extract song name')
# NOTE(review): `.decode('iso-8859-1')` on a str match group is a Python 2
# leftover — it raises AttributeError on Python 3; confirm before use.
3214 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3215 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3217 self._downloader.report_error(u'unable to extract performer')
3219 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3220 video_title = performer + ' - ' + song_name
3222 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message below reads 'unable to mtvn_uri' — missing the word
# 'extract' (runtime string, deliberately left unchanged here).
3224 self._downloader.report_error(u'unable to mtvn_uri')
3226 mtvn_uri = mobj.group(1)
3228 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3230 self._downloader.report_error(u'unable to extract content id')
3232 content_id = mobj.group(1)
3234 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3235 self.report_extraction(video_id)
3236 request = compat_urllib_request.Request(videogen_url)
3238 metadataXml = compat_urllib_request.urlopen(request).read()
3239 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3240 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3243 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3244 renditions = mdoc.findall('.//rendition')
3246 # For now, always pick the highest quality.
3247 rendition = renditions[-1]
3250 _,_,ext = rendition.attrib['type'].partition('/')
3251 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3252 video_url = rendition.find('./src').text
# NOTE(review): uses deprecated `trouble()` (other branches in this file use
# report_error) and a non-u'' string — inconsistent with surrounding style.
3254 self._downloader.trouble('Invalid rendition field.')
3260 'uploader': performer,
3261 'upload_date': None,
3262 'title': video_title,
# NOTE(review): YoukuIE — fetches a getPlayList JSON config, de-scrambles the
# segment file id with a seeded keystream, and emits one info dict per video
# segment. Chunk is elided (e.g. the `def _gen_sid(self):` header before line
# 3282 and the `mixed = []`/`realId = []` initializers are missing from view);
# code kept byte-identical.
3270 class YoukuIE(InfoExtractor):
3271 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3273 def report_download_webpage(self, file_id):
3274 """Report webpage download."""
3275 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3277 def report_extraction(self, file_id):
3278 """Report information extraction."""
3279 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp + two random components (def line elided).
3282 nowTime = int(time.time() * 1000)
3283 random1 = random.randint(1000,1998)
3284 random2 = random.randint(1000,9999)
3286 return "%d%d%d" %(nowTime,random1,random2)
3288 def _get_file_ID_mix_string(self, seed):
# Linear-congruential shuffle of the alphabet, keyed by the server's seed.
3290 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3292 for i in range(len(source)):
3293 seed = (seed * 211 + 30031 ) % 65536
3294 index = math.floor(seed / 65536 * len(source) )
3295 mixed.append(source[int(index)])
3296 source.remove(source[int(index)])
3297 #return ''.join(mixed)
3300 def _get_file_id(self, fileId, seed):
# Each '*'-separated token is an index into the shuffled alphabet.
3301 mixed = self._get_file_ID_mix_string(seed)
3302 ids = fileId.split('*')
3306 realId.append(mixed[int(ch)])
3307 return ''.join(realId)
3309 def _real_extract(self, url):
3310 mobj = re.match(self._VALID_URL, url)
3312 self._downloader.report_error(u'invalid URL: %s' % url)
3314 video_id = mobj.group('ID')
3316 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3318 request = compat_urllib_request.Request(info_url, None, std_headers)
3320 self.report_download_webpage(video_id)
3321 jsondata = compat_urllib_request.urlopen(request).read()
3322 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3323 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3326 self.report_extraction(video_id)
3328 jsonstr = jsondata.decode('utf-8')
3329 config = json.loads(jsonstr)
3331 video_title = config['data'][0]['title']
3332 seed = config['data'][0]['seed']
3334 format = self._downloader.params.get('format', None)
3335 supported_format = list(config['data'][0]['streamfileids'].keys())
3337 if format is None or format == 'best':
3338 if 'hd2' in supported_format:
3343 elif format == 'worst':
3351 fileid = config['data'][0]['streamfileids'][format]
3352 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3353 except (UnicodeDecodeError, ValueError, KeyError):
3354 self._downloader.report_error(u'unable to extract info section')
3358 sid = self._gen_sid()
3359 fileid = self._get_file_id(fileid, seed)
3361 #column 8,9 of fileid represent the segment number
3362 #fileid[7:9] should be changed
3363 for index, key in enumerate(keys):
3365 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3366 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3369 'id': '%s_part%02d' % (video_id, index),
3370 'url': download_url,
3372 'upload_date': None,
3373 'title': video_title,
3376 files_info.append(info)
# NOTE(review): XNXXIE — straight-line page scrape: flv URL, title and thumb
# come from three class-level regexes. Chunk is elided (`if result is None:`
# guards and the try/return framing are missing from view); code kept verbatim.
3381 class XNXXIE(InfoExtractor):
3382 """Information extractor for xnxx.com"""
3384 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3386 VIDEO_URL_RE = r'flv_url=(.*?)&'
3387 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3388 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3390 def report_webpage(self, video_id):
3391 """Report webpage download."""
3392 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3394 def report_extraction(self, video_id):
3395 """Report information extraction."""
3396 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3398 def _real_extract(self, url):
3399 mobj = re.match(self._VALID_URL, url)
3401 self._downloader.report_error(u'invalid URL: %s' % url)
3403 video_id = mobj.group(1)
3405 self.report_webpage(video_id)
3407 # Get webpage content
3409 webpage_bytes = compat_urllib_request.urlopen(url).read()
3410 webpage = webpage_bytes.decode('utf-8')
3411 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3412 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# flv_url is percent-encoded in the page; unquote it to get the real URL.
3415 result = re.search(self.VIDEO_URL_RE, webpage)
3417 self._downloader.report_error(u'unable to extract video url')
3419 video_url = compat_urllib_parse.unquote(result.group(1))
3421 result = re.search(self.VIDEO_TITLE_RE, webpage)
3423 self._downloader.report_error(u'unable to extract video title')
3425 video_title = result.group(1)
3427 result = re.search(self.VIDEO_THUMB_RE, webpage)
3429 self._downloader.report_error(u'unable to extract video thumbnail')
3431 video_thumbnail = result.group(1)
3437 'upload_date': None,
3438 'title': video_title,
3440 'thumbnail': video_thumbnail,
3441 'description': None,
# NOTE(review): GooglePlusIE — two-step scrape: (1) the post page for date/
# uploader/title, (2) the photos "video page" for redirector.googlevideo.com
# links, highest resolution picked by sorting. Chunk is elided (guards and
# returns missing from view); code kept byte-identical, docstring typos
# ("extry") corrected as documentation-only changes.
3445 class GooglePlusIE(InfoExtractor):
3446 """Information extractor for plus.google.com."""
3448 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3449 IE_NAME = u'plus.google'
3451 def __init__(self, downloader=None):
3452 InfoExtractor.__init__(self, downloader)
3454 def report_extract_entry(self, url):
3455 """Report downloading entry."""
3456 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3458 def report_date(self, upload_date):
3459 """Report the entry's upload date."""
3460 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3462 def report_uploader(self, uploader):
3463 """Report the entry's uploader."""
3464 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3466 def report_title(self, video_title):
3467 """Report the entry's title."""
3468 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3470 def report_extract_vid_page(self, video_page):
3471 """Report information extraction."""
3472 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3474 def _real_extract(self, url):
3475 # Extract id from URL
3476 mobj = re.match(self._VALID_URL, url)
3478 self._downloader.report_error(u'Invalid URL: %s' % url)
3481 post_url = mobj.group(0)
3482 video_id = mobj.group(1)
3484 video_extension = 'flv'
3486 # Step 1, Retrieve post webpage to extract further information
3487 self.report_extract_entry(post_url)
3488 request = compat_urllib_request.Request(post_url)
3490 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3492 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3495 # Extract update date
3497 pattern = 'title="Timestamp">(.*?)</a>'
3498 mobj = re.search(pattern, webpage)
3500 upload_date = mobj.group(1)
3501 # Convert timestring to a format suitable for filename
3502 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3503 upload_date = upload_date.strftime('%Y%m%d')
3504 self.report_date(upload_date)
3508 pattern = r'rel\="author".*?>(.*?)</a>'
3509 mobj = re.search(pattern, webpage)
3511 uploader = mobj.group(1)
3512 self.report_uploader(uploader)
3515 # Get the first line for title
3517 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3518 mobj = re.search(pattern, webpage)
3520 video_title = mobj.group(1)
3521 self.report_title(video_title)
3523 # Step 2, Stimulate clicking the image box to launch video
3524 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3525 mobj = re.search(pattern, webpage)
3527 self._downloader.report_error(u'unable to extract video page URL')
3529 video_page = mobj.group(1)
3530 request = compat_urllib_request.Request(video_page)
3532 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3533 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3534 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3536 self.report_extract_vid_page(video_page)
3539 # Extract video links on video page
3540 """Extract video links of all sizes"""
3541 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3542 mobj = re.findall(pattern, webpage)
3544 self._downloader.report_error(u'unable to extract video links')
3546 # Sort in resolution
3547 links = sorted(mobj)
3549 # Choose the lowest of the sort, i.e. highest resolution
3550 video_url = links[-1]
3551 # Only get the url. The resolution part in the tuple has no use anymore
3552 video_url = video_url[-1]
3553 # Treat escaped \u0026 style hex
# Py2 strs have .decode; on Py3 the AttributeError branch re-encodes via bytes.
3555 video_url = video_url.decode("unicode_escape")
3556 except AttributeError: # Python 3
3557 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3563 'uploader': uploader,
3564 'upload_date': upload_date,
3565 'title': video_title,
3566 'ext': video_extension,
# NOTE(review): NBAIE — builds the mp4 URL directly from the URL path and
# scrapes title/date/description from og:/page markup. Chunk is elided
# (guards and dict framing missing from view); code kept byte-identical.
3569 class NBAIE(InfoExtractor):
3570 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3573 def _real_extract(self, url):
3574 mobj = re.match(self._VALID_URL, url)
3576 self._downloader.report_error(u'invalid URL: %s' % url)
3579 video_id = mobj.group(1)
3580 if video_id.endswith('/index.html'):
3581 video_id = video_id[:-len('/index.html')]
3583 webpage = self._download_webpage(url, video_id)
3585 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure over `webpage`: first regex group, HTML-unescaped, or the default.
3586 def _findProp(rexp, default=None):
3587 m = re.search(rexp, webpage)
3589 return unescapeHTML(m.group(1))
3593 shortened_video_id = video_id.rpartition('/')[2]
3594 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3596 'id': shortened_video_id,
# NOTE(review): 'uploader_date' is not a documented info-dict field — likely
# a typo for 'upload_date' (see the class docstring at the top of this file);
# left unchanged here because the surrounding dict is elided from view.
3600 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3601 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): JustinTVIE — paginates the justin.tv REST API
# (_JUSTIN_PAGE_LIMIT clips per request) for either a channel archive or a
# single broadcast. Chunk is elided (try headers, returns and the paging
# loop's `while`/offset setup are missing from view); code kept verbatim.
3605 class JustinTVIE(InfoExtractor):
3606 """Information extractor for justin.tv and twitch.tv"""
3607 # TODO: One broadcast may be split into multiple videos. The key
3608 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3609 # starts at 1 and increases. Can we treat all parts as one video?
3611 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3612 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3613 _JUSTIN_PAGE_LIMIT = 100
3614 IE_NAME = u'justin.tv'
3616 def report_extraction(self, file_id):
3617 """Report information extraction."""
3618 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3620 def report_download_page(self, channel, offset):
3621 """Report attempt to download a single page of videos."""
3622 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3623 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3625 # Return count of items, list of *valid* items
3626 def _parse_page(self, url):
3628 urlh = compat_urllib_request.urlopen(url)
3629 webpage_bytes = urlh.read()
3630 webpage = webpage_bytes.decode('utf-8', 'ignore')
3631 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3632 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3635 response = json.loads(webpage)
# A non-list response is the API's error envelope.
3636 if type(response) != list:
3637 error_text = response.get('error', 'unknown error')
3638 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3641 for clip in response:
3642 video_url = clip['video_file_url']
3644 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from the date part -> YYYYMMDD.
3645 video_date = re.sub('-', '', clip['start_time'][:10])
3646 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3647 video_id = clip['id']
3648 video_title = clip.get('title', video_id)
3652 'title': video_title,
3653 'uploader': clip.get('channel_name', video_uploader_id),
3654 'uploader_id': video_uploader_id,
3655 'upload_date': video_date,
3656 'ext': video_extension,
3658 return (len(response), info)
3660 def _real_extract(self, url):
3661 mobj = re.match(self._VALID_URL, url)
3663 self._downloader.report_error(u'invalid URL: %s' % url)
3666 api = 'http://api.justin.tv'
3667 video_id = mobj.group(mobj.lastindex)
# lastindex == 1 means only the channel group matched (no /b/ broadcast id).
3669 if mobj.lastindex == 1:
3671 api += '/channel/archives/%s.json'
3673 api += '/broadcast/by_archive/%s.json'
3674 api = api % (video_id,)
3676 self.report_extraction(video_id)
3680 limit = self._JUSTIN_PAGE_LIMIT
3683 self.report_download_page(video_id, offset)
3684 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3685 page_count, page_info = self._parse_page(page_url)
3686 info.extend(page_info)
# A short page means we've reached the end of the archive.
3687 if not paged or page_count != limit:
# NOTE(review): FunnyOrDieIE — scrapes the <video><source> tag for the media
# URL, with a <title>-tag fallback for the title. Chunk is elided (guards,
# returns and the final info dict braces are missing); code kept verbatim.
3692 class FunnyOrDieIE(InfoExtractor):
3693 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3695 def _real_extract(self, url):
3696 mobj = re.match(self._VALID_URL, url)
3698 self._downloader.report_error(u'invalid URL: %s' % url)
3701 video_id = mobj.group('id')
3702 webpage = self._download_webpage(url, video_id)
3704 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3706 self._downloader.report_error(u'unable to find video information')
3707 video_url = unescapeHTML(m.group('url'))
3709 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3711 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
# NOTE(review): uses deprecated `trouble()` where siblings use report_error.
3713 self._downloader.trouble(u'Cannot find video title')
3714 title = clean_html(m.group('title'))
3716 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3718 desc = unescapeHTML(m.group('desc'))
3727 'description': desc,
# NOTE(review): SteamIE — overrides suitable() because _VALID_URL is a verbose
# (re.VERBOSE) pattern; pairs movie entries, titles and thumbnails positionally
# via zip. Chunk is elided (the gameID group and the per-video info dict body
# are missing from view); code kept byte-identical.
3731 class SteamIE(InfoExtractor):
3732 _VALID_URL = r"""http://store.steampowered.com/
3733 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3735 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3739 def suitable(cls, url):
3740 """Receives a URL and returns True if suitable for this IE."""
3741 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3743 def _real_extract(self, url):
3744 m = re.match(self._VALID_URL, url, re.VERBOSE)
3745 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3746 gameID = m.group('gameID')
3747 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3748 webpage = self._download_webpage(videourl, gameID)
3749 mweb = re.finditer(urlRE, webpage)
3750 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3751 titles = re.finditer(namesRE, webpage)
3752 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3753 thumbs = re.finditer(thumbsRE, webpage)
# NOTE(review): zip() assumes the three scans stay in lockstep — any page
# markup drift desynchronizes titles/thumbs from their videos.
3755 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3756 video_id = vid.group('videoID')
3757 title = vtitle.group('videoName')
3758 video_url = vid.group('videoURL')
3759 video_thumb = thumb.group('thumbnail')
3761 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3766 'title': unescapeHTML(title),
3767 'thumbnail': video_thumb
# NOTE(review): UstreamIE — constructs the CDN URL from the recorded-video id
# and scrapes title/uploader from data-* attributes. Chunk is elided (the
# returned info dict's framing is missing); code kept byte-identical.
# NOTE(review): both re.search calls below are used without a None check in
# the visible lines — a page change would raise AttributeError on .group().
3772 class UstreamIE(InfoExtractor):
3773 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3774 IE_NAME = u'ustream'
3776 def _real_extract(self, url):
3777 m = re.match(self._VALID_URL, url)
3778 video_id = m.group('videoID')
3779 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3780 webpage = self._download_webpage(url, video_id)
3781 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3782 title = m.group('title')
3783 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3784 uploader = m.group('uploader')
3790 'uploader': uploader
# NOTE(review): WorldStarHipHopIE — greps the raw page source for an
# hw-videos mp4/flv URL, then best-efforts a title and thumbnail (with a
# special-case title for "candy" pages). Chunk is elided (try headers and
# the returned dict's framing are missing); code kept byte-identical.
3794 class WorldStarHipHopIE(InfoExtractor):
3795 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3796 IE_NAME = u'WorldStarHipHop'
3798 def _real_extract(self, url):
3799 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3801 webpage_src = compat_urllib_request.urlopen(url).read()
3802 webpage_src = webpage_src.decode('utf-8')
3804 mobj = re.search(_src_url, webpage_src)
3806 m = re.match(self._VALID_URL, url)
3807 video_id = m.group('id')
3809 if mobj is not None:
3810 video_url = mobj.group()
3811 if 'mp4' in video_url:
# NOTE(review): deprecated `trouble()` with a hand-written 'ERROR: ' prefix —
# siblings use report_error, which adds the prefix itself.
3816 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3819 _title = r"""<title>(.*)</title>"""
3821 mobj = re.search(_title, webpage_src)
3823 if mobj is not None:
3824 title = mobj.group(1)
# NOTE(review): fallback title says 'World Start Hip Hop' — looks like a typo
# for 'World Star Hip Hop'. It is a runtime string, so left unchanged here.
3826 title = 'World Start Hip Hop - %s' % time.ctime()
3828 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3829 mobj = re.search(_thumbnail, webpage_src)
3831 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3832 if mobj is not None:
3833 thumbnail = mobj.group(1)
3835 _title = r"""candytitles.*>(.*)</span>"""
3836 mobj = re.search(_title, webpage_src)
3837 if mobj is not None:
3838 title = mobj.group(1)
3845 'thumbnail' : thumbnail,
# NOTE(review): RBMARadioIE — pulls the show's JSON blob out of an inline
# <script> tag, then reads the Akamai stream URL and metadata from it.
# Chunk is elided (try header and return/dict framing missing); code verbatim.
3850 class RBMARadioIE(InfoExtractor):
3851 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3853 def _real_extract(self, url):
3854 m = re.match(self._VALID_URL, url)
3855 video_id = m.group('videoID')
3857 webpage = self._download_webpage(url, video_id)
3858 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3860 raise ExtractorError(u'Cannot find metadata')
3861 json_data = m.group(1)
3864 data = json.loads(json_data)
3865 except ValueError as e:
3866 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' requests the 256 kbps constant-bitrate variant from Akamai.
3868 video_url = data['akamai_url'] + '&cbr=256'
3869 url_parts = compat_urllib_parse_urlparse(video_url)
3870 video_ext = url_parts.path.rpartition('.')[2]
3875 'title': data['title'],
3876 'description': data.get('teaser_text'),
3877 'location': data.get('country_of_origin'),
3878 'uploader': data.get('host', {}).get('name'),
3879 'uploader_id': data.get('host', {}).get('slug'),
3880 'thumbnail': data.get('image', {}).get('large_url_2x'),
3881 'duration': data.get('duration'),
# NOTE(review): YouPornIE — scrapes the download list for all format links,
# derives size/bitrate from the URL path, and honors -f/--list-formats.
# Chunk is elided (guards, the per-link loop header, and the formats list
# construction are missing from view); code kept byte-identical.
3886 class YouPornIE(InfoExtractor):
3887 """Information extractor for youporn.com."""
3888 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3890 def _print_formats(self, formats):
3891 """Print all available formats"""
3892 print(u'Available formats:')
3893 print(u'ext\t\tformat')
3894 print(u'---------------------------------')
3895 for format in formats:
3896 print(u'%s\t\t%s' % (format['ext'], format['format']))
3898 def _specific(self, req_format, formats):
# Linear scan for the entry whose 'format' equals the requested one.
3900 if(x["format"]==req_format):
3904 def _real_extract(self, url):
3905 mobj = re.match(self._VALID_URL, url)
3907 self._downloader.report_error(u'invalid URL: %s' % url)
3910 video_id = mobj.group('videoid')
# The age gate is bypassed with a pre-set verification cookie.
3912 req = compat_urllib_request.Request(url)
3913 req.add_header('Cookie', 'age_verified=1')
3914 webpage = self._download_webpage(req, video_id)
3916 # Get the video title
3917 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3919 raise ExtractorError(u'Unable to extract video title')
3920 video_title = result.group('title').strip()
3922 # Get the video date
3923 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3925 self._downloader.report_warning(u'unable to extract video date')
3928 upload_date = result.group('date').strip()
3930 # Get the video uploader
3931 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3933 self._downloader.report_warning(u'unable to extract uploader')
3934 video_uploader = None
3936 video_uploader = result.group('uploader').strip()
3937 video_uploader = clean_html( video_uploader )
3939 # Get all of the formats available
3940 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3941 result = re.search(DOWNLOAD_LIST_RE, webpage)
3943 raise ExtractorError(u'Unable to extract download list')
3944 download_list_html = result.group('download_list').strip()
3946 # Get all of the links from the page
3947 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3948 links = re.findall(LINK_RE, download_list_html)
3949 if(len(links) == 0):
3950 raise ExtractorError(u'ERROR: no known formats available for video')
3952 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3957 # A link looks like this:
3958 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3959 # A path looks like this:
3960 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3961 video_url = unescapeHTML( link )
3962 path = compat_urllib_parse_urlparse( video_url ).path
3963 extension = os.path.splitext( path )[1][1:]
# The 5th path segment encodes '<size>_<bitrate>_<id>'; keep size+bitrate.
3964 format = path.split('/')[4].split('_')[:2]
3967 format = "-".join( format )
3968 title = u'%s-%s-%s' % (video_title, size, bitrate)
3973 'uploader': video_uploader,
3974 'upload_date': upload_date,
3979 'description': None,
3983 if self._downloader.params.get('listformats', None):
3984 self._print_formats(formats)
3987 req_format = self._downloader.params.get('format', None)
3988 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Formats appear to be ordered best-first here (worst == last element).
3990 if req_format is None or req_format == 'best':
3992 elif req_format == 'worst':
3993 return [formats[-1]]
3994 elif req_format in ('-1', 'all'):
3997 format = self._specific( req_format, formats )
3999 self._downloader.report_error(u'requested format not available')
# NOTE(review): PornotubeIE — title comes straight from the URL; the flv URL
# and upload date are scraped from the page. Chunk is elided (guards and the
# info dict tail are missing from view); code kept byte-identical.
4005 class PornotubeIE(InfoExtractor):
4006 """Information extractor for pornotube.com."""
4007 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
4009 def _real_extract(self, url):
4010 mobj = re.match(self._VALID_URL, url)
4012 self._downloader.report_error(u'invalid URL: %s' % url)
4015 video_id = mobj.group('videoid')
4016 video_title = mobj.group('title')
4018 # Get webpage content
4019 webpage = self._download_webpage(url, video_id)
4022 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4023 result = re.search(VIDEO_URL_RE, webpage)
4025 self._downloader.report_error(u'unable to extract video url')
4027 video_url = compat_urllib_parse.unquote(result.group('url'))
4029 #Get the uploaded date
4030 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4031 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error message below says 'video title' but this branch fails
# on the upload *date* regex — copy/paste slip (runtime string, left as-is).
4033 self._downloader.report_error(u'unable to extract video title')
4035 upload_date = result.group('date')
4037 info = {'id': video_id,
4040 'upload_date': upload_date,
4041 'title': video_title,
# NOTE(review): YouJizzIE — two-page scrape: the watch page yields title and
# the embed-page URL; the embed page yields the actual file URL from a
# flashvars addVariable call. Chunk is elided (guards and the info dict body
# are partly missing from view); code kept byte-identical.
4047 class YouJizzIE(InfoExtractor):
4048 """Information extractor for youjizz.com."""
4049 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4051 def _real_extract(self, url):
4052 mobj = re.match(self._VALID_URL, url)
4054 self._downloader.report_error(u'invalid URL: %s' % url)
4057 video_id = mobj.group('videoid')
4059 # Get webpage content
4060 webpage = self._download_webpage(url, video_id)
4062 # Get the video title
4063 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4065 raise ExtractorError(u'ERROR: unable to extract video title')
4066 video_title = result.group('title').strip()
4068 # Get the embed page
4069 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4071 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is rebound to the numeric embed id found on the watch page.
4073 embed_page_url = result.group(0).strip()
4074 video_id = result.group('videoid')
4076 webpage = self._download_webpage(embed_page_url, video_id)
4079 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4081 raise ExtractorError(u'ERROR: unable to extract video url')
4082 video_url = result.group('source')
4084 info = {'id': video_id,
4086 'title': video_title,
4089 'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the mix's play/next API and collect every track's info."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The page embeds the mix metadata as a JS object literal.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # A random session id is good enough for the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API tells us when the mix is exhausted.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN video/thumbnail URLs and scrape title/uploader."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Video and thumbnail live at predictable CDN locations derived
        # from the video id; only title/uploader come from the page.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            # BUG FIX: fail with a descriptive error instead of crashing
            # with AttributeError on m.group() when the markup changes.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        if m is None:
            raise ExtractorError(u'Cannot find uploader')
        uploader = clean_html(m.group('uploader'))

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL = r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base class's
        # plain re.match cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match.group('type_talk'):
            # Single talk page.
            return [self._talk_info(url)]
        # Otherwise it is a playlist; resolve every talk on it.
        playlist_id = match.group('playlist_id')
        name = match.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
        return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Talk blocks and talk titles appear in the same order in the
        # page, so the two scans can be zipped together.
        video_matches = re.finditer(video_RE, webpage, re.VERBOSE)
        name_matches = re.finditer(video_name_RE, webpage)
        info = []
        for video_match, name_match in zip(video_matches, name_matches):
            talk_url = 'http://www.ted.com%s' % name_match.group('talk_url')
            info.append(self._talk_info(talk_url, video_match.group('video_id')))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = match.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Look up the video through the site's XML metadata service."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # url and title are mandatory; bail out if either is missing.
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text

        # The remaining fields are optional and fall back to defaults.
        # (local renamed from `format` to avoid shadowing the builtin)
        format_id_el = metadata.find('format_id')
        video_format = 'mp4' if format_id_el is None else format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Scrape the title from the page and the stream from the XML feed."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document lists the available renditions; the
        # last entry is taken.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        return [{
            'id': video_id,
            'url': video_url,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the mp4 URL, title, description and uploader.

        Description and uploader are optional; title and video url are
        mandatory and abort extraction when missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # was self._downloader.trouble(...): report_error is the
            # non-deprecated equivalent used by the other extractors
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: the original reported the problem but fell through
            # to m.group('title') and crashed with AttributeError; abort.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present;
        # otherwise fall back to the last URL path component.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams at all means the page is age-restricted ("fsk").
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4413 def gen_extractors():
4414 """ Return a list of an instance of every supported extractor.
4415 The order does matter; the first extractor matched is the one handling the URL.
4418 YoutubePlaylistIE(),
4443 StanfordOpenClassroomIE(),
4453 WorldStarHipHopIE(),