2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready tracks whether _real_initialize() has already run for this
    # instance; _WORKING marks broken IEs so their tests can be skipped.
    _ready = False
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL was passed rather than a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): itag -> extension/dimension tables reconstructed from
    # upstream youtube-dl of this vintage -- verify entries against history.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: name} of available subtitles, or an
        (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick the requested language, falling back to English, then to the
        # first language the video offers.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for the video."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print itag, extension and dimensions for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, log in (credentials or .netrc) and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the anti-forgery tokens the login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        # NOTE(review): hidden form fields reconstructed from upstream
        # youtube-dl of this vintage -- verify against git history.
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'\u971f',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-char video id from any accepted YouTube URL form."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 1 is the whole optional URL prefix; group 2 is the id itself.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info; try several 'el' values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter/age form so
        later requests are not blocked."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate "yt-XXXX" ids to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob when &mediaURL= is absent.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still load.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality present, from highest to lowest.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
918 class YahooIE(InfoExtractor):
919 """Information extractor for video.yahoo.com."""
922 # _VALID_URL matches all Yahoo! Video URLs
923 # _VPAGE_URL matches only the extractable '/watch/' URLs
924 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
925 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
926 IE_NAME = u'video.yahoo'
928 def _real_extract(self, url, new_video=True):
929 # Extract ID from URL
930 mobj = re.match(self._VALID_URL, url)
932 self._downloader.report_error(u'Invalid URL: %s' % url)
935 video_id = mobj.group(2)
936 video_extension = 'flv'
938 # Rewrite valid but non-extractable URLs as
939 # extractable English language /watch/ URLs
940 if re.match(self._VPAGE_URL, url) is None:
941 request = compat_urllib_request.Request(url)
943 webpage = compat_urllib_request.urlopen(request).read()
944 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
945 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
948 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
950 self._downloader.report_error(u'Unable to extract id field')
952 yahoo_id = mobj.group(1)
954 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
956 self._downloader.report_error(u'Unable to extract vid field')
958 yahoo_vid = mobj.group(1)
960 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
961 return self._real_extract(url, new_video=False)
963 # Retrieve video webpage to extract further information
964 request = compat_urllib_request.Request(url)
966 self.report_download_webpage(video_id)
967 webpage = compat_urllib_request.urlopen(request).read()
968 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
969 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
972 # Extract uploader and title from webpage
973 self.report_extraction(video_id)
974 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
976 self._downloader.report_error(u'unable to extract video title')
978 video_title = mobj.group(1).decode('utf-8')
980 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
982 self._downloader.report_error(u'unable to extract video uploader')
984 video_uploader = mobj.group(1).decode('utf-8')
986 # Extract video thumbnail
987 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
989 self._downloader.report_error(u'unable to extract video thumbnail')
991 video_thumbnail = mobj.group(1).decode('utf-8')
993 # Extract video description
994 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
996 self._downloader.report_error(u'unable to extract video description')
998 video_description = mobj.group(1).decode('utf-8')
999 if not video_description:
1000 video_description = 'No description available.'
1002 # Extract video height and width
1003 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1005 self._downloader.report_error(u'unable to extract video height')
1007 yv_video_height = mobj.group(1)
1009 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1011 self._downloader.report_error(u'unable to extract video width')
1013 yv_video_width = mobj.group(1)
1015 # Retrieve video playlist to extract media URL
1016 # I'm not completely sure what all these options are, but we
1017 # seem to need most of them, otherwise the server sends a 401.
1018 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1019 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1020 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1021 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1022 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1024 self.report_download_webpage(video_id)
1025 webpage = compat_urllib_request.urlopen(request).read()
1026 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1027 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1030 # Extract media URL from playlist XML
1031 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1033 self._downloader.report_error(u'Unable to extract media URL')
1035 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1036 video_url = unescapeHTML(video_url)
1039 'id': video_id.decode('utf-8'),
1041 'uploader': video_uploader,
1042 'upload_date': None,
1043 'title': video_title,
1044 'ext': video_extension.decode('utf-8'),
1045 'thumbnail': video_thumbnail.decode('utf-8'),
1046 'description': video_description,
1050 class VimeoIE(InfoExtractor):
# Extracts title, uploader, thumbnail, description, upload date and a direct
# stream URL from vimeo.com video pages.
# NOTE(review): this chunk is a numbered listing with elided source lines
# (gaps in the embedded numbering) — the missing lines include try:/return
# statements and "if mobj is None:" guards; confirm against the full file.
1051 """Information extractor for vimeo.com."""
1053 # _VALID_URL matches Vimeo URLs
1054 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1057 def _real_extract(self, url, new_video=True):
1058 # Extract ID from URL
1059 mobj = re.match(self._VALID_URL, url)
# (elided guard) report_error fires when the URL does not match _VALID_URL
1061 self._downloader.report_error(u'Invalid URL: %s' % url)
1064 video_id = mobj.group('id')
# Normalize the URL: force https and strip the player redirect form.
1065 if not mobj.group('proto'):
1066 url = 'https://' + url
1067 if mobj.group('direct_link'):
1068 url = 'https://vimeo.com/' + video_id
1070 # Retrieve video webpage to extract further information
1071 request = compat_urllib_request.Request(url, None, std_headers)
1073 self.report_download_webpage(video_id)
1074 webpage_bytes = compat_urllib_request.urlopen(request).read()
1075 webpage = webpage_bytes.decode('utf-8')
1076 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1077 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1080 # Now we begin extracting as much information as we can from what we
1081 # retrieved. First we extract the information common to all extractors,
1082 # and latter we extract those that are Vimeo specific.
1083 self.report_extraction(video_id)
1085 # Extract the config JSON
# NOTE(review): fragile string-splitting on ' = {config:' / ',assets:' to
# carve the JSON out of the page; breaks if Vimeo changes its page markup.
1087 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1088 config = json.loads(config)
# Failure path: distinguish an embed-permission restriction from a generic
# parse failure so the user gets an actionable hint (--referer).
1090 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1091 self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
1093 self._downloader.report_error(u'unable to extract info section')
1097 video_title = config["video"]["title"]
1099 # Extract uploader and uploader_id
1100 video_uploader = config["video"]["owner"]["name"]
1101 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1103 # Extract video thumbnail
1104 video_thumbnail = config["video"]["thumbnail"]
1106 # Extract video description
1107 video_description = get_element_by_attribute("itemprop", "description", webpage)
1108 if video_description: video_description = clean_html(video_description)
1109 else: video_description = u''
1111 # Extract upload date
# Converts the ISO "YYYY-MM-DDT..." meta value into the YYYYMMDD format
# documented for the upload_date field.
1112 video_upload_date = None
1113 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1114 if mobj is not None:
1115 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1117 # Vimeo specific: extract request signature and timestamp
1118 sig = config['request']['signature']
1119 timestamp = config['request']['timestamp']
1121 # Vimeo specific: extract video codec and quality information
1122 # First consider quality, then codecs, then take everything
1123 # TODO bind to format param
# Codec preference order: h264 first, then vp8, then vp6; within a codec,
# prefer hd over sd over whatever the first listed quality is.
1124 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1125 files = { 'hd': [], 'sd': [], 'other': []}
1126 for codec_name, codec_extension in codecs:
1127 if codec_name in config["video"]["files"]:
1128 if 'hd' in config["video"]["files"][codec_name]:
1129 files['hd'].append((codec_name, codec_extension, 'hd'))
1130 elif 'sd' in config["video"]["files"][codec_name]:
1131 files['sd'].append((codec_name, codec_extension, 'sd'))
1133 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1135 for quality in ('hd', 'sd', 'other'):
1136 if len(files[quality]) > 0:
1137 video_quality = files[quality][0][2]
1138 video_codec = files[quality][0][0]
1139 video_extension = files[quality][0][1]
1140 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1143 self._downloader.report_error(u'no known codec found')
# Build the play_redirect URL from the signature/timestamp pair and the
# chosen quality/codec.
1146 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1147 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# (elided) return of the single-entry info-dict list; trailing keys below.
1152 'uploader': video_uploader,
1153 'uploader_id': video_uploader_id,
1154 'upload_date': video_upload_date,
1155 'title': video_title,
1156 'ext': video_extension,
1157 'thumbnail': video_thumbnail,
1158 'description': video_description,
1162 class ArteTvIE(InfoExtractor):
# Extracts videos from videos.arte.tv (fr/de), with separate paths for live
# streams and "Plus 7" catch-up videos.
# NOTE(review): numbered listing with elided lines (try:/return statements,
# "if mobj is None:" guards, grep_webpage call arguments such as the url and
# flags) — confirm control flow against the full source file.
1163 """arte.tv information extractor."""
1165 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1166 _LIVE_URL = r'index-[0-9]+\.html$'
1168 IE_NAME = u'arte.tv'
# Download a URL and return the raw page bytes; reports errors through the
# downloader on network or URL failures.
1170 def fetch_webpage(self, url):
1171 request = compat_urllib_request.Request(url)
1173 self.report_download_webpage(url)
1174 webpage = compat_urllib_request.urlopen(request).read()
1175 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1176 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1178 except ValueError as err:
1179 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch url, apply regex, and map capture groups into an info dict.
# matchTuples is a list of (group_index, key, error_message) triples; each
# missing group is reported with its own error message.
1183 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1184 page = self.fetch_webpage(url)
1185 mobj = re.search(regex, page, regexFlags)
1189 self._downloader.report_error(u'Invalid URL: %s' % url)
1192 for (i, key, err) in matchTuples:
1193 if mobj.group(i) is None:
1194 self._downloader.report_error(err)
1197 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then scrape the geo-restricted
# FR/DE stream path, SWF player and RTMP url from it.
1201 def extractLiveStream(self, url):
# language code is the 4th path component from the end — TODO confirm
1202 video_lang = url.split('/')[-4]
1203 info = self.grep_webpage(
1205 r'src="(.*?/videothek_js.*?\.js)',
1208 (1, 'url', u'Invalid URL: %s' % url)
1211 http_host = url.split('/')[2]
1212 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1213 info = self.grep_webpage(
1215 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1216 '(http://.*?\.swf).*?' +
1220 (1, 'path', u'could not extract video path: %s' % url),
1221 (2, 'player', u'could not extract video player: %s' % url),
1222 (3, 'url', u'could not extract video url: %s' % url)
1225 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" path: follow two levels of indirection (videorefFileUrl, then the
# per-language <video ref>) to the final XML describing the video.
1227 def extractPlus7Stream(self, url):
# language code is the 3rd path component from the end — TODO confirm
1228 video_lang = url.split('/')[-3]
1229 info = self.grep_webpage(
1231 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1234 (1, 'url', u'Invalid URL: %s' % url)
1237 next_url = compat_urllib_parse.unquote(info.get('url'))
1238 info = self.grep_webpage(
1240 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1243 (1, 'url', u'Could not find <video> tag: %s' % url)
1246 next_url = compat_urllib_parse.unquote(info.get('url'))
1248 info = self.grep_webpage(
1250 r'<video id="(.*?)".*?>.*?' +
1251 '<name>(.*?)</name>.*?' +
1252 '<dateVideo>(.*?)</dateVideo>.*?' +
1253 '<url quality="hd">(.*?)</url>',
1256 (1, 'id', u'could not extract video id: %s' % url),
1257 (2, 'title', u'could not extract video title: %s' % url),
1258 (3, 'date', u'could not extract video date: %s' % url),
1259 (4, 'url', u'could not extract video url: %s' % url)
# (elided) return of the assembled info dict; keys below.
1264 'id': info.get('id'),
1265 'url': compat_urllib_parse.unquote(info.get('url')),
1266 'uploader': u'arte.tv',
1267 'upload_date': info.get('date'),
1268 'title': info.get('title').decode('utf-8'),
# Dispatch on URL shape: live index pages go through extractLiveStream,
# everything else through extractPlus7Stream.
1274 def _real_extract(self, url):
1275 video_id = url.split('/')[-1]
1276 self.report_extraction(video_id)
1278 if re.search(self._LIVE_URL, video_id) is not None:
1279 self.extractLiveStream(url)
1282 info = self.extractPlus7Stream(url)
1287 class GenericIE(InfoExtractor):
# Last-resort extractor: follows redirects (URL shorteners) via HEAD
# requests, then scrapes the page for a JW-Player-style file=... video URL.
# NOTE(review): numbered listing with elided lines (try:/return statements,
# "if mobj is None:" guards, handler method bodies) — confirm against the
# full source before relying on exact control flow.
1288 """Generic last-resort information extractor."""
1291 IE_NAME = u'generic'
1293 def report_download_webpage(self, video_id):
1294 """Report webpage download."""
# Warn that the generic fallback kicked in (suppressed in test mode), then
# defer to the base-class reporting.
1295 if not self._downloader.params.get('test', False):
1296 self._downloader.report_warning(u'Falling back on generic information extractor.')
1297 super(GenericIE, self).report_download_webpage(video_id)
1299 def report_following_redirect(self, new_url):
1300 """Report information extraction."""
1301 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1303 def _test_redirect(self, url):
1304 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass that issues HEAD instead of GET (get_method body elided).
1305 class HeadRequest(compat_urllib_request.Request):
1306 def get_method(self):
1309 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1311 Subclass the HTTPRedirectHandler to make it use our
1312 HeadRequest also on the redirected URL
1314 def redirect_request(self, req, fp, code, msg, headers, newurl):
1315 if code in (301, 302, 303, 307):
# Escape spaces so the redirected location is a valid URL, and drop body
# headers that make no sense on a HEAD request.
1316 newurl = newurl.replace(' ', '%20')
1317 newheaders = dict((k,v) for k,v in req.headers.items()
1318 if k.lower() not in ("content-length", "content-type"))
1319 return HeadRequest(newurl,
1321 origin_req_host=req.get_origin_req_host(),
1324 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1326 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1328 Fallback to GET if HEAD is not allowed (405 HTTP error)
1330 def http_error_405(self, req, fp, code, msg, headers):
1334 newheaders = dict((k,v) for k,v in req.headers.items()
1335 if k.lower() not in ("content-length", "content-type"))
1336 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1338 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers needed for the HEAD probe.
1342 opener = compat_urllib_request.OpenerDirector()
1343 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1344 HTTPMethodFallback, HEADRedirectHandler,
1345 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1346 opener.add_handler(handler())
1348 response = opener.open(HeadRequest(url))
1349 new_url = response.geturl()
1354 self.report_following_redirect(new_url)
1357 def _real_extract(self, url):
# If the URL is a redirect, hand the target off to the matching extractor.
1358 new_url = self._test_redirect(url)
1359 if new_url: return [self.url_result(new_url)]
1361 video_id = url.split('/')[-1]
1363 webpage = self._download_webpage(url, video_id)
1364 except ValueError as err:
1365 # since this is the last-resort InfoExtractor, if
1366 # this error is thrown, it'll be thrown here
1367 self._downloader.report_error(u'Invalid URL: %s' % url)
1370 self.report_extraction(video_id)
1371 # Start with something easy: JW Player in SWFObject
1372 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1374 # Broaden the search a little bit
1375 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1377 # Broaden the search a little bit: JWPlayer JS loader
1378 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1380 self._downloader.report_error(u'Invalid URL: %s' % url)
1383 # It's possible that one of the regexes
1384 # matched, but returned an empty group:
1385 if mobj.group(1) is None:
1386 self._downloader.report_error(u'Invalid URL: %s' % url)
1389 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Derive id/extension from the media URL's basename.
1390 video_id = os.path.basename(video_url)
1392 # here's a fun little line of code for you:
1393 video_extension = os.path.splitext(video_id)[1][1:]
1394 video_id = os.path.splitext(video_id)[0]
1396 # it's tempting to parse this further, but you would
1397 # have to take into account all the variations like
1398 # Video Title - Site Name
1399 # Site Name | Video Title
1400 # Video Title - Tagline | Site Name
1401 # and so on and so forth; it's just not practical
1402 mobj = re.search(r'<title>(.*)</title>', webpage)
1404 self._downloader.report_error(u'unable to extract title')
1406 video_title = mobj.group(1)
1408 # video uploader is domain name
1409 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1411 self._downloader.report_error(u'unable to extract title')
1413 video_uploader = mobj.group(1)
# (elided) return of the info-dict list; trailing keys below.
1418 'uploader': video_uploader,
1419 'upload_date': None,
1420 'title': video_title,
1421 'ext': video_extension,
1425 class YoutubeSearchIE(InfoExtractor):
# Handles "ytsearch<N>:query" / "ytsearchall:query" pseudo-URLs by paging
# through the GData API (50 results per page) and yielding watch URLs.
# NOTE(review): numbered listing with elided lines (try:/return statements,
# "if mobj is None:" guards, pagenum/limit initialization) — confirm
# against the full source file.
1426 """Information Extractor for YouTube search queries."""
1427 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1428 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1429 _max_youtube_results = 1000
1430 IE_NAME = u'youtube:search'
1432 def report_download_page(self, query, pagenum):
1433 """Report attempt to download search page with given number."""
1434 query = query.decode(preferredencoding())
1435 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1437 def _real_extract(self, query):
1438 mobj = re.match(self._VALID_URL, query)
1440 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text; no prefix => 1 result,
# "all" => the _max_youtube_results cap, otherwise parse N.
1443 prefix, query = query.split(':')
1445 query = query.encode('utf-8')
1447 return self._get_n_results(query, 1)
1448 elif prefix == 'all':
1449 self._get_n_results(query, self._max_youtube_results)
1454 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1456 elif n > self._max_youtube_results:
1457 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1458 n = self._max_youtube_results
1459 return self._get_n_results(query, n)
1460 except ValueError: # parsing prefix as integer fails
1461 return self._get_n_results(query, 1)
1463 def _get_n_results(self, query, n):
1464 """Get a specified number of results for a query"""
# Page through the API 50 ids at a time until the requested count or the
# API's totalItems is reached.
1470 while (50 * pagenum) < limit:
1471 self.report_download_page(query, pagenum+1)
1472 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1473 request = compat_urllib_request.Request(result_url)
1475 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1476 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1477 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1479 api_response = json.loads(data)['data']
1481 if not 'items' in api_response:
1482 self._downloader.report_error(u'[youtube] No video results')
1485 new_ids = list(video['id'] for video in api_response['items'])
1486 video_ids += new_ids
# Shrink the limit to the API's reported total so we stop early when there
# are fewer results than requested.
1488 limit = min(n, api_response['totalItems'])
1491 if len(video_ids) > n:
1492 video_ids = video_ids[:n]
1493 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1497 class GoogleSearchIE(InfoExtractor):
# Handles "gvsearch<N>:query" pseudo-URLs by scraping Google Video search
# result pages and queueing each videoplay URL on the downloader directly.
# NOTE(review): numbered listing with elided lines (try:/return statements,
# guard clauses, pagenum/video_ids initialization and the outer paging
# loop header) — confirm against the full source file.
1498 """Information Extractor for Google Video search queries."""
1499 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1500 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1501 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1502 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1503 _max_google_results = 1000
1504 IE_NAME = u'video.google:search'
1506 def report_download_page(self, query, pagenum):
1507 """Report attempt to download playlist page with given number."""
1508 query = query.decode(preferredencoding())
1509 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1511 def _real_extract(self, query):
1512 mobj = re.match(self._VALID_URL, query)
1514 self._downloader.report_error(u'invalid search query "%s"' % query)
# Same prefix convention as the other search IEs: no number => 1 result,
# "all" => the cap, otherwise parse N (capped at _max_google_results).
1517 prefix, query = query.split(':')
1519 query = query.encode('utf-8')
1521 self._download_n_results(query, 1)
1523 elif prefix == 'all':
1524 self._download_n_results(query, self._max_google_results)
1530 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1532 elif n > self._max_google_results:
1533 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1534 n = self._max_google_results
1535 self._download_n_results(query, n)
1537 except ValueError: # parsing prefix as integer fails
1538 self._download_n_results(query, 1)
1541 def _download_n_results(self, query, n):
1542 """Downloads a specified number of results for a query"""
1548 self.report_download_page(query, pagenum)
# Google paginates by result offset, 10 results per page.
1549 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1550 request = compat_urllib_request.Request(result_url)
1552 page = compat_urllib_request.urlopen(request).read()
1553 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1554 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1557 # Extract video identifiers
1558 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1559 video_id = mobj.group(1)
1560 if video_id not in video_ids:
1561 video_ids.append(video_id)
1562 if len(video_ids) == n:
1563 # Specified n videos reached
# Hand every collected id straight to the downloader (no return value).
1564 for id in video_ids:
1565 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" link means we ran out of result pages before reaching n.
1568 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1569 for id in video_ids:
1570 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1573 pagenum = pagenum + 1
1576 class YahooSearchIE(InfoExtractor):
# Handles "yvsearch<N>:query" pseudo-URLs by scraping Yahoo! Video search
# pages; mirrors GoogleSearchIE's structure, plus an already_seen set for
# de-duplication.
# NOTE(review): numbered listing with elided lines (try:/return statements,
# guard clauses, pagenum/video_ids initialization and the paging loop
# header) — confirm against the full source file.
1577 """Information Extractor for Yahoo! Video search queries."""
1580 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1581 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1582 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1583 _MORE_PAGES_INDICATOR = r'\s*Next'
1584 _max_yahoo_results = 1000
1585 IE_NAME = u'video.yahoo:search'
1587 def report_download_page(self, query, pagenum):
1588 """Report attempt to download playlist page with given number."""
1589 query = query.decode(preferredencoding())
1590 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1592 def _real_extract(self, query):
1593 mobj = re.match(self._VALID_URL, query)
1595 self._downloader.report_error(u'invalid search query "%s"' % query)
# Prefix convention: no number => 1 result, "all" => the cap, otherwise
# parse N (capped at _max_yahoo_results).
1598 prefix, query = query.split(':')
1600 query = query.encode('utf-8')
1602 self._download_n_results(query, 1)
1604 elif prefix == 'all':
1605 self._download_n_results(query, self._max_yahoo_results)
1611 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1613 elif n > self._max_yahoo_results:
1614 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1615 n = self._max_yahoo_results
1616 self._download_n_results(query, n)
1618 except ValueError: # parsing prefix as integer fails
1619 self._download_n_results(query, 1)
1622 def _download_n_results(self, query, n):
1623 """Downloads a specified number of results for a query"""
1626 already_seen = set()
1630 self.report_download_page(query, pagenum)
1631 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1632 request = compat_urllib_request.Request(result_url)
1634 page = compat_urllib_request.urlopen(request).read()
1635 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1636 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1639 # Extract video identifiers
1640 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1641 video_id = mobj.group(1)
1642 if video_id not in already_seen:
1643 video_ids.append(video_id)
1644 already_seen.add(video_id)
1645 if len(video_ids) == n:
1646 # Specified n videos reached
# Queue every collected watch URL on the downloader (no return value).
1647 for id in video_ids:
1648 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link means the result pages are exhausted before reaching n.
1651 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1652 for id in video_ids:
1653 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1656 pagenum = pagenum + 1
1659 class YoutubePlaylistIE(InfoExtractor):
# Resolves a YouTube playlist/album/course URL to its video entries via the
# GData JSON API, ordered by yt$position.
# NOTE(review): numbered listing with elided lines (parts of the verbose
# _VALID_URL pattern, try:/return/break statements, page_num/videos
# initialization, _MAX_RESULTS definition) — confirm against full source.
1660 """Information Extractor for YouTube playlists."""
1662 _VALID_URL = r"""(?:
1667 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1668 \? (?:.*?&)*? (?:p|a|list)=
1671 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1674 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1676 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1678 IE_NAME = u'youtube:playlist'
1681 def suitable(cls, url):
1682 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base implementation because _VALID_URL is written with
# re.VERBOSE formatting.
1683 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1685 def report_download_page(self, playlist_id, pagenum):
1686 """Report attempt to download playlist page with given number."""
1687 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1689 def _real_extract(self, url):
1690 # Extract playlist id
1691 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1693 self._downloader.report_error(u'invalid url: %s' % url)
1696 # Download playlist videos from API
# Either capture group may have matched, depending on URL form.
1697 playlist_id = mobj.group(1) or mobj.group(2)
1702 self.report_download_page(playlist_id, page_num)
# GData start-index is 1-based, hence the "+ 1".
1704 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1706 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1707 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1708 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1712 response = json.loads(page)
1713 except ValueError as err:
1714 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1717 if 'feed' not in response:
1718 self._downloader.report_error(u'Got a malformed response from YouTube API')
1720 playlist_title = response['feed']['title']['$t']
1721 if 'entry' not in response['feed']:
1722 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, url) pairs; entries without 'content' (e.g. deleted
# videos) are skipped.
1725 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1726 for entry in response['feed']['entry']
1727 if 'content' in entry ]
# A short page means it was the last one.
1729 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then drop the position.
1733 videos = [v[1] for v in sorted(videos)]
1735 url_results = [self.url_result(url, 'Youtube') for url in videos]
1736 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1739 class YoutubeChannelIE(InfoExtractor):
# Collects all video ids from a YouTube channel: first page via the plain
# channel videos page, subsequent pages via the JSON channel_ajax endpoint.
# NOTE(review): numbered listing with elided lines (try:/return/break
# statements, video_ids/pagenum initialization, the inner paging loop
# header) — confirm against the full source file.
1740 """Information Extractor for YouTube channels."""
1742 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1743 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1744 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1745 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1746 IE_NAME = u'youtube:channel'
1748 def report_download_page(self, channel_id, pagenum):
1749 """Report attempt to download channel page with given number."""
1750 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
# Scrape /watch?v=... ids from a page fragment, preserving first-seen order
# and skipping duplicates.
1752 def extract_videos_from_page(self, page):
1754 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1755 if mobj.group(1) not in ids_in_page:
1756 ids_in_page.append(mobj.group(1))
1759 def _real_extract(self, url):
1760 # Extract channel id
1761 mobj = re.match(self._VALID_URL, url)
1763 self._downloader.report_error(u'invalid url: %s' % url)
1766 # Download channel page
1767 channel_id = mobj.group(1)
1771 self.report_download_page(channel_id, pagenum)
1772 url = self._TEMPLATE_URL % (channel_id, pagenum)
1773 request = compat_urllib_request.Request(url)
1775 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1776 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1777 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1780 # Extract video identifiers
1781 ids_in_page = self.extract_videos_from_page(page)
1782 video_ids.extend(ids_in_page)
1784 # Download any subsequent channel pages using the json-based channel_ajax query
# The "load more" marker in the HTML signals more pages exist.
1785 if self._MORE_PAGES_INDICATOR in page:
1787 pagenum = pagenum + 1
1789 self.report_download_page(channel_id, pagenum)
1790 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1791 request = compat_urllib_request.Request(url)
1793 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1794 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1795 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Ajax responses are JSON wrapping an HTML fragment in 'content_html'.
1798 page = json.loads(page)
1800 ids_in_page = self.extract_videos_from_page(page['content_html'])
1801 video_ids.extend(ids_in_page)
# Stop when the widget HTML no longer offers a "load more" control.
1803 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1806 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1808 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1809 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1810 return [self.playlist_result(url_entries, channel_id)]
1813 class YoutubeUserIE(InfoExtractor):
# Lists all uploads of a YouTube user by paging the GData uploads feed in
# _GDATA_PAGE_SIZE chunks and scraping /watch?v= ids from each page.
# NOTE(review): numbered listing with elided lines (try:/return/break
# statements, pagenum/video_ids/ids_in_page initialization, the paging loop
# header) — confirm against the full source file.
1814 """Information Extractor for YouTube users."""
1816 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1817 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1818 _GDATA_PAGE_SIZE = 50
1819 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1820 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1821 IE_NAME = u'youtube:user'
1823 def report_download_page(self, username, start_index):
1824 """Report attempt to download user page."""
1825 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1826 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1828 def _real_extract(self, url):
1830 mobj = re.match(self._VALID_URL, url)
1832 self._downloader.report_error(u'invalid url: %s' % url)
1835 username = mobj.group(1)
1837 # Download video ids using YouTube Data API. Result size per
1838 # query is limited (currently to 50 videos) so we need to query
1839 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1846 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1847 self.report_download_page(username, start_index)
1849 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1852 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1853 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1854 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1857 # Extract video identifiers
# De-duplicate ids within a page while preserving order.
1860 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1861 if mobj.group(1) not in ids_in_page:
1862 ids_in_page.append(mobj.group(1))
1864 video_ids.extend(ids_in_page)
1866 # A little optimization - if current page is not
1867 # "full", ie. does not contain PAGE_SIZE video ids then
1868 # we can assume that this page is the last one - there
1869 # are no more ids on further pages - no need to query
1872 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1877 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1878 url_results = [self.url_result(url, 'Youtube') for url in urls]
1879 return [self.playlist_result(url_results, playlist_title = username)]
1882 class BlipTVUserIE(InfoExtractor):
# Lists all videos of a blip.tv user: resolves the numeric users_id from the
# profile page, then pages the mobile episode-list endpoint.
# NOTE(review): numbered listing with elided lines (try:/return/break
# statements, pagenum/video_ids/ids_in_page initialization, the paging loop
# header, _PAGE_SIZE definition) — confirm against the full source file.
1883 """Information Extractor for blip.tv users."""
1885 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1887 IE_NAME = u'blip.tv:user'
1889 def report_download_page(self, username, pagenum):
1890 """Report attempt to download user page."""
1891 self.to_screen(u'user %s: Downloading video ids from page %d' %
1892 (username, pagenum))
1894 def _real_extract(self, url):
1896 mobj = re.match(self._VALID_URL, url)
1898 self._downloader.report_error(u'invalid url: %s' % url)
1901 username = mobj.group(1)
# Template for the mobile full-episode-list API; users_id is filled in
# below after scraping it from the profile page.
1903 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1905 request = compat_urllib_request.Request(url)
1908 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1909 mobj = re.search(r'data-users-id="([^"]+)"', page)
1910 page_base = page_base % mobj.group(1)
1911 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1912 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1916 # Download video ids using BlipTV Ajax calls. Result size per
1917 # query is limited (currently to 12 videos) so we need to query
1918 # page by page until there are no video ids - it means we got
1925 self.report_download_page(username, pagenum)
1926 url = page_base + "&page=" + str(pagenum)
1927 request = compat_urllib_request.Request( url )
1929 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1930 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1931 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
1934 # Extract video identifiers
# De-duplicate hrefs within a page, unescaping HTML entities in the slug.
1937 for mobj in re.finditer(r'href="/([^"]+)"', page):
1938 if mobj.group(1) not in ids_in_page:
1939 ids_in_page.append(unescapeHTML(mobj.group(1)))
1941 video_ids.extend(ids_in_page)
1943 # A little optimization - if current page is not
1944 # "full", ie. does not contain PAGE_SIZE video ids then
1945 # we can assume that this page is the last one - there
1946 # are no more ids on further pages - no need to query
1949 if len(ids_in_page) < self._PAGE_SIZE:
1954 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1955 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1956 return [self.playlist_result(url_entries, playlist_title = username)]
1959 class DepositFilesIE(InfoExtractor):
# Extracts the real download URL and title for a depositfiles.com file by
# POSTing the "Free download" form and scraping the response.
# NOTE(review): numbered listing with elided lines (try:/return statements,
# "if mobj is None:" guards, the closing of the return dict) — confirm
# against the full source file.
1960 """Information extractor for depositfiles.com"""
1962 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1964 def _real_extract(self, url):
1965 file_id = url.split('/')[-1]
1966 # Rebuild url in english locale
1967 url = 'http://depositfiles.com/en/files/' + file_id
1969 # Retrieve file webpage with 'Free download' button pressed
# Supplying POST data (gateway_result=1) simulates pressing the button.
1970 free_download_indication = { 'gateway_result' : '1' }
1971 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1973 self.report_download_webpage(file_id)
1974 webpage = compat_urllib_request.urlopen(request).read()
1975 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1976 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
1979 # Search for the real file URL
1980 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1981 if (mobj is None) or (mobj.group(1) is None):
1982 # Try to figure out reason of the error.
# Surface the site's own "Attention..." restriction notice when present,
# collapsing its whitespace; otherwise fall back to a generic error.
1983 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1984 if (mobj is not None) and (mobj.group(1) is not None):
1985 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1986 self._downloader.report_error(u'%s' % restriction_message)
1988 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
1991 file_url = mobj.group(1)
1992 file_extension = os.path.splitext(file_url)[1][1:]
1994 # Search for file title
1995 mobj = re.search(r'<b title="(.*?)">', webpage)
1997 self._downloader.report_error(u'unable to extract title')
1999 file_title = mobj.group(1).decode('utf-8')
# (elided) return of the info-dict list; trailing keys below.
2002 'id': file_id.decode('utf-8'),
2003 'url': file_url.decode('utf-8'),
2005 'upload_date': None,
2006 'title': file_title,
2007 'ext': file_extension.decode('utf-8'),
# FacebookIE: extracts video URL, title, duration and thumbnail from Facebook
# video pages, optionally logging in first with credentials from --username/
# --password or the user's .netrc file.
2011 class FacebookIE(InfoExtractor):
2012 """Information Extractor for Facebook"""
2014 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2015 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2016 _NETRC_MACHINE = 'facebook'
2017 IE_NAME = u'facebook'
2019 def report_login(self):
2020 """Report attempt to log in."""
2021 self.to_screen(u'Logging in')
# _real_initialize: perform the (optional) login before extraction starts.
# Credentials come from downloader params first, then .netrc.
2023 def _real_initialize(self):
2024 if self._downloader is None:
2029 downloader_params = self._downloader.params
2031 # Attempt to use provided username and password or .netrc data
2032 if downloader_params.get('username', None) is not None:
2033 useremail = downloader_params['username']
2034 password = downloader_params['password']
2035 elif downloader_params.get('usenetrc', False):
2037 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2038 if info is not None:
2042 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2043 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems only warn; extraction proceeds unauthenticated.
2044 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2047 if useremail is None:
2056 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2059 login_results = compat_urllib_request.urlopen(request).read()
# If the response still contains the login form, the login was rejected.
2060 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2061 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2063 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2064 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2067 def _real_extract(self, url):
2068 mobj = re.match(self._VALID_URL, url)
2070 self._downloader.report_error(u'invalid URL: %s' % url)
2072 video_id = mobj.group('ID')
2074 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2075 webpage = self._download_webpage(url, video_id)
# The player parameters are embedded as JSON between these two JS fragments.
2077 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2078 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2079 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2081 raise ExtractorError(u'Cannot parse data')
2082 data = dict(json.loads(m.group(1)))
2083 params_raw = compat_urllib_parse.unquote(data['params'])
2084 params = json.loads(params_raw)
2085 video_data = params['video_data'][0]
# Prefer the HD stream; fall back to SD (fallback branch line is elided).
2086 video_url = video_data.get('hd_src')
2088 video_url = video_data['sd_src']
2090 raise ExtractorError(u'Cannot find video URL')
2091 video_duration = int(video_data['video_duration'])
2092 thumbnail = video_data['thumbnail_src']
2094 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2096 raise ExtractorError(u'Cannot find title in webpage')
2097 video_title = unescapeHTML(m.group(1))
# Partial info dict; 'id', 'url' and 'ext' keys are on elided lines.
2101 'title': video_title,
2104 'duration': video_duration,
2105 'thumbnail': thumbnail,
# BlipTVIE: extracts video info from blip.tv, either via a direct-download
# response or via the site's JSON API (skin=json).
2110 class BlipTVIE(InfoExtractor):
2111 """Information extractor for blip.tv"""
2113 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2114 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2115 IE_NAME = u'blip.tv'
2117 def report_direct_download(self, title):
2118 """Report information extraction."""
2119 self.to_screen(u'%s: Direct download detected' % title)
2121 def _real_extract(self, url):
2122 mobj = re.match(self._VALID_URL, url)
2124 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a fragment containing the real file id; rebuild
# the canonical URL and recurse once.
2127 urlp = compat_urllib_parse_urlparse(url)
2128 if urlp.path.startswith('/play/'):
2129 request = compat_urllib_request.Request(url)
2130 response = compat_urllib_request.urlopen(request)
2131 redirecturl = response.geturl()
2132 rurlp = compat_urllib_parse_urlparse(redirecturl)
2133 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2134 url = 'http://blip.tv/a/a-' + file_id
2135 return self._real_extract(url)
# Request the JSON API variant of the page; the iTunes UA is required by
# the server for this endpoint (per the header set below).
2142 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2143 request = compat_urllib_request.Request(json_url)
2144 request.add_header('User-Agent', 'iTunes/10.6.1')
2145 self.report_extraction(mobj.group(1))
2148 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself instead of JSON, synthesize
# the info dict straight from the URL.
2149 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2150 basename = url.split('/')[-1]
2151 title,ext = os.path.splitext(basename)
2152 title = title.decode('UTF-8')
2153 ext = ext.replace('.', '')
2154 self.report_direct_download(title)
2159 'upload_date': None,
2164 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2165 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2166 if info is None: # Regular URL
2168 json_code_bytes = urlh.read()
2169 json_code = json_code_bytes.decode('utf-8')
2170 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2171 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2175 json_data = json.loads(json_code)
# The API wraps the payload in a 'Post' object for regular videos.
2176 if 'Post' in json_data:
2177 data = json_data['Post']
2181 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2182 video_url = data['media']['url']
2183 umobj = re.match(self._URL_EXT, video_url)
2185 raise ValueError('Can not determine filename extension')
2186 ext = umobj.group(1)
2189 'id': data['item_id'],
2191 'uploader': data['display_name'],
2192 'upload_date': upload_date,
2193 'title': data['title'],
2195 'format': data['media']['mimeType'],
2196 'thumbnail': data['thumbnailUrl'],
2197 'description': data['description'],
2198 'player_url': data['embedUrl'],
# The same UA must be used for the actual media download.
2199 'user_agent': 'iTunes/10.6.1',
2201 except (ValueError,KeyError) as err:
2202 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2208 class MyVideoIE(InfoExtractor):
2209 """Information Extractor for myvideo.de."""
2211 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2212 IE_NAME = u'myvideo'
2214 def _real_extract(self,url):
2215 mobj = re.match(self._VALID_URL, url)
2217 self._download.report_error(u'invalid URL: %s' % url)
2220 video_id = mobj.group(1)
2223 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2224 webpage = self._download_webpage(webpage_url, video_id)
2226 self.report_extraction(video_id)
2227 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2230 self._downloader.report_error(u'unable to extract media URL')
2232 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2234 mobj = re.search('<title>([^<]+)</title>', webpage)
2236 self._downloader.report_error(u'unable to extract title')
2239 video_title = mobj.group(1)
2245 'upload_date': None,
2246 'title': video_title,
# ComedyCentralIE: extracts Daily Show / Colbert Report episodes and clips via
# the MTV Networks RSS index and per-media config XML, transforming the RTMP
# URL into a plain HTTP one.
2250 class ComedyCentralIE(InfoExtractor):
2251 """Information extractor for The Daily Show and Colbert Report """
2253 # urls can be abbreviations like :thedailyshow or :colbert
2254 # urls for episodes like:
2255 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2256 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2257 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2258 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2259 |(https?://)?(www\.)?
2260 (?P<showname>thedailyshow|colbertnation)\.com/
2261 (full-episodes/(?P<episode>.*)|
2263 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2264 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; the last entry is picked as "best" below.
2267 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2269 _video_extensions = {
2277 _video_dimensions = {
# suitable() is overridden because _VALID_URL is a verbose-mode regex.
2287 def suitable(cls, url):
2288 """Receives a URL and returns True if suitable for this IE."""
2289 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2291 def report_config_download(self, episode_id, media_id):
2292 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2294 def report_index_download(self, episode_id):
2295 self.to_screen(u'%s: Downloading show index' % episode_id)
2297 def _print_formats(self, formats):
2298 print('Available formats:')
2300 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2303 def _real_extract(self, url):
2304 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2306 self._downloader.report_error(u'invalid URL: %s' % url)
# Shortname forms (:tds, :colbert, ...) are expanded to the show's
# full-episodes URL and re-matched.
2309 if mobj.group('shortname'):
2310 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2311 url = u'http://www.thedailyshow.com/full-episodes/'
2313 url = u'http://www.colbertnation.com/full-episodes/'
2314 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2315 assert mobj is not None
2317 if mobj.group('clip'):
2318 if mobj.group('showname') == 'thedailyshow':
2319 epTitle = mobj.group('tdstitle')
2321 epTitle = mobj.group('cntitle')
2324 dlNewest = not mobj.group('episode')
2326 epTitle = mobj.group('showname')
2328 epTitle = mobj.group('episode')
2330 req = compat_urllib_request.Request(url)
2331 self.report_extraction(epTitle)
2333 htmlHandle = compat_urllib_request.urlopen(req)
2334 html = htmlHandle.read()
2335 webpage = html.decode('utf-8')
2336 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2337 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Follow any redirect and re-validate the final URL (e.g. "newest episode"
# pages redirect to a concrete episode URL).
2340 url = htmlHandle.geturl()
2341 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2343 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2345 if mobj.group('episode') == '':
2346 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2348 epTitle = mobj.group('episode')
# The mgid: URI for the episode is embedded either as a Flash param or a
# "var url =" assignment.
2350 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2352 if len(mMovieParams) == 0:
2353 # The Colbert Report embeds the information in a without
2354 # a URL prefix; so extract the alternate reference
2355 # and then add the URL prefix manually.
2357 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2358 if len(altMovieParams) == 0:
2359 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2362 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2364 uri = mMovieParams[0][1]
2365 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2366 self.report_index_download(epTitle)
2368 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2369 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2370 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
# One RSS <item> per act/part of the episode; each yields its own info dict.
2375 idoc = xml.etree.ElementTree.fromstring(indexXml)
2376 itemEls = idoc.findall('.//item')
2377 for partNum,itemEl in enumerate(itemEls):
2378 mediaId = itemEl.findall('./guid')[0].text
2379 shortMediaId = mediaId.split(':')[-1]
2380 showId = mediaId.split(':')[-2].replace('.com', '')
2381 officialTitle = itemEl.findall('./title')[0].text
2382 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2384 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2385 compat_urllib_parse.urlencode({'uri': mediaId}))
2386 configReq = compat_urllib_request.Request(configUrl)
2387 self.report_config_download(epTitle, shortMediaId)
2389 configXml = compat_urllib_request.urlopen(configReq).read()
2390 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2391 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs from the config's <rendition> entries.
2394 cdoc = xml.etree.ElementTree.fromstring(configXml)
2396 for rendition in cdoc.findall('.//rendition'):
2397 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2401 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2404 if self._downloader.params.get('listformats', None):
2405 self._print_formats([i[0] for i in turls])
2408 # For now, just pick the highest bitrate
2409 format,rtmp_video_url = turls[-1]
2411 # Get the format arg from the arg stream
2412 req_format = self._downloader.params.get('format', None)
2414 # Select format if we can find one
2417 format, rtmp_video_url = f, v
# The rtmp URL's path is rehosted on a plain-HTTP CDN base, avoiding rtmpdump.
2420 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2422 raise ExtractorError(u'Cannot transform RTMP url')
2423 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2424 video_url = base + m.group('finalid')
2426 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2431 'upload_date': officialDate,
2436 'description': officialTitle,
2438 results.append(info)
# EscapistIE: extracts video URL and metadata for The Escapist by reading the
# page's og: meta tags and then fetching the player's JS configuration.
2443 class EscapistIE(InfoExtractor):
2444 """Information extractor for The Escapist """
2446 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2447 IE_NAME = u'escapist'
2449 def report_config_download(self, showName):
2450 self.to_screen(u'%s: Downloading configuration' % showName)
2452 def _real_extract(self, url):
2453 mobj = re.match(self._VALID_URL, url)
2455 self._downloader.report_error(u'invalid URL: %s' % url)
2457 showName = mobj.group('showname')
2458 videoId = mobj.group('episode')
2460 self.report_extraction(showName)
# Decode the page using the charset from the Content-Type header,
# defaulting to UTF-8 when none is declared.
2462 webPage = compat_urllib_request.urlopen(url)
2463 webPageBytes = webPage.read()
2464 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2465 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2466 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2467 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# Description, thumbnail and player URL all come from meta tags.
# NOTE(review): these searches are unguarded — a missing tag would raise
# AttributeError on .group(1); confirm whether that is acceptable here.
2470 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2471 description = unescapeHTML(descMatch.group(1))
2472 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2473 imgUrl = unescapeHTML(imgMatch.group(1))
2474 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2475 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config URL is passed to the player as a percent-encoded query param.
2476 configUrlMatch = re.search('config=(.*)$', playerUrl)
2477 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2479 self.report_config_download(showName)
2481 configJSON = compat_urllib_request.urlopen(configUrl)
2482 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2483 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2484 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2485 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2488 # Technically, it's JavaScript, not JSON
# Single quotes are swapped for double quotes so json.loads accepts it.
2489 configJSON = configJSON.replace("'", '"')
2492 config = json.loads(configJSON)
2493 except (ValueError,) as err:
2494 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
# The media URL is the second playlist entry.
2497 playlist = config['playlist']
2498 videoUrl = playlist[1]['url']
2503 'uploader': showName,
2504 'upload_date': None,
2507 'thumbnail': imgUrl,
2508 'description': description,
2509 'player_url': playerUrl,
# CollegeHumorIE: extracts video metadata from collegehumor.com via the
# moogaloop XML API, then resolves the Adobe HDS (f4m) manifest to build a
# direct segment URL.
2514 class CollegeHumorIE(InfoExtractor):
2515 """Information extractor for collegehumor.com"""
2518 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2519 IE_NAME = u'collegehumor'
2521 def report_manifest(self, video_id):
2522 """Report information extraction."""
2523 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2525 def _real_extract(self, url):
2526 mobj = re.match(self._VALID_URL, url)
2528 self._downloader.report_error(u'invalid URL: %s' % url)
2530 video_id = mobj.group('videoid')
2535 'upload_date': None,
2538 self.report_extraction(video_id)
2539 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2541 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2542 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2543 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Title, description, thumbnail and the manifest URL all come from the
# <video> element of the moogaloop response.
2546 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2548 videoNode = mdoc.findall('./video')[0]
2549 info['description'] = videoNode.findall('./description')[0].text
2550 info['title'] = videoNode.findall('./caption')[0].text
2551 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2552 manifest_url = videoNode.findall('./file')[0].text
2554 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore query arg is required by the HDS server to serve the manifest.
2557 manifest_url += '?hdcore=2.10.3'
2558 self.report_manifest(video_id)
2560 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2561 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2562 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Pull the media node id and video id from the f4m manifest (namespaced).
2565 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2567 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2568 node_id = media_node.attrib['url']
2569 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2570 except IndexError as err:
2571 self._downloader.report_error(u'Invalid manifest file')
# Build the first-segment URL from the manifest's scheme/host plus ids.
2574 url_pr = compat_urllib_parse_urlparse(manifest_url)
2575 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE: extracts the FLV URL, title and thumbnail from xvideos.com pages.
2582 class XVideosIE(InfoExtractor):
2583 """Information extractor for xvideos.com"""
2585 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2586 IE_NAME = u'xvideos'
2588 def _real_extract(self, url):
2589 mobj = re.match(self._VALID_URL, url)
2591 self._downloader.report_error(u'invalid URL: %s' % url)
2593 video_id = mobj.group(1)
2595 webpage = self._download_webpage(url, video_id)
2597 self.report_extraction(video_id)
# The media URL is percent-encoded inside the page's flashvars.
2601 mobj = re.search(r'flv_url=(.+?)&', webpage)
2603 self._downloader.report_error(u'unable to extract video url')
2605 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is taken from <title>, stripping the trailing site suffix.
2609 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2611 self._downloader.report_error(u'unable to extract video title')
2613 video_title = mobj.group(1)
2616 # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail address.
2617 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2619 self._downloader.report_error(u'unable to extract video thumbnail')
2621 video_thumbnail = mobj.group(0)
2627 'upload_date': None,
2628 'title': video_title,
2630 'thumbnail': video_thumbnail,
2631 'description': None,
# SoundcloudIE: resolves a soundcloud.com track URL to its API id via
# resolve.json, then fetches the streams endpoint for the MP3 URL.
2637 class SoundcloudIE(InfoExtractor):
2638 """Information extractor for soundcloud.com
2639 To access the media, the uid of the song and a stream token
2640 must be extracted from the page source and the script must make
2641 a request to media.soundcloud.com/crossdomain.xml. Then
2642 the media can be grabbed by requesting from an url composed
2643 of the stream token and uid
2646 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2647 IE_NAME = u'soundcloud'
2649 def report_resolve(self, video_id):
2650 """Report information extraction."""
2651 self.to_screen(u'%s: Resolving id' % video_id)
2653 def _real_extract(self, url):
2654 mobj = re.match(self._VALID_URL, url)
2656 self._downloader.report_error(u'invalid URL: %s' % url)
2659 # extract uploader (which is in the url)
2660 uploader = mobj.group(1)
2661 # extract simple title (uploader + slug of song title)
2662 slug_title = mobj.group(2)
2663 simple_title = uploader + u'-' + slug_title
2665 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json maps the public track URL to the API track record;
# the client_id is a hard-coded public API key.
2667 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2668 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2669 request = compat_urllib_request.Request(resolv_url)
2671 info_json_bytes = compat_urllib_request.urlopen(request).read()
2672 info_json = info_json_bytes.decode('utf-8')
2673 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2674 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2677 info = json.loads(info_json)
2678 video_id = info['id']
2679 self.report_extraction('%s/%s' % (uploader, slug_title))
# The streams endpoint returns the concrete media URLs for the track.
2681 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2682 request = compat_urllib_request.Request(streams_url)
2684 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2685 stream_json = stream_json_bytes.decode('utf-8')
2686 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2687 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2690 streams = json.loads(stream_json)
# Always picks the 128 kbit/s MP3 HTTP stream.
2691 mediaURL = streams['http_mp3_128_url']
2692 upload_date = unified_strdate(info['created_at'])
2697 'uploader': info['user']['username'],
2698 'upload_date': upload_date,
2699 'title': info['title'],
2701 'description': info['description'],
# SoundcloudSetIE: like SoundcloudIE but for /sets/ playlists — resolves the
# set, then fetches the streams endpoint for each contained track.
2704 class SoundcloudSetIE(InfoExtractor):
2705 """Information extractor for soundcloud.com sets
2706 To access the media, the uid of the song and a stream token
2707 must be extracted from the page source and the script must make
2708 a request to media.soundcloud.com/crossdomain.xml. Then
2709 the media can be grabbed by requesting from an url composed
2710 of the stream token and uid
2713 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2714 IE_NAME = u'soundcloud:set'
2716 def report_resolve(self, video_id):
2717 """Report information extraction."""
2718 self.to_screen(u'%s: Resolving id' % video_id)
2720 def _real_extract(self, url):
2721 mobj = re.match(self._VALID_URL, url)
2723 self._downloader.report_error(u'invalid URL: %s' % url)
2726 # extract uploader (which is in the url)
2727 uploader = mobj.group(1)
2728 # extract simple title (uploader + slug of song title)
2729 slug_title = mobj.group(2)
2730 simple_title = uploader + u'-' + slug_title
2732 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
# resolve.json maps the public set URL to the API playlist record.
2734 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2735 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2736 request = compat_urllib_request.Request(resolv_url)
2738 info_json_bytes = compat_urllib_request.urlopen(request).read()
2739 info_json = info_json_bytes.decode('utf-8')
2740 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2741 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
# API errors come back in an 'errors' list; each one is reported.
2745 info = json.loads(info_json)
2746 if 'errors' in info:
2747 for err in info['errors']:
2748 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
# One streams request per track in the set.
2751 for track in info['tracks']:
2752 video_id = track['id']
2753 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2755 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2756 request = compat_urllib_request.Request(streams_url)
2758 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2759 stream_json = stream_json_bytes.decode('utf-8')
2760 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2761 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2764 streams = json.loads(stream_json)
2765 mediaURL = streams['http_mp3_128_url']
# NOTE(review): unlike SoundcloudIE, upload_date is passed through raw
# (not run through unified_strdate) — possibly inconsistent; verify.
2770 'uploader': track['user']['username'],
2771 'upload_date': track['created_at'],
2772 'title': track['title'],
2774 'description': track['description'],
# InfoQIE: extracts the RTMPE media URL from infoq.com presentation pages; the
# real media id is base64-encoded in the page's `jsclassref` variable.
2779 class InfoQIE(InfoExtractor):
2780 """Information extractor for infoq.com"""
2781 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2783 def _real_extract(self, url):
2784 mobj = re.match(self._VALID_URL, url)
2786 self._downloader.report_error(u'invalid URL: %s' % url)
2789 webpage = self._download_webpage(url, video_id=url)
2790 self.report_extraction(url)
2793 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2795 self._downloader.report_error(u'unable to extract video url')
# The id is base64-encoded and percent-encoded; decode both layers.
2797 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2798 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2801 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2803 self._downloader.report_error(u'unable to extract video title')
2805 video_title = mobj.group(1)
2807 # Extract description
2808 video_description = u'No description available.'
2809 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2810 if mobj is not None:
2811 video_description = mobj.group(1)
# Video id and extension come from the media URL's final path component.
2813 video_filename = video_url.split('/')[-1]
2814 video_id, extension = video_filename.split('.')
2820 'upload_date': None,
2821 'title': video_title,
2822 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2824 'description': video_description,
# MixcloudIE: extracts audio URLs from mixcloud.com via its JSON API, probing
# candidate URLs until a working one is found. Marked broken (_WORKING=False).
2829 class MixcloudIE(InfoExtractor):
2830 """Information extractor for www.mixcloud.com"""
2832 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2833 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2834 IE_NAME = u'mixcloud'
2836 def report_download_json(self, file_id):
2837 """Report JSON download."""
2838 self.to_screen(u'Downloading json')
# get_urls: pick the URL list for a format, choosing the highest bitrate by
# default; some formats carry no bitrate info (plain list), handled via
# TypeError.
2840 def get_urls(self, jsonData, fmt, bitrate='best'):
2841 """Get urls from 'audio_formats' section in json"""
2844 bitrate_list = jsonData[fmt]
2845 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2846 bitrate = max(bitrate_list) # select highest
2848 url_list = jsonData[fmt][bitrate]
2849 except TypeError: # we have no bitrate info.
2850 url_list = jsonData[fmt]
# check_urls: probe candidates with a HEAD-like GET and return the first
# that answers without a network error.
2853 def check_urls(self, url_list):
2854 """Returns 1st active url from list"""
2855 for url in url_list:
2857 compat_urllib_request.urlopen(url)
2859 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2864 def _print_formats(self, formats):
2865 print('Available formats:')
2866 for fmt in formats.keys():
2867 for b in formats[fmt]:
2869 ext = formats[fmt][b][0]
2870 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2871 except TypeError: # we have no bitrate info
2872 ext = formats[fmt][0]
2873 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2876 def _real_extract(self, url):
2877 mobj = re.match(self._VALID_URL, url)
2879 self._downloader.report_error(u'invalid URL: %s' % url)
2881 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups is Python 2 str semantics;
# under Python 3 these would fail on str objects — verify target version.
2882 uploader = mobj.group(1).decode('utf-8')
2883 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2885 # construct API request
2886 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2887 # retrieve .json file with links to files
2888 request = compat_urllib_request.Request(file_url)
2890 self.report_download_json(file_url)
2891 jsonData = compat_urllib_request.urlopen(request).read()
2892 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2893 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2897 json_data = json.loads(jsonData)
2898 player_url = json_data['player_swf_url']
2899 formats = dict(json_data['audio_formats'])
2901 req_format = self._downloader.params.get('format', None)
2904 if self._downloader.params.get('listformats', None):
2905 self._print_formats(formats)
# Default/best: iterate formats until a live URL is found; otherwise use
# the explicitly requested format.
2908 if req_format is None or req_format == 'best':
2909 for format_param in formats.keys():
2910 url_list = self.get_urls(formats, format_param)
2912 file_url = self.check_urls(url_list)
2913 if file_url is not None:
2916 if req_format not in formats:
2917 self._downloader.report_error(u'format is not available')
2920 url_list = self.get_urls(formats, req_format)
2921 file_url = self.check_urls(url_list)
2922 format_param = req_format
2925 'id': file_id.decode('utf-8'),
2926 'url': file_url.decode('utf-8'),
2927 'uploader': uploader.decode('utf-8'),
2928 'upload_date': None,
2929 'title': json_data['name'],
2930 'ext': file_url.split('.')[-1].decode('utf-8'),
2931 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2932 'thumbnail': json_data['thumbnail_url'],
2933 'description': json_data['description'],
2934 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE: handles three URL shapes — a single video (course +
# video params), a course page (course only), and the site root — recursing via
# self.extract() on reference entries for the latter two.
2937 class StanfordOpenClassroomIE(InfoExtractor):
2938 """Information extractor for Stanford's Open ClassRoom"""
2940 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2941 IE_NAME = u'stanfordoc'
2943 def _real_extract(self, url):
2944 mobj = re.match(self._VALID_URL, url)
2946 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: a specific video — metadata comes from a per-video XML file.
2948 if mobj.group('course') and mobj.group('video'): # A specific video
2949 course = mobj.group('course')
2950 video = mobj.group('video')
2952 'id': course + '_' + video,
2954 'upload_date': None,
2957 self.report_extraction(info['id'])
2958 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2959 xmlUrl = baseUrl + video + '.xml'
2961 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2962 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2963 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2965 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2967 info['title'] = mdoc.findall('./title')[0].text
2968 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2970 self._downloader.report_error(u'Invalid metadata XML file')
2972 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page — scrape VideoPage links and recurse on each.
2974 elif mobj.group('course'): # A course page
2975 course = mobj.group('course')
2980 'upload_date': None,
2983 coursepage = self._download_webpage(url, info['id'],
2984 note='Downloading course info page',
2985 errnote='Unable to download course info page')
2987 m = re.search('<h1>([^<]+)</h1>', coursepage)
2989 info['title'] = unescapeHTML(m.group(1))
2991 info['title'] = info['id']
2993 m = re.search('<description>([^<]+)</description>', coursepage)
2995 info['description'] = unescapeHTML(m.group(1))
# orderedSet de-duplicates while keeping first-seen order.
2997 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3000 'type': 'reference',
3001 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3005 for entry in info['list']:
3006 assert entry['type'] == 'reference'
3007 results += self.extract(entry['url'])
# Case 3: the site root — scrape CoursePage links and recurse on each.
3011 'id': 'Stanford OpenClassroom',
3014 'upload_date': None,
3017 self.report_download_webpage(info['id'])
3018 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3020 rootpage = compat_urllib_request.urlopen(rootURL).read()
3021 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3022 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3025 info['title'] = info['id']
3027 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3030 'type': 'reference',
3031 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3036 for entry in info['list']:
3037 assert entry['type'] == 'reference'
3038 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes <meta> tags for song/performer/uri,
# then fetches a mediaGen XML playlist and picks the highest-quality rendition.
# NOTE(review): sampled paste — guard lines ("if mobj is None:", "try:", returns)
# are elided between the visible lines.
3041 class MTVIE(InfoExtractor):
3042 """Information extractor for MTV.com"""
3044 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3047 def _real_extract(self, url):
3048 mobj = re.match(self._VALID_URL, url)
3050 self._downloader.report_error(u'invalid URL: %s' % url)
# Normalize scheme-less URLs so _download_webpage gets an absolute URL.
3052 if not mobj.group('proto'):
3053 url = 'http://' + url
3054 video_id = mobj.group('videoid')
3056 webpage = self._download_webpage(url, video_id)
3058 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3060 self._downloader.report_error(u'unable to extract song name')
# NOTE(review): _download_webpage returns an already-decoded text string, so
# calling .decode('iso-8859-1') on the match will raise AttributeError on
# Python 3 — looks like a Python-2 leftover; confirm and drop the decode.
3062 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3063 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3065 self._downloader.report_error(u'unable to extract performer')
# NOTE(review): same Python-3 .decode() concern as song_name above.
3067 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3068 video_title = performer + ' - ' + song_name
3070 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads 'unable to mtvn_uri' — a verb ("extract") is
# missing; runtime string left untouched here.
3072 self._downloader.report_error(u'unable to mtvn_uri')
3074 mtvn_uri = mobj.group(1)
3076 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3078 self._downloader.report_error(u'unable to extract content id')
3080 content_id = mobj.group(1)
# mediaGen endpoint resolves the playlist id + uri into downloadable renditions.
3082 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3083 self.report_extraction(video_id)
3084 request = compat_urllib_request.Request(videogen_url)
3086 metadataXml = compat_urllib_request.urlopen(request).read()
3087 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3088 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3091 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3092 renditions = mdoc.findall('.//rendition')
3094 # For now, always pick the highest quality.
# Assumes renditions are ordered ascending by quality — TODO confirm.
3095 rendition = renditions[-1]
# Format id is built as "<ext>-<width>x<height>_<bitrate>" from the MIME type.
3098 _,_,ext = rendition.attrib['type'].partition('/')
3099 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3100 video_url = rendition.find('./src').text
3102 self._downloader.report_error('Invalid rendition field.')
3108 'uploader': performer,
3109 'upload_date': None,
3110 'title': video_title,
# Extractor for v.youku.com. Fetches a JSON playlist description, derives the
# real file id from a seed-driven character shuffle, and emits one info dict per
# video segment. NOTE(review): sampled paste — several lines are elided
# (including the "def _gen_sid(self):" header before line 3122 and the actual
# return of _get_file_ID_mix_string).
3118 class YoukuIE(InfoExtractor):
3119 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two bounded random integers.
3122 nowTime = int(time.time() * 1000)
3123 random1 = random.randint(1000,1998)
3124 random2 = random.randint(1000,9999)
3126 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic pseudo-random shuffle of a fixed alphabet, driven by `seed`
# via a linear congruential step — reproduces the site's client-side scheme.
3128 def _get_file_ID_mix_string(self, seed):
3130 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3132 for i in range(len(source)):
3133 seed = (seed * 211 + 30031 ) % 65536
3134 index = math.floor(seed / 65536 * len(source) )
3135 mixed.append(source[int(index)])
3136 source.remove(source[int(index)])
3137 #return ''.join(mixed)
# Decode the '*'-separated fileId: each number indexes into the mixed alphabet.
3140 def _get_file_id(self, fileId, seed):
3141 mixed = self._get_file_ID_mix_string(seed)
3142 ids = fileId.split('*')
3146 realId.append(mixed[int(ch)])
3147 return ''.join(realId)
3149 def _real_extract(self, url):
3150 mobj = re.match(self._VALID_URL, url)
3152 self._downloader.report_error(u'invalid URL: %s' % url)
3154 video_id = mobj.group('ID')
3156 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3158 request = compat_urllib_request.Request(info_url, None, std_headers)
3160 self.report_download_webpage(video_id)
3161 jsondata = compat_urllib_request.urlopen(request).read()
3162 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3163 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3166 self.report_extraction(video_id)
3168 jsonstr = jsondata.decode('utf-8')
3169 config = json.loads(jsonstr)
3171 video_title = config['data'][0]['title']
3172 seed = config['data'][0]['seed']
# Format selection: honor --format; 'best' prefers hd2 when available.
3174 format = self._downloader.params.get('format', None)
3175 supported_format = list(config['data'][0]['streamfileids'].keys())
3177 if format is None or format == 'best':
3178 if 'hd2' in supported_format:
3183 elif format == 'worst':
3191 fileid = config['data'][0]['streamfileids'][format]
3192 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3193 except (UnicodeDecodeError, ValueError, KeyError):
3194 self._downloader.report_error(u'unable to extract info section')
3198 sid = self._gen_sid()
3199 fileid = self._get_file_id(fileid, seed)
3201 #column 8,9 of fileid represent the segment number
3202 #fileid[7:9] should be changed
# One download URL per segment; segment index is spliced into the fileid as hex.
3203 for index, key in enumerate(keys):
3205 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3206 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3209 'id': '%s_part%02d' % (video_id, index),
3210 'url': download_url,
3212 'upload_date': None,
3213 'title': video_title,
3216 files_info.append(info)
# Extractor for video.xnxx.com: downloads the watch page and pulls the flv URL,
# title, and thumbnail out of it with three precompiled regex patterns.
# NOTE(review): sampled paste — "if ... is None:" guards and returns are elided.
3221 class XNXXIE(InfoExtractor):
3222 """Information extractor for xnxx.com"""
3224 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: player flv url, <title> prefix, and big thumbnail.
3226 VIDEO_URL_RE = r'flv_url=(.*?)&'
3227 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3228 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3230 def _real_extract(self, url):
3231 mobj = re.match(self._VALID_URL, url)
3233 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 1 is the numeric id from the URL path.
3235 video_id = mobj.group(1)
3237 self.report_download_webpage(video_id)
3239 # Get webpage content
3241 webpage_bytes = compat_urllib_request.urlopen(url).read()
3242 webpage = webpage_bytes.decode('utf-8')
3243 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3244 self._downloader.report_error(u'unable to download video webpage: %s' % err)
3247 result = re.search(self.VIDEO_URL_RE, webpage)
3249 self._downloader.report_error(u'unable to extract video url')
# The flv URL is percent-encoded in the page; unquote before use.
3251 video_url = compat_urllib_parse.unquote(result.group(1))
3253 result = re.search(self.VIDEO_TITLE_RE, webpage)
3255 self._downloader.report_error(u'unable to extract video title')
3257 video_title = result.group(1)
3259 result = re.search(self.VIDEO_THUMB_RE, webpage)
3261 self._downloader.report_error(u'unable to extract video thumbnail')
3263 video_thumbnail = result.group(1)
3269 'upload_date': None,
3270 'title': video_title,
3272 'thumbnail': video_thumbnail,
3273 'description': None,
# Extractor for Google+ video posts. Two-step scrape: (1) the post page yields
# date, uploader, title and the photo/video sub-page URL; (2) that sub-page
# lists direct googlevideo links at several resolutions, of which the highest
# is chosen. NOTE(review): sampled paste — "if mobj is None:"/"try:" lines are
# elided between the visible lines.
3277 class GooglePlusIE(InfoExtractor):
3278 """Information extractor for plus.google.com."""
3280 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3281 IE_NAME = u'plus.google'
# Progress-reporting helpers; each just echoes one extraction step to screen.
3283 def report_extract_entry(self, url):
3284 """Report downloading extry"""
3285 self.to_screen(u'Downloading entry: %s' % url)
3287 def report_date(self, upload_date):
3288 """Report downloading extry"""
3289 self.to_screen(u'Entry date: %s' % upload_date)
3291 def report_uploader(self, uploader):
3292 """Report downloading extry"""
3293 self.to_screen(u'Uploader: %s' % uploader)
3295 def report_title(self, video_title):
3296 """Report downloading extry"""
3297 self.to_screen(u'Title: %s' % video_title)
3299 def report_extract_vid_page(self, video_page):
3300 """Report information extraction."""
3301 self.to_screen(u'Extracting video page: %s' % video_page)
3303 def _real_extract(self, url):
3304 # Extract id from URL
3305 mobj = re.match(self._VALID_URL, url)
3307 self._downloader.report_error(u'Invalid URL: %s' % url)
# group(0) is the whole matched post URL; group(1) is the post id.
3310 post_url = mobj.group(0)
3311 video_id = mobj.group(1)
3313 video_extension = 'flv'
3315 # Step 1, Retrieve post webpage to extract further information
3316 self.report_extract_entry(post_url)
3317 request = compat_urllib_request.Request(post_url)
3319 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3320 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3321 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3324 # Extract update date
3326 pattern = 'title="Timestamp">(.*?)</a>'
3327 mobj = re.search(pattern, webpage)
3329 upload_date = mobj.group(1)
3330 # Convert timestring to a format suitable for filename
# Assumes the timestamp is exactly "%Y-%m-%d"; strptime raises otherwise.
3331 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3332 upload_date = upload_date.strftime('%Y%m%d')
3333 self.report_date(upload_date)
3337 pattern = r'rel\="author".*?>(.*?)</a>'
3338 mobj = re.search(pattern, webpage)
3340 uploader = mobj.group(1)
3341 self.report_uploader(uploader)
3344 # Get the first line for title
3346 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3347 mobj = re.search(pattern, webpage)
3349 video_title = mobj.group(1)
3350 self.report_title(video_title)
3352 # Step 2, Stimulate clicking the image box to launch video
3353 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3354 mobj = re.search(pattern, webpage)
3356 self._downloader.report_error(u'unable to extract video page URL')
3358 video_page = mobj.group(1)
3359 request = compat_urllib_request.Request(video_page)
3361 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3362 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3363 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3365 self.report_extract_vid_page(video_page)
3368 # Extract video links on video page
3369 """Extract video links of all sizes"""
3370 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3371 mobj = re.findall(pattern, webpage)
3373 self._downloader.report_error(u'unable to extract video links')
3375 # Sort in resolution
# Tuples sort by their first element (the resolution string captured above).
3376 links = sorted(mobj)
3378 # Choose the lowest of the sort, i.e. highest resolution
3379 video_url = links[-1]
3380 # Only get the url. The resolution part in the tuple has no use anymore
3381 video_url = video_url[-1]
3382 # Treat escaped \u0026 style hex
# Py2/Py3 split: str.decode exists only on Python 2; the AttributeError branch
# re-encodes and unicode-escapes on Python 3.
3384 video_url = video_url.decode("unicode_escape")
3385 except AttributeError: # Python 3
3386 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3392 'uploader': uploader,
3393 'upload_date': upload_date,
3394 'title': video_title,
3395 'ext': video_extension,
# Extractor for nba.com video pages: the direct mp4 URL is derived from the URL
# path itself; title/date/description are scraped from the page with a small
# local helper. NOTE(review): sampled paste — guard lines and the helper's
# "if m:" / "else: return default" branches are elided.
3398 class NBAIE(InfoExtractor):
3399 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3402 def _real_extract(self, url):
3403 mobj = re.match(self._VALID_URL, url)
3405 self._downloader.report_error(u'invalid URL: %s' % url)
3408 video_id = mobj.group(1)
# Strip a trailing "/index.html" so the CDN path is built from the bare id.
3409 if video_id.endswith('/index.html'):
3410 video_id = video_id[:-len('/index.html')]
3412 webpage = self._download_webpage(url, video_id)
3414 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex capture from the page, unescaped, or `default`.
3415 def _findProp(rexp, default=None):
3416 m = re.search(rexp, webpage)
3418 return unescapeHTML(m.group(1))
3422 shortened_video_id = video_id.rpartition('/')[2]
3423 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3425 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' (the field
# name every other extractor here uses) — confirm and fix in a follow-up.
3429 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3430 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv. Talks to the Justin.tv JSON API, paging
# through results _JUSTIN_PAGE_LIMIT at a time (channel archives) or fetching a
# single broadcast. NOTE(review): sampled paste — "try:"/"if" lines and the
# loop header around the paging code are elided.
3434 class JustinTVIE(InfoExtractor):
3435 """Information extractor for justin.tv and twitch.tv"""
3436 # TODO: One broadcast may be split into multiple videos. The key
3437 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3438 # starts at 1 and increases. Can we treat all parts as one video?
3440 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3441 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3442 _JUSTIN_PAGE_LIMIT = 100
3443 IE_NAME = u'justin.tv'
3445 def report_download_page(self, channel, offset):
3446 """Report attempt to download a single page of videos."""
3447 self.to_screen(u'%s: Downloading video information from %d to %d' %
3448 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3450 # Return count of items, list of *valid* items
3451 def _parse_page(self, url):
3453 urlh = compat_urllib_request.urlopen(url)
3454 webpage_bytes = urlh.read()
# 'ignore' drops undecodable bytes rather than failing the whole page.
3455 webpage = webpage_bytes.decode('utf-8', 'ignore')
3456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3457 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3460 response = json.loads(webpage)
# The API returns a list on success and an error object otherwise.
3461 if type(response) != list:
3462 error_text = response.get('error', 'unknown error')
3463 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3466 for clip in response:
3467 video_url = clip['video_file_url']
3469 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; first 10 chars are YYYY-MM-DD, dashes stripped.
3470 video_date = re.sub('-', '', clip['start_time'][:10])
3471 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3472 video_id = clip['id']
3473 video_title = clip.get('title', video_id)
3477 'title': video_title,
3478 'uploader': clip.get('channel_name', video_uploader_id),
3479 'uploader_id': video_uploader_id,
3480 'upload_date': video_date,
3481 'ext': video_extension,
3483 return (len(response), info)
3485 def _real_extract(self, url):
3486 mobj = re.match(self._VALID_URL, url)
3488 self._downloader.report_error(u'invalid URL: %s' % url)
3491 api = 'http://api.justin.tv'
# lastindex == 2 means the /b/<id> group matched, i.e. a single broadcast.
3492 video_id = mobj.group(mobj.lastindex)
3494 if mobj.lastindex == 1:
3496 api += '/channel/archives/%s.json'
3498 api += '/broadcast/by_archive/%s.json'
3499 api = api % (video_id,)
3501 self.report_extraction(video_id)
3505 limit = self._JUSTIN_PAGE_LIMIT
# Paging loop: fetch until a short page (fewer than `limit` items) comes back.
3508 self.report_download_page(video_id, offset)
3509 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3510 page_count, page_info = self._parse_page(page_url)
3511 info.extend(page_info)
3512 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from the <video>/<source> markup,
# title from the player h1 (falling back to <title>), description from the
# og:description meta tag. NOTE(review): sampled paste — "if ... is None:"
# guards are elided between the visible lines.
3517 class FunnyOrDieIE(InfoExtractor):
3518 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3520 def _real_extract(self, url):
3521 mobj = re.match(self._VALID_URL, url)
3523 self._downloader.report_error(u'invalid URL: %s' % url)
3526 video_id = mobj.group('id')
3527 webpage = self._download_webpage(url, video_id)
# DOTALL so the match spans the newlines between the nested <source> tags.
3529 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3531 self._downloader.report_error(u'unable to find video information')
3532 video_url = unescapeHTML(m.group('url'))
3534 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback title source when the player h1 is absent.
3536 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3538 self._downloader.report_error(u'Cannot find video title')
3539 title = clean_html(m.group('title'))
3541 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3543 desc = unescapeHTML(m.group('desc'))
3552 'description': desc,
# Extractor for Steam store video pages. The _VALID_URL is a verbose (re.X)
# pattern, so suitable() is overridden to pass re.VERBOSE. Produces a playlist
# of all movies found on the (age-gate-bypassed) app page.
# NOTE(review): sampled paste — parts of the verbose regex (including the
# gameID group) are elided between the visible lines.
3556 class SteamIE(InfoExtractor):
3557 _VALID_URL = r"""http://store.steampowered.com/
3559 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3561 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3565 def suitable(cls, url):
3566 """Receives a URL and returns True if suitable for this IE."""
3567 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3569 def _real_extract(self, url):
3570 m = re.match(self._VALID_URL, url, re.VERBOSE)
3571 gameID = m.group('gameID')
# Age-gate bypass: request the agecheck URL with a fixed 1970 birth date.
3572 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3573 self.report_age_confirmation()
3574 webpage = self._download_webpage(videourl, gameID)
3575 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3577 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3578 mweb = re.finditer(urlRE, webpage)
3579 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3580 titles = re.finditer(namesRE, webpage)
3581 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3582 thumbs = re.finditer(thumbsRE, webpage)
# zip truncates to the shortest iterator — assumes the three patterns match
# in lockstep across the page; TODO confirm.
3584 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3585 video_id = vid.group('videoID')
3586 title = vtitle.group('videoName')
3587 video_url = vid.group('videoURL')
3588 video_thumb = thumb.group('thumbnail')
3590 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3595 'title': unescapeHTML(title),
3596 'thumbnail': video_thumb
3599 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for ustream.tv recorded videos: the flv URL is derived directly
# from the numeric video id; title and uploader come from page attributes.
3601 class UstreamIE(InfoExtractor):
3602 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3603 IE_NAME = u'ustream'
3605 def _real_extract(self, url):
3606 m = re.match(self._VALID_URL, url)
3607 video_id = m.group('videoID')
# CDN URL is fully determined by the video id — no page parsing needed for it.
3608 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3609 webpage = self._download_webpage(url, video_id)
# NOTE(review): both searches assume a match; m.group() would raise
# AttributeError on a page-layout change — no None-guards are visible here.
3610 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3611 title = m.group('title')
3612 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3613 uploader = m.group('uploader')
3619 'uploader': uploader
# Extractor for worldstarhiphop.com / worldstarcandy.com: finds the direct
# mp4/flv URL in the page source and scrapes title + thumbnail, with a special
# title fallback for "candy" pages. NOTE(review): sampled paste — "try:",
# "else:" and ext-assignment lines are elided between the visible lines.
3623 class WorldStarHipHopIE(InfoExtractor):
3624 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3625 IE_NAME = u'WorldStarHipHop'
3627 def _real_extract(self, url):
# Matches a hosted media URL ending in mp4 or flv.
3628 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3630 webpage_src = compat_urllib_request.urlopen(url).read()
3631 webpage_src = webpage_src.decode('utf-8')
3633 mobj = re.search(_src_url, webpage_src)
3635 m = re.match(self._VALID_URL, url)
3636 video_id = m.group('id')
3638 if mobj is not None:
3639 video_url = mobj.group()
# Extension choice (mp4 vs flv) — the assignment lines are elided here.
3640 if 'mp4' in video_url:
3645 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3648 _title = r"""<title>(.*)</title>"""
3650 mobj = re.search(_title, webpage_src)
3652 if mobj is not None:
3653 title = mobj.group(1)
# Fallback title when <title> is missing. NOTE(review): "World Start Hip Hop"
# is presumably a typo for "World Star Hip Hop" — runtime string left as-is.
3655 title = 'World Start Hip Hop - %s' % time.ctime()
3657 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3658 mobj = re.search(_thumbnail, webpage_src)
3660 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3661 if mobj is not None:
3662 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span instead.
3664 _title = r"""candytitles.*>(.*)</span>"""
3665 mobj = re.search(_title, webpage_src)
3666 if mobj is not None:
3667 title = mobj.group(1)
3674 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: all metadata comes from a JSON blob
# embedded in an inline <script> (window.gon), plus a fixed-bitrate query
# appended to the akamai URL.
3679 class RBMARadioIE(InfoExtractor):
3680 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3682 def _real_extract(self, url):
3683 m = re.match(self._VALID_URL, url)
3684 video_id = m.group('videoID')
3686 webpage = self._download_webpage(url, video_id)
3687 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3689 raise ExtractorError(u'Cannot find metadata')
3690 json_data = m.group(1)
3693 data = json.loads(json_data)
3694 except ValueError as e:
3695 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbps rendition by appending the cbr parameter.
3697 video_url = data['akamai_url'] + '&cbr=256'
3698 url_parts = compat_urllib_parse_urlparse(video_url)
# Extension = text after the last '.' of the URL path.
3699 video_ext = url_parts.path.rpartition('.')[2]
3704 'title': data['title'],
3705 'description': data.get('teaser_text'),
3706 'location': data.get('country_of_origin'),
3707 'uploader': data.get('host', {}).get('name'),
3708 'uploader_id': data.get('host', {}).get('slug'),
3709 'thumbnail': data.get('image', {}).get('large_url_2x'),
3710 'duration': data.get('duration'),
# Extractor for youporn.com. Bypasses the age gate via a cookie, scrapes
# title/date/uploader, enumerates every download link into a formats list, and
# then applies the user's --format selection (best/worst/all/specific).
# NOTE(review): sampled paste — guards, sorting of `formats`, and several
# dict lines are elided between the visible lines.
3715 class YouPornIE(InfoExtractor):
3716 """Information extractor for youporn.com."""
3717 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3719 def _print_formats(self, formats):
3720 """Print all available formats"""
3721 print(u'Available formats:')
3722 print(u'ext\t\tformat')
3723 print(u'---------------------------------')
3724 for format in formats:
3725 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single format dict whose 'format' field equals req_format
# (loop header and fallthrough are elided here).
3727 def _specific(self, req_format, formats):
3729 if(x["format"]==req_format):
3733 def _real_extract(self, url):
3734 mobj = re.match(self._VALID_URL, url)
3736 self._downloader.report_error(u'invalid URL: %s' % url)
3739 video_id = mobj.group('videoid')
# Age gate: the site only checks for this cookie.
3741 req = compat_urllib_request.Request(url)
3742 req.add_header('Cookie', 'age_verified=1')
3743 webpage = self._download_webpage(req, video_id)
3745 # Get the video title
3746 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3748 raise ExtractorError(u'Unable to extract video title')
3749 video_title = result.group('title').strip()
3751 # Get the video date
3752 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3754 self._downloader.report_warning(u'unable to extract video date')
3757 upload_date = unified_strdate(result.group('date').strip())
3759 # Get the video uploader
3760 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3762 self._downloader.report_warning(u'unable to extract uploader')
3763 video_uploader = None
3765 video_uploader = result.group('uploader').strip()
3766 video_uploader = clean_html( video_uploader )
3768 # Get all of the formats available
3769 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3770 result = re.search(DOWNLOAD_LIST_RE, webpage)
3772 raise ExtractorError(u'Unable to extract download list')
3773 download_list_html = result.group('download_list').strip()
3775 # Get all of the links from the page
3776 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3777 links = re.findall(LINK_RE, download_list_html)
3778 if(len(links) == 0):
3779 raise ExtractorError(u'ERROR: no known formats available for video')
3781 self.to_screen(u'Links found: %d' % len(links))
3786 # A link looks like this:
3787 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3788 # A path looks like this:
3789 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Per-link: decode entities, then parse size/bitrate out of the path's 5th
# segment (e.g. "480p_370k_...") to label the format.
3790 video_url = unescapeHTML( link )
3791 path = compat_urllib_parse_urlparse( video_url ).path
3792 extension = os.path.splitext( path )[1][1:]
3793 format = path.split('/')[4].split('_')[:2]
3796 format = "-".join( format )
3797 title = u'%s-%s-%s' % (video_title, size, bitrate)
3802 'uploader': video_uploader,
3803 'upload_date': upload_date,
3808 'description': None,
# Format selection: list, best (first), worst (last), all, or exact match.
3812 if self._downloader.params.get('listformats', None):
3813 self._print_formats(formats)
3816 req_format = self._downloader.params.get('format', None)
3817 self.to_screen(u'Format: %s' % req_format)
3819 if req_format is None or req_format == 'best':
3821 elif req_format == 'worst':
3822 return [formats[-1]]
3823 elif req_format in ('-1', 'all'):
3826 format = self._specific( req_format, formats )
3828 self._downloader.report_error(u'requested format not available')
# Extractor for pornotube.com: video id and title come straight from the URL;
# the flv URL and upload date are scraped from the page.
# NOTE(review): sampled paste — "if result is None:" guards are elided.
3834 class PornotubeIE(InfoExtractor):
3835 """Information extractor for pornotube.com."""
3836 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3838 def _real_extract(self, url):
3839 mobj = re.match(self._VALID_URL, url)
3841 self._downloader.report_error(u'invalid URL: %s' % url)
3844 video_id = mobj.group('videoid')
# Title is taken from the URL slug, not the page.
3845 video_title = mobj.group('title')
3847 # Get webpage content
3848 webpage = self._download_webpage(url, video_id)
3851 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3852 result = re.search(VIDEO_URL_RE, webpage)
3854 self._downloader.report_error(u'unable to extract video url')
3856 video_url = compat_urllib_parse.unquote(result.group('url'))
3858 #Get the uploaded date
3859 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3860 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this failure path is about the upload date, but the message
# says "unable to extract video title" — copy/paste error; runtime string
# left untouched here.
3862 self._downloader.report_error(u'unable to extract video title')
3864 upload_date = unified_strdate(result.group('date'))
3866 info = {'id': video_id,
3869 'upload_date': upload_date,
3870 'title': video_title,
# Extractor for youjizz.com: title from the watch page, then a hop to the
# embed page where the real file URL is stored in a flash addVariable call.
# NOTE(review): sampled paste — "if result is None:" guards are elided.
3876 class YouJizzIE(InfoExtractor):
3877 """Information extractor for youjizz.com."""
3878 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3880 def _real_extract(self, url):
3881 mobj = re.match(self._VALID_URL, url)
3883 self._downloader.report_error(u'invalid URL: %s' % url)
3886 video_id = mobj.group('videoid')
3888 # Get webpage content
3889 webpage = self._download_webpage(url, video_id)
3891 # Get the video title
3892 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3894 raise ExtractorError(u'ERROR: unable to extract video title')
3895 video_title = result.group('title').strip()
3897 # Get the embed page
3898 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3900 raise ExtractorError(u'ERROR: unable to extract embed page')
# The embed page's numeric id replaces the slug id from the watch URL.
3902 embed_page_url = result.group(0).strip()
3903 video_id = result.group('videoid')
3905 webpage = self._download_webpage(embed_page_url, video_id)
# The direct file URL is passed to the flash player via addVariable("file", ...).
3908 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3910 raise ExtractorError(u'ERROR: unable to extract video url')
3911 video_url = result.group('source')
3913 info = {'id': video_id,
3915 'title': video_title,
3918 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the playlist
# page, opens a random play session, and walks the /play → /next API until
# the last track, collecting one entry per song.
# NOTE(review): sampled paste — the mix_id assignment, the res list setup, and
# the loop's break are elided; mix_id is used below but not visibly defined.
3922 class EightTracksIE(InfoExtractor):
3924 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3926 def _real_extract(self, url):
3927 mobj = re.match(self._VALID_URL, url)
3929 raise ExtractorError(u'Invalid URL: %s' % url)
3930 playlist_id = mobj.group('id')
3932 webpage = self._download_webpage(url, playlist_id)
3934 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3936 raise ExtractorError(u'Cannot find trax information')
3937 json_like = m.group(1)
3938 data = json.loads(json_like)
# Random session token; the API only requires it to be consistent per walk.
3940 session = str(random.randint(0, 1000000000))
3942 track_count = data['tracks_count']
3943 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3944 next_url = first_url
# One API call per track; the loop ends when at_last_track is reported.
3946 for i in itertools.count():
3947 api_json = self._download_webpage(next_url, playlist_id,
3948 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3949 errnote=u'Failed to download song information')
3950 api_data = json.loads(api_json)
3951 track_data = api_data[u'set']['track']
3953 'id': track_data['id'],
3954 'url': track_data['track_file_stream_url'],
3955 'title': track_data['performer'] + u' - ' + track_data['name'],
3956 'raw_title': track_data['name'],
3957 'uploader_id': data['user']['login'],
3961 if api_data['set']['at_last_track']:
3963 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: video and thumbnail URLs are derived directly from
# the video id; title and uploader are scraped from the page.
3966 class KeekIE(InfoExtractor):
3967 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3970 def _real_extract(self, url):
3971 m = re.match(self._VALID_URL, url)
3972 video_id = m.group('videoID')
# CDN URLs are fully determined by the id; no page parsing needed for them.
3973 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3974 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3975 webpage = self._download_webpage(url, video_id)
# NOTE(review): both searches assume a match; a layout change would raise
# AttributeError on m.group — no None-guards are visible here.
3976 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3977 title = unescapeHTML(m.group('title'))
3978 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3979 uploader = clean_html(m.group('uploader'))
3985 'thumbnail': thumbnail,
3986 'uploader': uploader
# Extractor for ted.com talks and playlists. A verbose _VALID_URL
# distinguishes playlist URLs from single-talk URLs; suitable() is overridden
# to pass re.VERBOSE. Talk pages embed a talkDetails JS object from which the
# mediaSlug (→ direct mp4 URL) is read.
# NOTE(review): sampled paste — alternation bars and closing parts of the
# verbose regexes are elided between the visible lines.
3990 class TEDIE(InfoExtractor):
3991 _VALID_URL=r'''http://www.ted.com/
3993 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3995 ((?P<type_talk>talks)) # We have a simple talk
3997 /(?P<name>\w+) # Here goes the name and then ".html"
4001 def suitable(cls, url):
4002 """Receives a URL and returns True if suitable for this IE."""
4003 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4005 def _real_extract(self, url):
4006 m=re.match(self._VALID_URL, url, re.VERBOSE)
4007 if m.group('type_talk'):
4008 return [self._talk_info(url)]
# Otherwise it is a playlist URL.
4010 playlist_id=m.group('playlist_id')
4011 name=m.group('name')
4012 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4013 return [self._playlist_videos_info(url,name,playlist_id)]
4015 def _talk_video_link(self,mediaSlug):
4016 '''Returns the video link for that mediaSlug'''
4017 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4019 def _playlist_videos_info(self,url,name,playlist_id=0):
4020 '''Returns the videos of the playlist'''
# NOTE(review): the class [.\s] matches only a literal dot or whitespace —
# presumably [\s\S] ("anything") was intended; confirm against live pages.
4022 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4023 ([.\s]*?)data-playlist_item_id="(\d+)"
4024 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4026 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4027 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4028 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4029 m_names=re.finditer(video_name_RE,webpage)
4031 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4032 m_playlist = re.search(playlist_RE, webpage)
4033 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is deferred to the TED extractor as a url_result.
4035 playlist_entries = []
4036 for m_video, m_name in zip(m_videos,m_names):
4037 video_id=m_video.group('video_id')
4038 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4039 playlist_entries.append(self.url_result(talk_url, 'TED'))
4040 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4042 def _talk_info(self, url, video_id=0):
4043 """Return the video for the talk in the url"""
4044 m=re.match(self._VALID_URL, url,re.VERBOSE)
4045 videoName=m.group('name')
4046 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4047 # If the url includes the language we get the title translated
4048 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4049 title=re.search(title_RE, webpage).group('title')
4050 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4051 "id":(?P<videoID>[\d]+).*?
4052 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4053 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4054 thumb_match=re.search(thumb_RE,webpage)
4055 info_match=re.search(info_RE,webpage,re.VERBOSE)
4056 video_id=info_match.group('videoID')
4057 mediaSlug=info_match.group('mediaSlug')
4058 video_url=self._talk_video_link(mediaSlug)
4064 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: the video id is the last (or second-to-last, when
# there's a trailing slash) path segment; everything else comes from the
# site's XML metadata endpoint. NOTE(review): sampled paste — "if/else"
# headers and default-value assignments are elided between the visible lines.
4068 class MySpassIE(InfoExtractor):
4069 _VALID_URL = r'http://www.myspass.de/.*'
4071 def _real_extract(self, url):
4072 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4074 # video id is the last path element of the URL
4075 # usually there is a trailing slash, so also try the second but last
4076 url_path = compat_urllib_parse_urlparse(url).path
4077 url_parent_path, video_id = os.path.split(url_path)
# Trailing-slash case: split() returned an empty last element, retry on parent.
4079 _, video_id = os.path.split(url_parent_path)
4082 metadata_url = META_DATA_URL_TEMPLATE % video_id
4083 metadata_text = self._download_webpage(metadata_url, video_id)
# Re-encode before parsing: fromstring expects bytes for the encoding decl.
4084 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4086 # extract values from metadata
4087 url_flv_el = metadata.find('url_flv')
4088 if url_flv_el is None:
4089 self._downloader.report_error(u'unable to extract download url')
4091 video_url = url_flv_el.text
4092 extension = os.path.splitext(video_url)[1][1:]
4093 title_el = metadata.find('title')
4094 if title_el is None:
4095 self._downloader.report_error(u'unable to extract title')
4097 title = title_el.text
# format/description/thumbnail are optional in the XML; each falls back when
# its element is missing (fallback lines elided here).
4098 format_id_el = metadata.find('format_id')
4099 if format_id_el is None:
4102 format = format_id_el.text
4103 description_el = metadata.find('description')
4104 if description_el is not None:
4105 description = description_el.text
4108 imagePreview_el = metadata.find('imagePreview')
4109 if imagePreview_el is not None:
4110 thumbnail = imagePreview_el.text
4119 'thumbnail': thumbnail,
4120 'description': description
# Extractor for spiegel.de videos: scrapes the title from the HTML page, then
# downloads a per-video XML descriptor whose last entry carries the filename
# and duration used to build the final media URL.
# NOTE(review): elided view -- e.g. original line 4133 (presumably the
# `if m is None:` guard before the raise) and 4148-4151 (start of the
# returned info dict) are missing; confirm against the full file.
4124 class SpiegelIE(InfoExtractor):
4125 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4127 def _real_extract(self, url):
4128 m = re.match(self._VALID_URL, url)
4129 video_id = m.group('videoID')
4131 webpage = self._download_webpage(url, video_id)
4132 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
# (elided: match-failure guard before the raise)
4134 raise ExtractorError(u'Cannot find title')
4135 video_title = unescapeHTML(m.group(1))
# the flash XML descriptor lists the available formats for this video id
4137 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4138 xml_code = self._download_webpage(xml_url, video_id,
4139 note=u'Downloading XML', errnote=u'Failed to download XML')
4141 idoc = xml.etree.ElementTree.fromstring(xml_code)
# take the last child element -- presumably the highest-quality format;
# TODO confirm against the XML schema
4142 last_type = idoc[-1]
4143 filename = last_type.findall('./filename')[0].text
4144 duration = float(last_type.findall('./duration')[0].text)
4146 video_url = 'http://video2.spiegel.de/flash/' + filename
# extension = everything after the last dot in the filename
4147 video_ext = filename.rpartition('.')[2]
# (elided: original lines 4148-4151 -- start of the returned info dict)
4152 'title': video_title,
4153 'duration': duration,
# Extractor for liveleak.com: pulls the direct file URL from an inline
# `file: "..."` JS assignment and the title/description/uploader from
# og: meta tags and page markup.
# NOTE(review): elided view -- the `is None` guards and early returns around
# the report_error calls, plus most of the returned info dict (original
# lines 4192-4199 etc.), are missing here; confirm against the full file.
4157 class LiveLeakIE(InfoExtractor):
4159 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4160 IE_NAME = u'liveleak'
4162 def _real_extract(self, url):
4163 mobj = re.match(self._VALID_URL, url)
# (elided: None-check guard before reporting an invalid URL)
4165 self._downloader.report_error(u'invalid URL: %s' % url)
4168 video_id = mobj.group('video_id')
4170 webpage = self._download_webpage(url, video_id)
# direct media URL embedded in the player JS
4172 m = re.search(r'file: "(.*?)",', webpage)
4174 self._downloader.report_error(u'unable to find video url')
4176 video_url = m.group(1)
4178 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4180 self._downloader.report_error(u'Cannot find video title')
# strip the site prefix from the og:title value
4181 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4183 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4185 desc = unescapeHTML(m.group('desc'))
# uploader scraped from the "By: <name></a>" page markup
4189 m = re.search(r'By:.*?(\w+)</a>', webpage)
4191 uploader = clean_html(m.group(1))
# (elided: start of the returned info dict)
4200 'description': desc,
4201 'uploader': uploader
# Extractor for ARD Mediathek / daserste.de: the video id comes either from a
# documentId= query parameter or from the last URL path component; available
# streams are scraped from mediaCollection.addMediaStream(...) JS calls, and
# the highest-quality media_type-0 stream is picked.
# NOTE(review): elided view -- the if/else branches around the two id sources,
# the empty-streams branch before the "fsk" assert, the else-header for the
# HTTP case, and the final `return` of `info` are missing from this chunk;
# confirm against the full file.
4206 class ARDIE(InfoExtractor):
4207 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4208 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4209 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4211 def _real_extract(self, url):
4212 # determine video id from url
4213 m = re.match(self._VALID_URL, url)
4215 numid = re.search(r'documentId=([0-9]+)', url)
# (elided: branch selecting between the numeric documentId ...)
4217 video_id = numid.group(1)
# (... and the path component from _VALID_URL)
4219 video_id = m.group('video_id')
4221 # determine title and media streams from webpage
4222 html = self._download_webpage(url, video_id)
4223 title = re.search(self._TITLE, html).group('title')
# one dict per addMediaStream(...) call found in the page
4224 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# no streams found: age-restricted ("fsk") pages only expose them after 8 pm
4226 assert '"fsk"' in html
4227 self._downloader.report_error(u'this video is only available after 8:00 pm')
4230 # choose default media type and highest quality for now
4231 stream = max([s for s in streams if int(s["media_type"]) == 0],
4232 key=lambda s: int(s["quality"]))
4234 # there's two possibilities: RTMP stream or HTTP download
4235 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4236 if stream['rtmp_url']:
4237 self.to_screen(u'RTMP download detected')
4238 assert stream['video_url'].startswith('mp4:')
4239 info["url"] = stream["rtmp_url"]
4240 info["play_path"] = stream['video_url']
# (elided: else-branch header for the plain HTTP download case)
4242 assert stream["video_url"].endswith('.mp4')
4243 info["url"] = stream["video_url"]
# Extractor for tumblr video posts: rebuilds a canonical post URL, then
# scrapes the escaped (\x22-quoted) player markup for the video file URL and
# extension, the first poster thumbnail, and the page title.
# NOTE(review): elided view -- the no-video guard/return after the search
# (original lines 4259/4261) and the tail of the returned info dict
# (original lines 4274+) are missing; confirm against the full file.
4246 class TumblrIE(InfoExtractor):
4247 _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4249 def _real_extract(self, url):
4250 m_url = re.match(self._VALID_URL, url)
4251 video_id = m_url.group('id')
4252 blog = m_url.group('blog_name')
# canonical post URL rebuilt from blog name and post id
4254 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4255 webpage = self._download_webpage(url, video_id)
# quotes inside the embedded player markup are escaped as \x22
4257 re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4258 video = re.search(re_video, webpage)
# (elided: guard for a missing match before reporting)
4260 self.to_screen("No video founded")
4262 video_url = video.group('video_url')
4263 ext = video.group('ext')
4265 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
# strip the JS backslash-escapes from the thumbnail URL
4266 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4268 # The only place where you can get a title, it's not complete,
4269 # but searching in other places doesn't work for all videos
4270 re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
4271 title = unescapeHTML(re.search(re_title, webpage).group('title'))
# (elided: remainder of the returned info dict, original lines 4274+)
4273 return [{'id': video_id,
# Registry of all supported extractors, instantiated in matching-priority
# order (the first IE whose suitable() accepts a URL handles it).
# NOTE(review): elided view -- the docstring terminator, the `return [` line
# and almost all list entries (original lines 4284-4337) are missing from
# this chunk; confirm against the full file.
4281 def gen_extractors():
4282 """ Return a list of an instance of every supported extractor.
4283 The order does matter; the first extractor matched is the one handling the URL.
4286 YoutubePlaylistIE(),
4311 StanfordOpenClassroomIE(),
4321 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Resolve an IE short name to its class.

    Looks up the module-level class named ``<ie_name>IE`` (e.g. ``'Youtube'``
    resolves to ``YoutubeIE``) and returns it. Raises KeyError when no such
    extractor class exists in this module.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]