2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
def __init__(self, downloader=None):
    """Create the extractor and, if given, attach *downloader* to it."""
    self.set_downloader(downloader)
def suitable(self, url):
    """Return True when this extractor can handle *url*."""
    match = re.match(self._VALID_URL, url)
    return match is not None
# NOTE(review): the `def` headers for the two methods below were lost in
# extraction; only a docstring line each and one body line remain. They look
# like the _WORKING getter and an initialize() wrapper -- TODO confirm
# against the full file.
"""Getter method for _WORKING."""

"""Initializes an instance (authentication, etc)."""
self._real_initialize()
def extract(self, url):
    """Run the subclass extraction for *url* and return its info dicts."""
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Remember *downloader* as the FileDownloader this IE reports to."""
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""

def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""

# NOTE(review): the enclosing definition for this `return` was lost in
# extraction. It strips the trailing "IE" from the class name, so it
# presumably belongs to an IE_NAME property -- TODO confirm.
return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """Open url_or_request and return the response handle.

    NOTE(review): guard lines (``if note is None:``, ``if errnote is
    None:``) and the ``try:`` opener appear to have been lost in
    extraction; the code below is not syntactically complete.
    """
    note = u'Downloading video webpage'
    self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
    return compat_urllib_request.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        errnote = u'Unable to download webpage'
        # Re-raise as ExtractorError, keeping the original traceback.
        raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """Fetch the page and return its body decoded as UTF-8 text."""
    handle = self._request_webpage(url_or_request, video_id, note, errnote)
    raw = handle.read()
    # Undecodable byte sequences are replaced rather than raising.
    return raw.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the assignment line opening this verbose regex
    # (``_VALID_URL = r"""...``) and some of its alternates were lost in
    # extraction; the surviving fragment is kept as-is.
    (?:https?://)?                                       # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/)                                # the various hostnames, with wildcard subdomains
    (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
    (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
    (?:                                                  # the various things that can precede the ID:
        (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
        |(?:                                             # or the v= param in all its forms
            (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
            (?:\?|\#!?)                                  # the params delimiter ? or # or #!
            (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
        )?                                               # optional -> youtube.com/xxxx is OK
    )?                                                   # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
    (?(1).+)?                                            # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): most itag -> extension entries of this dict were lost in
    # extraction; only one survives.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): the entries and closing brace of this dict were lost in
    # extraction.
    _video_dimensions = {
def suitable(self, url):
    """Return True when this extractor can handle *url*."""
    # _VALID_URL is written with embedded comments, hence re.VERBOSE.
    return re.match(self._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles."""  # fixed copy-pasted docstring
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

def report_video_subtitles_request(self, video_id, lang):
    """Report attempt to download subtitles for one language."""  # fixed copy-pasted docstring
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for lang: %s' % (video_id,lang))

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available."""  # fixed copy-pasted docstring
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
def _get_available_subtitles(self, video_id):
    """Fetch the closed-caption track list for video_id.

    Returns a ``(warning_message, None)`` pair on failure.
    NOTE(review): the ``try:`` opener and the final success ``return`` of
    the lang map appear to have been lost in extraction.
    """
    request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
    sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
    # Build a lang_code -> human-readable-name mapping from the track list XML.
    sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
    if not sub_lang_list:
        return (u'WARNING: video has no closed captions', None)
def _request_subtitle(self, sub_lang, sub_name, video_id, format):
    """Download one subtitle track; returns warning/success tuples.

    NOTE(review): the urlencode() dict entries, the ``try:`` opener and
    the emptiness check guarding the 'Did not fetch' warning were lost in
    extraction.
    """
    self.report_video_subtitles_request(video_id, sub_lang)
    params = compat_urllib_parse.urlencode({
    url = 'http://www.youtube.com/api/timedtext?' + params
    sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
    return (u'WARNING: Did not fetch video subtitles', None)
    # Success: (no error, language code, subtitle data).
    return (None, sub_lang, sub)
def _extract_subtitle(self, video_id):
    """Fetch the single preferred subtitle track for video_id.

    Preference order visible here: explicit --subtitleslang, then 'en',
    then the first available language. NOTE(review): the 'en' branch body,
    an error check on the track list and the final return were lost in
    extraction.
    """
    self.report_video_subtitles_download(video_id)
    sub_lang_list = self._get_available_subtitles(video_id)
    sub_format = self._downloader.params.get('subtitlesformat')
    if self._downloader.params.get('subtitleslang', False):
        sub_lang = self._downloader.params.get('subtitleslang')
    elif 'en' in sub_lang_list:
    sub_lang = list(sub_lang_list.keys())[0]
    if not sub_lang in sub_lang_list:
        return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
    subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
def _extract_all_subtitles(self, video_id):
    """Download every available subtitle track for video_id.

    NOTE(review): the ``subtitles = []`` initialisation and the final
    return appear to have been lost in extraction.
    """
    self.report_video_subtitles_download(video_id)
    sub_lang_list = self._get_available_subtitles(video_id)
    sub_format = self._downloader.params.get('subtitlesformat')
    for sub_lang in sub_lang_list:
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        subtitles.append(subtitle)
def _print_formats(self, formats):
    """Print each known itag with its extension and dimensions.

    NOTE(review): the ``for x in formats:`` loop header was lost in
    extraction; ``x`` below is otherwise unbound.
    """
    print('Available formats:')
    print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
def _real_initialize(self):
    """Set the YouTube language cookie, log in, and confirm age.

    NOTE(review): many structural lines (``return`` statements, ``try:``
    openers, ``if``/``else`` branches, and parts of the login/age forms)
    were lost in extraction; the code below is not syntactically complete.
    """
    if self._downloader is None:

    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

    # Set language; a failure here only warns, it is not fatal.
    request = compat_urllib_request.Request(self._LANG_URL)
    compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

    # No authentication to be performed
    request = compat_urllib_request.Request(self._LOGIN_URL)
    login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

    # Scrape the anti-forgery tokens out of the login page.
    match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
    galx = match.group(1)
    match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

    # Login form fields (partial; other entries elided in extraction).
    u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
    u'PersistentCookie': u'yes',
    u'bgresponse': u'js_disabled',
    u'checkConnection': u'',
    u'checkedDomains': u'youtube',
    u'signIn': u'Sign in',
    u'service': u'youtube',

    # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
    login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
    login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
    request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
    login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
    # If the login form is still present, authentication failed.
    if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
        self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    # Confirm age (form dict header elided in extraction).
    'action_confirm': 'Confirm',
    request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
    self.report_age_confirmation()
    age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
def _extract_id(self, url):
    """Extract the YouTube video id from *url* via _VALID_URL group 2.

    NOTE(review): the ``if mobj is None:`` guard and the final
    ``return video_id`` were lost in extraction.
    """
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(2)
def _real_extract(self, url):
    """Extract info dict(s) for a YouTube URL.

    NOTE(review): this method is heavily elided in extraction -- ``try:``
    openers, ``if mobj is not None:`` guards, ``return`` statements,
    ``break``s and the results-list assembly are missing. Comments below
    mark what the surviving lines do; hedged notes mark visible gaps.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Download the watch page (gl/hl pinned, age-gate bypass flag set).
    self.report_video_webpage_download(video_id)
    url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    # Un-escape the backslash-escaped player URL.
    player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

    # Fetch get_video_info, trying several 'el' variants until one has a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        request = compat_urllib_request.Request(video_info_url)
        video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
        video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
        video_info = compat_parse_qs(video_info_webpage)
        if 'token' in video_info:
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
        self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        self._downloader.trouble(u'ERROR: "rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # Uploader name.
    if 'author' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # Uploader id (optional, scraped from the watch page).
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    video_uploader_id = mobj.group(1)
    self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

    # Title.
    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # Thumbnail (optional).
    if 'thumbnail_url' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # Upload date: normalise separators, then try several date formats.
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
    for expression in format_expressions:
        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

    # Description.
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    video_description = ''

    # Closed captions.
    video_subtitles = None
    if self._downloader.params.get('writesubtitles', False):
        video_subtitles = self._extract_subtitle(video_id)
        (sub_error, sub_lang, sub) = video_subtitles[0]
        self._downloader.trouble(sub_error)
    if self._downloader.params.get('allsubtitles', False):
        video_subtitles = self._extract_all_subtitles(video_id)
        for video_subtitle in video_subtitles:
            (sub_error, sub_lang, sub) = video_subtitle
            self._downloader.trouble(sub_error)

    # Duration (optional).
    if 'length_seconds' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video duration')
    video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Token.
    video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [compat_parse_qs(uds) for uds in url_data_strs]
        url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
        # Map itag -> playable URL (the 'sig' param becomes '&signature=').
        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        # Specific formats. We pick the first in a slash-delimeted sequence.
        # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
        req_formats = req_format.split('/')
        video_url_list = None
        for rf in req_formats:
            video_url_list = [(rf, url_map[rf])]
        if video_url_list is None:
            self._downloader.trouble(u'ERROR: requested format not available')
    self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

    # Assemble one info dict per selected format.
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'))

        # NOTE(review): the dict opener (and an 'id' entry, presumably)
        # plus the append/return were lost in extraction.
            'url':      video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'format':   video_format,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'player_url':   player_url,
            'subtitles':    video_subtitles,
            'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
def _real_initialize(self):
    """Accept Metacafe's family-filter disclaimer so videos are reachable.

    NOTE(review): the ``try:`` openers, ``return`` statements and the
    disclaimer form header were lost in extraction.
    """
    # Retrieve disclaimer
    request = compat_urllib_request.Request(self._DISCLAIMER)
    self.report_disclaimer()
    disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

    # Confirm age: POST the filter form (dict opener elided in extraction).
    'submit': "Continue - I'm over 18",
    request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
    self.report_age_confirmation()
    disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
def _real_extract(self, url):
    """Extract the info dict for a Metacafe URL.

    NOTE(review): guards (``if mobj is None:``), ``try:`` openers,
    ``return`` statements and the result-list wrapper were lost in
    extraction; comments below mark the surviving steps.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        # Delegate yt-prefixed ids to the YouTube extractor via the downloader.
        self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))
    video_extension = mediaURL[-3:]

    # Extract gdaKey if available
    mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
    gdaKey = mobj.group(1)
    video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

    # Fallback path: dig the media URL out of the flashvars blob.
    mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
    self._downloader.trouble(u'ERROR: unable to extract media URL')
    vardict = compat_parse_qs(mobj.group(1))
    if 'mediaData' not in vardict:
        self._downloader.trouble(u'ERROR: unable to extract media URL')
    mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
    self._downloader.trouble(u'ERROR: unable to extract media URL')
    mediaURL = mobj.group(1).replace('\\/', '/')
    video_extension = mediaURL[-3:]
    video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
    self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'submitter=(.*?);', webpage)
    self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = mobj.group(1)

    # Info dict entries (dict opener and return elided in extraction).
    'id':       video_id.decode('utf-8'),
    'url':      video_url.decode('utf-8'),
    'uploader': video_uploader.decode('utf-8'),
    'title':    video_title,
    'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
def _real_extract(self, url):
    """Extract the info dict for a Dailymotion URL.

    NOTE(review): ``if mobj is None:`` guards, ``return`` statements, the
    quality-selection branch bodies and the result-list wrapper were lost
    in extraction.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

    video_id = mobj.group(1).split('_')[0].split('?')[0]

    video_extension = 'mp4'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    # Disable the family filter so filtered videos are still served.
    request.add_header('Cookie', 'family_filter=off')
    webpage = self._download_webpage(request, video_id)

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'\s*var flashvars = (.*)', webpage)
    self._downloader.trouble(u'ERROR: unable to extract media URL')
    flashvars = compat_urllib_parse.unquote(mobj.group(1))

    # Pick the best available quality key, highest first.
    for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
        self._downloader.to_screen(u'[dailymotion] Using %s' % key)
    self._downloader.trouble(u'ERROR: unable to extract video URL')

    mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
    self._downloader.trouble(u'ERROR: unable to extract video URL')

    video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

    # TODO: support choosing qualities

    mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = unescapeHTML(mobj.group('title'))

    video_uploader = None
    mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
    # lookin for official user
    mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
    if mobj_official is None:
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
    video_uploader = mobj_official.group(1)
    video_uploader = mobj.group(1)

    video_upload_date = None
    mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
    # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
    video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

    # Info dict entries (dict opener and return elided in extraction).
    'uploader': video_uploader,
    'upload_date':  video_upload_date,
    'title':    video_title,
    'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
def _real_extract(self, url):
    """Extract the info dict for a Photobucket URL.

    NOTE(review): guards, ``try:`` openers, ``return`` statements, the
    ``video_url = mediaURL`` assignment and the result-list wrapper were
    lost in extraction.
    """
    # Extract id from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    video_id = mobj.group(1)

    video_extension = 'flv'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader, and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract media URL')
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))

    mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
    self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    video_uploader = mobj.group(2).decode('utf-8')

    # Info dict entries (dict opener and return elided in extraction).
    'id':       video_id.decode('utf-8'),
    'url':      video_url.decode('utf-8'),
    'uploader': video_uploader,
    'title':    video_title,
    'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
910 def _real_extract(self, url, new_video=True):
911 # Extract ID from URL
912 mobj = re.match(self._VALID_URL, url)
914 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
917 video_id = mobj.group(2)
918 video_extension = 'flv'
920 # Rewrite valid but non-extractable URLs as
921 # extractable English language /watch/ URLs
922 if re.match(self._VPAGE_URL, url) is None:
923 request = compat_urllib_request.Request(url)
925 webpage = compat_urllib_request.urlopen(request).read()
926 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
927 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
930 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
932 self._downloader.trouble(u'ERROR: Unable to extract id field')
934 yahoo_id = mobj.group(1)
936 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
938 self._downloader.trouble(u'ERROR: Unable to extract vid field')
940 yahoo_vid = mobj.group(1)
942 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
943 return self._real_extract(url, new_video=False)
945 # Retrieve video webpage to extract further information
946 request = compat_urllib_request.Request(url)
948 self.report_download_webpage(video_id)
949 webpage = compat_urllib_request.urlopen(request).read()
950 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
951 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
954 # Extract uploader and title from webpage
955 self.report_extraction(video_id)
956 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
958 self._downloader.trouble(u'ERROR: unable to extract video title')
960 video_title = mobj.group(1).decode('utf-8')
962 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
964 self._downloader.trouble(u'ERROR: unable to extract video uploader')
966 video_uploader = mobj.group(1).decode('utf-8')
968 # Extract video thumbnail
969 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
971 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
973 video_thumbnail = mobj.group(1).decode('utf-8')
975 # Extract video description
976 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
978 self._downloader.trouble(u'ERROR: unable to extract video description')
980 video_description = mobj.group(1).decode('utf-8')
981 if not video_description:
982 video_description = 'No description available.'
984 # Extract video height and width
985 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
987 self._downloader.trouble(u'ERROR: unable to extract video height')
989 yv_video_height = mobj.group(1)
991 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
993 self._downloader.trouble(u'ERROR: unable to extract video width')
995 yv_video_width = mobj.group(1)
997 # Retrieve video playlist to extract media URL
998 # I'm not completely sure what all these options are, but we
999 # seem to need most of them, otherwise the server sends a 401.
1000 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1001 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1002 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1003 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1004 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1006 self.report_download_webpage(video_id)
1007 webpage = compat_urllib_request.urlopen(request).read()
1008 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1009 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1012 # Extract media URL from playlist XML
1013 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1015 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1017 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1018 video_url = unescapeHTML(video_url)
1021 'id': video_id.decode('utf-8'),
1023 'uploader': video_uploader,
1024 'upload_date': None,
1025 'title': video_title,
1026 'ext': video_extension.decode('utf-8'),
1027 'thumbnail': video_thumbnail.decode('utf-8'),
1028 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict for a Vimeo URL.

        Returns a one-element list of info dicts, or None after
        reporting an error through the downloader.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize schemeless and player-redirect URLs to a canonical page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality available; for-else fires when nothing matched.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, search it with *regex*, and collect named groups.

        matchTuples is a list of (group_index, key, error_message); each
        matched group is stored under *key* in the returned dict.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection and resolve the rtmp URL."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through its chain of ref URLs to an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams are handled separately and yield no info dict.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener from the handlers above plus the standard ones.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        # If the URL is just a shortener/redirect, restart extraction on the target.
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch[N|all]:terms' query and kick off the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API (50 results per page) until we have enough.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearch[N|all]:terms' query and kick off the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link means we collected everything available.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch[N|all]:terms' query and kick off the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link means we collected everything available.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL also named a specific video.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based; -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                                   (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based; -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                                   (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12  # results per AJAX page, referenced by the early-exit check below
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                                   (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # First fetch the user page to learn the numeric users_id.
        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based; -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                                   (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1988 class DepositFilesIE(InfoExtractor):
1989 """Information extractor for depositfiles.com"""
1991 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1993 def report_download_webpage(self, file_id):
1994 """Report webpage download."""
1995 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1997 def report_extraction(self, file_id):
1998 """Report information extraction."""
1999 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2001 def _real_extract(self, url):
2002 file_id = url.split('/')[-1]
2003 # Rebuild url in english locale
2004 url = 'http://depositfiles.com/en/files/' + file_id
2006 # Retrieve file webpage with 'Free download' button pressed
2007 free_download_indication = { 'gateway_result' : '1' }
2008 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2010 self.report_download_webpage(file_id)
2011 webpage = compat_urllib_request.urlopen(request).read()
2012 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2013 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2016 # Search for the real file URL
2017 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2018 if (mobj is None) or (mobj.group(1) is None):
2019 # Try to figure out reason of the error.
2020 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2021 if (mobj is not None) and (mobj.group(1) is not None):
2022 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2023 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2025 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2028 file_url = mobj.group(1)
2029 file_extension = os.path.splitext(file_url)[1][1:]
2031 # Search for file title
2032 mobj = re.search(r'<b title="(.*?)">', webpage)
2034 self._downloader.trouble(u'ERROR: unable to extract title')
2036 file_title = mobj.group(1).decode('utf-8')
2039 'id': file_id.decode('utf-8'),
2040 'url': file_url.decode('utf-8'),
2042 'upload_date': None,
2043 'title': file_title,
2044 'ext': file_extension.decode('utf-8'),
2048 class FacebookIE(InfoExtractor):
2049 """Information Extractor for Facebook"""
2051 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2052 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2053 _NETRC_MACHINE = 'facebook'
2054 IE_NAME = u'facebook'
2056 def report_login(self):
2057 """Report attempt to log in."""
2058 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2060 def _real_initialize(self):
2061 if self._downloader is None:
2066 downloader_params = self._downloader.params
2068 # Attempt to use provided username and password or .netrc data
2069 if downloader_params.get('username', None) is not None:
2070 useremail = downloader_params['username']
2071 password = downloader_params['password']
2072 elif downloader_params.get('usenetrc', False):
2074 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2075 if info is not None:
2079 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2080 except (IOError, netrc.NetrcParseError) as err:
2081 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2084 if useremail is None:
2093 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2096 login_results = compat_urllib_request.urlopen(request).read()
2097 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2098 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2100 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2101 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2104 def _real_extract(self, url):
2105 mobj = re.match(self._VALID_URL, url)
2107 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2109 video_id = mobj.group('ID')
2111 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2112 webpage = self._download_webpage(url, video_id)
2114 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2115 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2116 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2118 raise ExtractorError(u'Cannot parse data')
2119 data = dict(json.loads(m.group(1)))
2120 params_raw = compat_urllib_parse.unquote(data['params'])
2121 params = json.loads(params_raw)
2122 video_url = params['hd_src']
2124 video_url = params['sd_src']
2126 raise ExtractorError(u'Cannot find video URL')
2127 video_duration = int(params['video_duration'])
2129 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2131 raise ExtractorError(u'Cannot find title in webpage')
2132 video_title = unescapeHTML(m.group(1))
2136 'title': video_title,
2139 'duration': video_duration,
2140 'thumbnail': params['thumbnail_src'],
2145 class BlipTVIE(InfoExtractor):
2146 """Information extractor for blip.tv"""
# NOTE(review): gaps in the embedded old line numbers (2162-2169, 2183-2191,
# 2200-2202, ...) mean this extract is missing framing statements; keep the
# visible lines untouched.
2148 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2149 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2150 IE_NAME = u'blip.tv'
2152 def report_extraction(self, file_id):
2153 """Report information extraction."""
2154 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2156 def report_direct_download(self, title):
2157 """Report information extraction."""
2158 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2160 def _real_extract(self, url):
# Ask blip.tv for a JSON description of the page (iTunes User-Agent is
# required); if the server answers with video/* content instead, treat it
# as a direct download of the media file itself.
2161 mobj = re.match(self._VALID_URL, url)
2163 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2170 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2171 request = compat_urllib_request.Request(json_url)
2172 request.add_header('User-Agent', 'iTunes/10.6.1')
2173 self.report_extraction(mobj.group(1))
2176 urlh = compat_urllib_request.urlopen(request)
2177 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2178 basename = url.split('/')[-1]
2179 title,ext = os.path.splitext(basename)
# NOTE(review): title.decode('UTF-8') below assumes a Python 2 byte
# string; on Python 3 this would raise AttributeError - confirm against
# the interpreter this file targets.
2180 title = title.decode('UTF-8')
2181 ext = ext.replace('.', '')
2182 self.report_direct_download(title)
2187 'upload_date': None,
2192 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2193 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2194 if info is None: # Regular URL
2196 json_code_bytes = urlh.read()
2197 json_code = json_code_bytes.decode('utf-8')
2198 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2199 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2203 json_data = json.loads(json_code)
2204 if 'Post' in json_data:
2205 data = json_data['Post']
# blip.tv reports datestamp as e.g. "08-15-12 10:00AM"; normalize to the
# YYYYMMDD upload_date format the downloader expects.
2209 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2210 video_url = data['media']['url']
2211 umobj = re.match(self._URL_EXT, video_url)
2213 raise ValueError('Can not determine filename extension')
2214 ext = umobj.group(1)
2217 'id': data['item_id'],
2219 'uploader': data['display_name'],
2220 'upload_date': upload_date,
2221 'title': data['title'],
2223 'format': data['media']['mimeType'],
2224 'thumbnail': data['thumbnailUrl'],
2225 'description': data['description'],
2226 'player_url': data['embedUrl'],
2227 'user_agent': 'iTunes/10.6.1',
2229 except (ValueError,KeyError) as err:
2230 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2236 class MyVideoIE(InfoExtractor):
2237 """Information Extractor for myvideo.de."""
2239 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2240 IE_NAME = u'myvideo'
2242 def __init__(self, downloader=None):
2243 InfoExtractor.__init__(self, downloader)
2245 def report_extraction(self, video_id):
2246 """Report information extraction."""
2247 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2249 def _real_extract(self,url):
2250 mobj = re.match(self._VALID_URL, url)
2252 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2255 video_id = mobj.group(1)
2258 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2259 webpage = self._download_webpage(webpage_url, video_id)
2261 self.report_extraction(video_id)
2262 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2265 self._downloader.trouble(u'ERROR: unable to extract media URL')
2267 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2269 mobj = re.search('<title>([^<]+)</title>', webpage)
2271 self._downloader.trouble(u'ERROR: unable to extract title')
2274 video_title = mobj.group(1)
2280 'upload_date': None,
2281 'title': video_title,
2285 class ComedyCentralIE(InfoExtractor):
2286 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): this extract is missing many interior lines (old numbers
# 2297, 2300-2301, 2303-2311, 2313-2320, 2342-2345, ... absent), including
# the bodies of _video_extensions/_video_dimensions and several if/try
# statements; treat the visible lines as authoritative and unmodifiable.
2288 # urls can be abbreviations like :thedailyshow or :colbert
2289 # urls for episodes like:
2290 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2291 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2292 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2293 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2294 |(https?://)?(www\.)?
2295 (?P<showname>thedailyshow|colbertnation)\.com/
2296 (full-episodes/(?P<episode>.*)|
2298 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2299 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2302 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2304 _video_extensions = {
2312 _video_dimensions = {
2321 def suitable(self, url):
2322 """Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is a verbose (re.VERBOSE) pattern, so the base-class
# suitable() cannot be used here - the flag must be passed explicitly.
2323 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2325 def report_extraction(self, episode_id):
2326 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2328 def report_config_download(self, episode_id, media_id):
2329 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2331 def report_index_download(self, episode_id):
2332 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2334 def _print_formats(self, formats):
2335 print('Available formats:')
2337 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2340 def _real_extract(self, url):
# Resolve :tds/:colbert shortcuts to the full-episodes page, follow the
# redirect to a concrete episode, then fetch the MRSS index and one
# mediaGen config per part, translating the RTMP URL to plain HTTP.
2341 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2343 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2346 if mobj.group('shortname'):
2347 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2348 url = u'http://www.thedailyshow.com/full-episodes/'
2350 url = u'http://www.colbertnation.com/full-episodes/'
2351 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2352 assert mobj is not None
2354 if mobj.group('clip'):
2355 if mobj.group('showname') == 'thedailyshow':
2356 epTitle = mobj.group('tdstitle')
2358 epTitle = mobj.group('cntitle')
2361 dlNewest = not mobj.group('episode')
2363 epTitle = mobj.group('showname')
2365 epTitle = mobj.group('episode')
2367 req = compat_urllib_request.Request(url)
2368 self.report_extraction(epTitle)
2370 htmlHandle = compat_urllib_request.urlopen(req)
2371 html = htmlHandle.read()
2372 webpage = html.decode('utf-8')
2373 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2374 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The full-episodes page redirects to the newest episode; re-match the
# final URL to recover the concrete episode slug.
2377 url = htmlHandle.geturl()
2378 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2380 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2382 if mobj.group('episode') == '':
2383 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2385 epTitle = mobj.group('episode')
2387 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2389 if len(mMovieParams) == 0:
2390 # The Colbert Report embeds the information in a without
2391 # a URL prefix; so extract the alternate reference
2392 # and then add the URL prefix manually.
2394 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2395 if len(altMovieParams) == 0:
2396 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2399 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2401 uri = mMovieParams[0][1]
2402 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2403 self.report_index_download(epTitle)
2405 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2406 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2407 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2412 idoc = xml.etree.ElementTree.fromstring(indexXml)
2413 itemEls = idoc.findall('.//item')
2414 for partNum,itemEl in enumerate(itemEls):
2415 mediaId = itemEl.findall('./guid')[0].text
2416 shortMediaId = mediaId.split(':')[-1]
2417 showId = mediaId.split(':')[-2].replace('.com', '')
2418 officialTitle = itemEl.findall('./title')[0].text
2419 officialDate = itemEl.findall('./pubDate')[0].text
2421 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2422 compat_urllib_parse.urlencode({'uri': mediaId}))
2423 configReq = compat_urllib_request.Request(configUrl)
2424 self.report_config_download(epTitle, shortMediaId)
2426 configXml = compat_urllib_request.urlopen(configReq).read()
2427 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2428 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2431 cdoc = xml.etree.ElementTree.fromstring(configXml)
2433 for rendition in cdoc.findall('.//rendition'):
2434 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2438 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2441 if self._downloader.params.get('listformats', None):
2442 self._print_formats([i[0] for i in turls])
2445 # For now, just pick the highest bitrate
2446 format,rtmp_video_url = turls[-1]
2448 # Get the format arg from the arg stream
2449 req_format = self._downloader.params.get('format', None)
2451 # Select format if we can find one
2454 format, rtmp_video_url = f, v
# Translate the rtmp(e) URL into a plain-HTTP mirror on llnwd.net; only
# URLs under gsp.comedystor/ can be transformed this way.
2457 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2459 raise ExtractorError(u'Cannot transform RTMP url')
2460 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2461 video_url = base + m.group('finalid')
2463 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2468 'upload_date': officialDate,
2473 'description': officialTitle,
2475 results.append(info)
2480 class EscapistIE(InfoExtractor):
2481 """Information extractor for The Escapist """
# NOTE(review): gaps in the embedded old line numbers (2494-2496, 2501,
# 2508-2509, 2518-2520, ...) indicate missing framing lines; do not reflow.
2483 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2484 IE_NAME = u'escapist'
2486 def report_extraction(self, showName):
2487 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2489 def report_config_download(self, showName):
2490 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2492 def _real_extract(self, url):
# Scrape og: meta tags from the page, follow the player's config= query
# parameter to a JS config blob, and read the media URL from its playlist.
2493 mobj = re.match(self._VALID_URL, url)
2495 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2497 showName = mobj.group('showname')
2498 videoId = mobj.group('episode')
2500 self.report_extraction(showName)
2502 webPage = compat_urllib_request.urlopen(url)
2503 webPageBytes = webPage.read()
# Honor the charset from the Content-Type header, defaulting to UTF-8.
2504 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2505 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2506 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2507 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2510 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2511 description = unescapeHTML(descMatch.group(1))
2512 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2513 imgUrl = unescapeHTML(imgMatch.group(1))
2514 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2515 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2516 configUrlMatch = re.search('config=(.*)$', playerUrl)
2517 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2519 self.report_config_download(showName)
2521 configJSON = compat_urllib_request.urlopen(configUrl)
2522 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2523 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2524 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2525 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2528 # Technically, it's JavaScript, not JSON
2529 configJSON = configJSON.replace("'", '"')
2532 config = json.loads(configJSON)
2533 except (ValueError,) as err:
2534 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2537 playlist = config['playlist']
# playlist[0] is presumably an intro/ad entry; index 1 holds the episode.
2538 videoUrl = playlist[1]['url']
2543 'uploader': showName,
2544 'upload_date': None,
2547 'thumbnail': imgUrl,
2548 'description': description,
2549 'player_url': playerUrl,
2554 class CollegeHumorIE(InfoExtractor):
2555 """Information extractor for collegehumor.com"""
# NOTE(review): missing interior lines (old numbers 2571, 2575-2578,
# 2580-2581, ...) mean the info-dict initialization and try: framing are
# not visible in this extract.
2558 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2559 IE_NAME = u'collegehumor'
2561 def report_manifest(self, video_id):
2562 """Report information extraction."""
2563 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2565 def report_extraction(self, video_id):
2566 """Report information extraction."""
2567 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2569 def _real_extract(self, url):
# Fetch the moogaloop metadata XML for the video, then the Adobe HDS
# (f4m) manifest it points to, and assemble the final segment URL.
2570 mobj = re.match(self._VALID_URL, url)
2572 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2574 video_id = mobj.group('videoid')
2579 'upload_date': None,
2582 self.report_extraction(video_id)
2583 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2585 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2586 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2587 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2590 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2592 videoNode = mdoc.findall('./video')[0]
2593 info['description'] = videoNode.findall('./description')[0].text
2594 info['title'] = videoNode.findall('./caption')[0].text
2595 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2596 manifest_url = videoNode.findall('./file')[0].text
2598 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore is required by Adobe HDS servers to serve the manifest.
2601 manifest_url += '?hdcore=2.10.3'
2602 self.report_manifest(video_id)
2604 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2605 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2606 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2609 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2611 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2612 node_id = media_node.attrib['url']
2613 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2614 except IndexError as err:
2615 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2618 url_pr = compat_urllib_parse_urlparse(manifest_url)
2619 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2626 class XVideosIE(InfoExtractor):
2627 """Information extractor for xvideos.com"""
# NOTE(review): old line-number gaps (2638, 2650, 2652, ...) show the
# "if mobj is None:" guards are missing from this extract.
2629 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2630 IE_NAME = u'xvideos'
2632 def report_extraction(self, video_id):
2633 """Report information extraction."""
2634 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2636 def _real_extract(self, url):
# Scrape the flv URL, page title and thumbnail straight out of the
# watch-page HTML.
2637 mobj = re.match(self._VALID_URL, url)
2639 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2641 video_id = mobj.group(1)
2643 webpage = self._download_webpage(url, video_id)
2645 self.report_extraction(video_id)
2649 mobj = re.search(r'flv_url=(.+?)&', webpage)
2651 self._downloader.trouble(u'ERROR: unable to extract video url')
2653 video_url = compat_urllib_parse.unquote(mobj.group(1))
2657 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2659 self._downloader.trouble(u'ERROR: unable to extract video title')
2661 video_title = mobj.group(1)
2664 # Extract video thumbnail
2665 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2667 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0) is deliberate here: the whole matched URL is the thumbnail.
2669 video_thumbnail = mobj.group(0)
2675 'upload_date': None,
2676 'title': video_title,
2678 'thumbnail': video_thumbnail,
2679 'description': None,
2685 class SoundcloudIE(InfoExtractor):
2686 """Information extractor for soundcloud.com
2687 To access the media, the uid of the song and a stream token
2688 must be extracted from the page source and the script must make
2689 a request to media.soundcloud.com/crossdomain.xml. Then
2690 the media can be grabbed by requesting from an url composed
2691 of the stream token and uid
# NOTE(review): gaps in the embedded old line numbers (2710, 2712-2713,
# 2725, 2730-2731, ...) show framing lines are missing from this extract.
2694 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2695 IE_NAME = u'soundcloud'
2697 def __init__(self, downloader=None):
2698 InfoExtractor.__init__(self, downloader)
2700 def report_resolve(self, video_id):
2701 """Report information extraction."""
2702 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2704 def report_extraction(self, video_id):
2705 """Report information extraction."""
2706 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2708 def _real_extract(self, url):
# Resolve the human-readable uploader/slug URL to a track id via the
# API resolver, then fetch the stream map and use the 128kbps MP3 URL.
2709 mobj = re.match(self._VALID_URL, url)
2711 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2714 # extract uploader (which is in the url)
2715 uploader = mobj.group(1)
2716 # extract simple title (uploader + slug of song title)
2717 slug_title = mobj.group(2)
2718 simple_title = uploader + u'-' + slug_title
2720 self.report_resolve('%s/%s' % (uploader, slug_title))
2722 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2723 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2724 request = compat_urllib_request.Request(resolv_url)
2726 info_json_bytes = compat_urllib_request.urlopen(request).read()
2727 info_json = info_json_bytes.decode('utf-8')
2728 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2729 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2732 info = json.loads(info_json)
2733 video_id = info['id']
2734 self.report_extraction('%s/%s' % (uploader, slug_title))
2736 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2737 request = compat_urllib_request.Request(streams_url)
2739 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2740 stream_json = stream_json_bytes.decode('utf-8')
2741 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2742 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2745 streams = json.loads(stream_json)
2746 mediaURL = streams['http_mp3_128_url']
# NOTE(review): upload_date is passed through as the raw created_at
# string, not normalized to YYYYMMDD like other extractors - confirm.
2751 'uploader': info['user']['username'],
2752 'upload_date': info['created_at'],
2753 'title': info['title'],
2755 'description': info['description'],
2759 class InfoQIE(InfoExtractor):
2760 """Information extractor for infoq.com"""
# NOTE(review): old line-number gaps (2762, 2769, 2771-2772, ...) show the
# "if mobj is None:" guards are missing from this extract.
2761 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2763 def report_extraction(self, video_id):
2764 """Report information extraction."""
2765 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2767 def _real_extract(self, url):
# The real media id is base64-encoded in the page's jsclassref attribute;
# decode it and build the rtmpe URL from it.
2768 mobj = re.match(self._VALID_URL, url)
2770 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2773 webpage = self._download_webpage(url, video_id=url)
2774 self.report_extraction(url)
2777 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2779 self._downloader.trouble(u'ERROR: unable to extract video url')
2781 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2782 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2785 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2787 self._downloader.trouble(u'ERROR: unable to extract video title')
2789 video_title = mobj.group(1)
2791 # Extract description
2792 video_description = u'No description available.'
2793 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2794 if mobj is not None:
2795 video_description = mobj.group(1)
# Derive id/extension from the final path component of the media URL.
2797 video_filename = video_url.split('/')[-1]
2798 video_id, extension = video_filename.split('.')
2804 'upload_date': None,
2805 'title': video_title,
2806 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2808 'description': video_description,
2813 class MixcloudIE(InfoExtractor):
2814 """Information extractor for www.mixcloud.com"""
2816 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2817 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2818 IE_NAME = u'mixcloud'
2820 def __init__(self, downloader=None):
2821 InfoExtractor.__init__(self, downloader)
2823 def report_download_json(self, file_id):
2824 """Report JSON download."""
2825 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2827 def report_extraction(self, file_id):
2828 """Report information extraction."""
2829 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2831 def get_urls(self, jsonData, fmt, bitrate='best'):
2832 """Get urls from 'audio_formats' section in json"""
2835 bitrate_list = jsonData[fmt]
2836 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2837 bitrate = max(bitrate_list) # select highest
2839 url_list = jsonData[fmt][bitrate]
2840 except TypeError: # we have no bitrate info.
2841 url_list = jsonData[fmt]
2844 def check_urls(self, url_list):
2845 """Returns 1st active url from list"""
2846 for url in url_list:
2848 compat_urllib_request.urlopen(url)
2850 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2855 def _print_formats(self, formats):
2856 print('Available formats:')
2857 for fmt in formats.keys():
2858 for b in formats[fmt]:
2860 ext = formats[fmt][b][0]
2861 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2862 except TypeError: # we have no bitrate info
2863 ext = formats[fmt][0]
2864 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2867 def _real_extract(self, url):
2868 mobj = re.match(self._VALID_URL, url)
2870 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2872 # extract uploader & filename from url
2873 uploader = mobj.group(1).decode('utf-8')
2874 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2876 # construct API request
2877 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2878 # retrieve .json file with links to files
2879 request = compat_urllib_request.Request(file_url)
2881 self.report_download_json(file_url)
2882 jsonData = compat_urllib_request.urlopen(request).read()
2883 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2884 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2888 json_data = json.loads(jsonData)
2889 player_url = json_data['player_swf_url']
2890 formats = dict(json_data['audio_formats'])
2892 req_format = self._downloader.params.get('format', None)
2895 if self._downloader.params.get('listformats', None):
2896 self._print_formats(formats)
2899 if req_format is None or req_format == 'best':
2900 for format_param in formats.keys():
2901 url_list = self.get_urls(formats, format_param)
2903 file_url = self.check_urls(url_list)
2904 if file_url is not None:
2907 if req_format not in formats:
2908 self._downloader.trouble(u'ERROR: format is not available')
2911 url_list = self.get_urls(formats, req_format)
2912 file_url = self.check_urls(url_list)
2913 format_param = req_format
2916 'id': file_id.decode('utf-8'),
2917 'url': file_url.decode('utf-8'),
2918 'uploader': uploader.decode('utf-8'),
2919 'upload_date': None,
2920 'title': json_data['name'],
2921 'ext': file_url.split('.')[-1].decode('utf-8'),
2922 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2923 'thumbnail': json_data['thumbnail_url'],
2924 'description': json_data['description'],
2925 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom (openclassroom.stanford.edu).
# The _VALID_URL named groups 'course' and 'video' drive three cases in
# _real_extract: a single video, a whole course page, and the site root.
# NOTE(review): the original line numbering (2928..3037) skips many lines,
# so several statements (try:, if-guards, dict openers, returns) are not
# visible in this chunk.
2928 class StanfordOpenClassroomIE(InfoExtractor):
2929 """Information extractor for Stanford's Open ClassRoom"""
2931 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2932 IE_NAME = u'stanfordoc'
2934 def report_download_webpage(self, objid):
2935 """Report information extraction."""
2936 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2938 def report_extraction(self, video_id):
2939 """Report information extraction."""
2940 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2942 def _real_extract(self, url):
2943 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match _VALID_URL (guard line not visible here).
2945 raise ExtractorError(u'Invalid URL: %s' % url)
2947 if mobj.group('course') and mobj.group('video'): # A specific video
2948 course = mobj.group('course')
2949 video = mobj.group('video')
# Partial info dict for a single video; opener/closer lines are missing here.
2951 'id': course + '_' + video,
2953 'upload_date': None,
2956 self.report_extraction(info['id'])
2957 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2958 xmlUrl = baseUrl + video + '.xml'
# Fetch per-video metadata XML (the try: line is not visible in this chunk).
2960 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2961 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2962 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2964 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Title and file name come from <title> and <videoFile> elements of the XML.
2966 info['title'] = mdoc.findall('./title')[0].text
2967 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2969 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2971 info['ext'] = info['url'].rpartition('.')[2]
2973 elif mobj.group('course'): # A course page
2974 course = mobj.group('course')
2979 'upload_date': None,
2982 coursepage = self._download_webpage(url, info['id'],
2983 note='Downloading course info page',
2984 errnote='Unable to download course info page')
2986 m = re.search('<h1>([^<]+)</h1>', coursepage)
2988 info['title'] = unescapeHTML(m.group(1))
# Fallback: use the id as title when no <h1> is found.
2990 info['title'] = info['id']
2992 m = re.search('<description>([^<]+)</description>', coursepage)
2994 info['description'] = unescapeHTML(m.group(1))
# Collect links to individual VideoPage entries, de-duplicated in order.
2996 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2999 'type': 'reference',
3000 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse into each referenced video page via self.extract().
3004 for entry in info['list']:
3005 assert entry['type'] == 'reference'
3006 results += self.extract(entry['url'])
# Root page: enumerate every course and recurse into each one.
3010 'id': 'Stanford OpenClassroom',
3013 'upload_date': None,
3016 self.report_download_webpage(info['id'])
3017 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3019 rootpage = compat_urllib_request.urlopen(rootURL).read()
3020 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3021 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3024 info['title'] = info['id']
3026 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3029 'type': 'reference',
3030 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3035 for entry in info['list']:
3036 assert entry['type'] == 'reference'
3037 results += self.extract(entry['url'])
# Extractor for MTV.com video pages. Scrapes <meta> tags for song name,
# performer and the mtvn_uri, then downloads a mediaGen XML playlist and
# picks the last (highest-quality) rendition.
# NOTE(review): original numbering skips lines, so some guards/returns are
# not visible here.
3040 class MTVIE(InfoExtractor):
3041 """Information extractor for MTV.com"""
3043 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3046 def report_extraction(self, video_id):
3047 """Report information extraction."""
3048 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3050 def _real_extract(self, url):
3051 mobj = re.match(self._VALID_URL, url)
3053 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme-less URLs are accepted by _VALID_URL; normalize to http.
3055 if not mobj.group('proto'):
3056 url = 'http://' + url
3057 video_id = mobj.group('videoid')
3059 webpage = self._download_webpage(url, video_id)
3061 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3063 self._downloader.trouble(u'ERROR: unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a regex group is Python-2-era code;
# on Python 3 str has no .decode — verify which interpreter this targets.
3065 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3066 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3068 self._downloader.trouble(u'ERROR: unable to extract performer')
3070 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3071 video_title = performer + ' - ' + song_name
3073 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message is missing the word "extract" ("unable to mtvn_uri").
3075 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3077 mtvn_uri = mobj.group(1)
3079 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3081 self._downloader.trouble(u'ERROR: unable to extract content id')
3083 content_id = mobj.group(1)
3085 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3086 self.report_extraction(video_id)
3087 request = compat_urllib_request.Request(videogen_url)
# Fetch mediaGen XML (try: line not visible in this chunk).
3089 metadataXml = compat_urllib_request.urlopen(request).read()
3090 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3091 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3094 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3095 renditions = mdoc.findall('.//rendition')
3097 # For now, always pick the highest quality.
3098 rendition = renditions[-1]
# Derive ext from the MIME type ("video/mp4" -> "mp4") and build a
# human-readable format label from width/height/bitrate attributes.
3101 _,_,ext = rendition.attrib['type'].partition('/')
3102 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3103 video_url = rendition.find('./src').text
3105 self._downloader.trouble('Invalid rendition field.')
# Partial result dict; opener/closer lines are missing from this chunk.
3111 'uploader': performer,
3112 'upload_date': None,
3113 'title': video_title,
# Extractor for v.youku.com. Videos are served in numbered segments; a
# per-video "seed" drives a deterministic character-shuffle (_get_file_ID_mix_string)
# used to decode the real file id, and each segment gets its own download URL.
# NOTE(review): original numbering skips lines — e.g. the `def _gen_sid(self):`
# line itself (before 3133) and several loop/collection openers are missing.
3121 class YoukuIE(InfoExtractor):
3122 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3124 def report_download_webpage(self, file_id):
3125 """Report webpage download."""
3126 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3128 def report_extraction(self, file_id):
3129 """Report information extraction."""
3130 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Body of _gen_sid: session id = millisecond timestamp + two random numbers.
3133 nowTime = int(time.time() * 1000)
3134 random1 = random.randint(1000,1998)
3135 random2 = random.randint(1000,9999)
3137 return "%d%d%d" %(nowTime,random1,random2)
3139 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the alphabet, seeded per video.
3141 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3143 for i in range(len(source)):
# Linear-congruential step; index selects (and removes) one source char.
3144 seed = (seed * 211 + 30031 ) % 65536
3145 index = math.floor(seed / 65536 * len(source) )
3146 mixed.append(source[int(index)])
3147 source.remove(source[int(index)])
3148 #return ''.join(mixed)
3151 def _get_file_id(self, fileId, seed):
# Map each '*'-separated numeric token of fileId through the mixed alphabet.
3152 mixed = self._get_file_ID_mix_string(seed)
3153 ids = fileId.split('*')
3157 realId.append(mixed[int(ch)])
3158 return ''.join(realId)
3160 def _real_extract(self, url):
3161 mobj = re.match(self._VALID_URL, url)
3163 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3165 video_id = mobj.group('ID')
3167 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3169 request = compat_urllib_request.Request(info_url, None, std_headers)
3171 self.report_download_webpage(video_id)
3172 jsondata = compat_urllib_request.urlopen(request).read()
3173 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3174 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3177 self.report_extraction(video_id)
3179 jsonstr = jsondata.decode('utf-8')
3180 config = json.loads(jsonstr)
3182 video_title = config['data'][0]['title']
3183 seed = config['data'][0]['seed']
# Pick the stream format: caller's --format, else best/worst heuristics.
3185 format = self._downloader.params.get('format', None)
3186 supported_format = list(config['data'][0]['streamfileids'].keys())
3188 if format is None or format == 'best':
3189 if 'hd2' in supported_format:
3194 elif format == 'worst':
3202 fileid = config['data'][0]['streamfileids'][format]
3203 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3204 except (UnicodeDecodeError, ValueError, KeyError):
3205 self._downloader.trouble(u'ERROR: unable to extract info section')
3209 sid = self._gen_sid()
3210 fileid = self._get_file_id(fileid, seed)
3212 #column 8,9 of fileid represent the segment number
3213 #fileid[7:9] should be changed
3214 for index, key in enumerate(keys):
# Splice the segment index (2-digit hex) into the decoded file id.
3216 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3217 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3220 'id': '%s_part%02d' % (video_id, index),
3221 'url': download_url,
3223 'upload_date': None,
3224 'title': video_title,
3227 files_info.append(info)
# Extractor for video.xnxx.com. Pulls the flv URL, title and thumbnail
# out of the watch page with three class-level regexes.
# NOTE(review): original numbering skips lines, so some guards/returns are
# not visible here.
3232 class XNXXIE(InfoExtractor):
3233 """Information extractor for xnxx.com"""
3235 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3237 VIDEO_URL_RE = r'flv_url=(.*?)&'
3238 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3239 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3241 def report_webpage(self, video_id):
3242 """Report information extraction"""
3243 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3245 def report_extraction(self, video_id):
3246 """Report information extraction"""
3247 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3249 def _real_extract(self, url):
3250 mobj = re.match(self._VALID_URL, url)
3252 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3254 video_id = mobj.group(1)
3256 self.report_webpage(video_id)
3258 # Get webpage content
3260 webpage_bytes = compat_urllib_request.urlopen(url).read()
3261 webpage = webpage_bytes.decode('utf-8')
3262 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3263 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3266 result = re.search(self.VIDEO_URL_RE, webpage)
3268 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded in the page source.
3270 video_url = compat_urllib_parse.unquote(result.group(1))
3272 result = re.search(self.VIDEO_TITLE_RE, webpage)
3274 self._downloader.trouble(u'ERROR: unable to extract video title')
3276 video_title = result.group(1)
3278 result = re.search(self.VIDEO_THUMB_RE, webpage)
3280 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3282 video_thumbnail = result.group(1)
# Partial result dict; opener/closer lines missing from this chunk.
3288 'upload_date': None,
3289 'title': video_title,
3291 'thumbnail': video_thumbnail,
3292 'description': None,
# Extractor for Google+ (plus.google.com) video posts. Two-step scrape:
# first the post page (date, uploader, title), then the photo/video page
# that the post's image box points at, from which all resolutions are listed.
# NOTE(review): original numbering skips lines, so some guards/returns are
# not visible here.
3296 class GooglePlusIE(InfoExtractor):
3297 """Information extractor for plus.google.com."""
3299 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3300 IE_NAME = u'plus.google'
3302 def __init__(self, downloader=None):
3303 InfoExtractor.__init__(self, downloader)
3305 def report_extract_entry(self, url):
3306 """Report downloading extry"""
3307 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3309 def report_date(self, upload_date):
3310 """Report downloading extry"""
3311 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3313 def report_uploader(self, uploader):
3314 """Report downloading extry"""
3315 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3317 def report_title(self, video_title):
3318 """Report downloading extry"""
3319 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3321 def report_extract_vid_page(self, video_page):
3322 """Report information extraction."""
3323 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3325 def _real_extract(self, url):
3326 # Extract id from URL
3327 mobj = re.match(self._VALID_URL, url)
3329 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3332 post_url = mobj.group(0)
3333 video_id = mobj.group(1)
3335 video_extension = 'flv'
3337 # Step 1, Retrieve post webpage to extract further information
3338 self.report_extract_entry(post_url)
3339 request = compat_urllib_request.Request(post_url)
3341 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3342 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3343 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3346 # Extract update date
3348 pattern = 'title="Timestamp">(.*?)</a>'
3349 mobj = re.search(pattern, webpage)
3351 upload_date = mobj.group(1)
3352 # Convert timestring to a format suitable for filename
3353 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3354 upload_date = upload_date.strftime('%Y%m%d')
3355 self.report_date(upload_date)
# Extract uploader display name from the rel="author" anchor.
3359 pattern = r'rel\="author".*?>(.*?)</a>'
3360 mobj = re.search(pattern, webpage)
3362 uploader = mobj.group(1)
3363 self.report_uploader(uploader)
3366 # Get the first line for title
3368 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3369 mobj = re.search(pattern, webpage)
3371 video_title = mobj.group(1)
3372 self.report_title(video_title)
3374 # Step 2, Stimulate clicking the image box to launch video
3375 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3376 mobj = re.search(pattern, webpage)
3378 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3380 video_page = mobj.group(1)
3381 request = compat_urllib_request.Request(video_page)
3383 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3384 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3385 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3387 self.report_extract_vid_page(video_page)
3390 # Extract video links on video page
3391 """Extract video links of all sizes"""
3392 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3393 mobj = re.findall(pattern, webpage)
3395 self._downloader.trouble(u'ERROR: unable to extract video links')
3397 # Sort in resolution
3398 links = sorted(mobj)
3400 # Choose the lowest of the sort, i.e. highest resolution
3401 video_url = links[-1]
3402 # Only get the url. The resolution part in the tuple has no use anymore
3403 video_url = video_url[-1]
3404 # Treat escaped \u0026 style hex
# Py2 str path: .decode works; the AttributeError fallback handles Py3 str.
3406 video_url = video_url.decode("unicode_escape")
3407 except AttributeError: # Python 3
3408 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Partial result dict; opener/closer lines missing from this chunk.
3414 'uploader': uploader,
3415 'upload_date': upload_date,
3416 'title': video_title,
3417 'ext': video_extension,
# Extractor for nba.com video pages. The CDN URL is built directly from
# the path id; metadata is scraped with the local _findProp helper.
# NOTE(review): original numbering skips lines, so some guards/returns are
# not visible here.
3420 class NBAIE(InfoExtractor):
3421 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3424 def _real_extract(self, url):
3425 mobj = re.match(self._VALID_URL, url)
3427 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3430 video_id = mobj.group(1)
3431 if video_id.endswith('/index.html'):
3432 video_id = video_id[:-len('/index.html')]
3434 webpage = self._download_webpage(url, video_id)
3436 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small helper: first regex group from the page, HTML-unescaped, or default.
3437 def _findProp(rexp, default=None):
3438 m = re.search(rexp, webpage)
3440 return unescapeHTML(m.group(1))
3444 shortened_video_id = video_id.rpartition('/')[2]
3445 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3447 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' (the
# documented optional field in the class docstring at the top of the file).
3451 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3452 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv. Uses the JSON REST API, paging
# through results _JUSTIN_PAGE_LIMIT entries at a time (channel archives
# are paged; a single broadcast lookup is not).
# NOTE(review): original numbering skips lines, so some guards/returns are
# not visible here.
3456 class JustinTVIE(InfoExtractor):
3457 """Information extractor for justin.tv and twitch.tv"""
3458 # TODO: One broadcast may be split into multiple videos. The key
3459 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3460 # starts at 1 and increases. Can we treat all parts as one video?
3462 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3463 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3464 _JUSTIN_PAGE_LIMIT = 100
3465 IE_NAME = u'justin.tv'
3467 def report_extraction(self, file_id):
3468 """Report information extraction."""
3469 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3471 def report_download_page(self, channel, offset):
3472 """Report attempt to download a single page of videos."""
3473 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3474 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3476 # Return count of items, list of *valid* items
3477 def _parse_page(self, url):
3479 urlh = compat_urllib_request.urlopen(url)
3480 webpage_bytes = urlh.read()
3481 webpage = webpage_bytes.decode('utf-8', 'ignore')
3482 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3483 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3486 response = json.loads(webpage)
# The API returns a list of clips on success, an error object otherwise.
3487 if type(response) != list:
3488 error_text = response.get('error', 'unknown error')
3489 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3492 for clip in response:
3493 video_url = clip['video_file_url']
3495 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD.
3496 video_date = re.sub('-', '', clip['start_time'][:10])
3497 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3498 video_id = clip['id']
3499 video_title = clip.get('title', video_id)
3503 'title': video_title,
3504 'uploader': clip.get('channel_name', video_uploader_id),
3505 'uploader_id': video_uploader_id,
3506 'upload_date': video_date,
3507 'ext': video_extension,
3509 return (len(response), info)
3511 def _real_extract(self, url):
3512 mobj = re.match(self._VALID_URL, url)
3514 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3517 api = 'http://api.justin.tv'
# lastindex == 2 means the /b/<id> broadcast group matched.
3518 video_id = mobj.group(mobj.lastindex)
3520 if mobj.lastindex == 1:
3522 api += '/channel/archives/%s.json'
3524 api += '/broadcast/by_archive/%s.json'
3525 api = api % (video_id,)
3527 self.report_extraction(video_id)
# Page through the API until a short page signals the end.
3531 limit = self._JUSTIN_PAGE_LIMIT
3534 self.report_download_page(video_id, offset)
3535 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3536 page_count, page_info = self._parse_page(page_url)
3537 info.extend(page_info)
3538 if not paged or page_count != limit:
# Extractor for funnyordie.com. Scrapes the <video>/<source> tag for the
# media URL and og: metadata for title/description.
# NOTE(review): original numbering skips lines, so some guards/returns are
# not visible here.
3543 class FunnyOrDieIE(InfoExtractor):
3544 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3546 def _real_extract(self, url):
3547 mobj = re.match(self._VALID_URL, url)
3549 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3552 video_id = mobj.group('id')
3553 webpage = self._download_webpage(url, video_id)
3555 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3557 self._downloader.trouble(u'ERROR: unable to find video information')
3558 video_url = unescapeHTML(m.group('url'))
3560 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3562 self._downloader.trouble(u'Cannot find video title')
3563 title = unescapeHTML(m.group('title'))
3565 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3567 desc = unescapeHTML(m.group('desc'))
3576 'description': desc,
# Extractor for tweetreel.com. The media URL is deterministic from the
# scraped status id; description, uploader and unix timestamp come from
# the page markup.
# NOTE(review): original numbering skips lines, so some guards/returns are
# not visible here.
3580 class TweetReelIE(InfoExtractor):
3581 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3583 def _real_extract(self, url):
3584 mobj = re.match(self._VALID_URL, url)
3586 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3589 video_id = mobj.group('id')
3590 webpage = self._download_webpage(url, video_id)
3592 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3594 self._downloader.trouble(u'ERROR: Cannot find status ID')
3595 status_id = m.group(1)
3597 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3599 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip embedded <a>...</a> links from the tweet text before unescaping.
3600 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3602 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3604 self._downloader.trouble(u'ERROR: Cannot find uploader')
3605 uploader = unescapeHTML(m.group('uploader'))
3606 uploader_id = unescapeHTML(m.group('uploader_id'))
3608 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3610 self._downloader.trouble(u'ERROR: Cannot find upload date')
# Unix timestamp -> YYYYMMDD for the upload_date field.
3611 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3614 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
# Partial result dict; opener/closer lines missing from this chunk.
3621 'description': desc,
3622 'uploader': uploader,
3623 'uploader_id': uploader_id,
3624 'internal_id': status_id,
3625 'upload_date': upload_date
# Extractor for store.steampowered.com game trailer pages. _VALID_URL is
# verbose-mode, so suitable() is overridden to pass re.VERBOSE. One game
# page can carry several movies; they are zipped with their title spans.
# NOTE(review): original numbering skips lines, so some collection openers
# and the final return are not visible here.
3630 class SteamIE(InfoExtractor):
3630 _VALID_URL = r"""http://store.steampowered.com/
3631 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3633 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3636 def suitable(self, url):
3637 """Receives a URL and returns True if suitable for this IE."""
3638 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3640 def _real_extract(self, url):
3641 m = re.match(self._VALID_URL, url, re.VERBOSE)
3642 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
# NOTE(review): m.group('gameID') is referenced but no 'gameID' group is
# visible in _VALID_URL above — a pattern line may be missing from this chunk.
3643 gameID = m.group('gameID')
3644 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3645 webpage = self._download_webpage(videourl, gameID)
3646 mweb = re.finditer(urlRE, webpage)
3647 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3648 titles = re.finditer(namesRE, webpage)
# Pair each movie match with its title in document order.
3650 for vid,vtitle in zip(mweb,titles):
3651 video_id = vid.group('videoID')
3652 title = vtitle.group('videoName')
3653 video_url = vid.group('videoURL')
3655 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3660 'title': unescapeHTML(title)
# Extractor for ustream.tv recorded videos. The media URL is deterministic
# from the video id; title and uploader are scraped from data- attributes.
# NOTE(review): original numbering skips lines 3678-3682 (the result-dict
# opener with id/url/title) — not visible here.
3665 class UstreamIE(InfoExtractor):
3666 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3667 IE_NAME = u'ustream'
3669 def _real_extract(self, url):
3670 m = re.match(self._VALID_URL, url)
3671 video_id = m.group('videoID')
3672 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3673 webpage = self._download_webpage(url, video_id)
3674 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3675 title = m.group('title')
3676 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3677 uploader = m.group('uploader')
3683 'uploader': uploader
# Extractor for rbmaradio.com shows. Metadata is embedded in the page as a
# JSON blob (window.gon.show); the stream URL is the akamai_url plus a
# fixed 256 kbps cbr parameter.
# NOTE(review): original numbering skips lines, so the result-dict opener
# and return are not visible here.
3687 class RBMARadioIE(InfoExtractor):
3688 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3690 def _real_extract(self, url):
3691 m = re.match(self._VALID_URL, url)
3692 video_id = m.group('videoID')
3694 webpage = self._download_webpage(url, video_id)
3695 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3697 raise ExtractorError(u'Cannot find metadata')
3698 json_data = m.group(1)
3701 data = json.loads(json_data)
3702 except ValueError as e:
3703 raise ExtractorError(u'Invalid JSON: ' + str(e))
3705 video_url = data['akamai_url'] + '&cbr=256'
# Derive the extension from the URL path ('.../foo.mp3' -> 'mp3').
3706 url_parts = compat_urllib_parse_urlparse(video_url)
3707 video_ext = url_parts.path.rpartition('.')[2]
3712 'title': data['title'],
3713 'description': data.get('teaser_text'),
3714 'location': data.get('country_of_origin'),
3715 'uploader': data.get('host', {}).get('name'),
3716 'uploader_id': data.get('host', {}).get('slug'),
3717 'thumbnail': data.get('image', {}).get('large_url_2x'),
3718 'duration': data.get('duration'),
# Extractor for youporn.com. Builds one info dict per download link found
# in the page's download list, then filters by the requested --format
# (best / worst / all / a specific one via _specific).
# NOTE(review): original numbering skips lines, so several guards, loop
# openers and returns are not visible here.
3724 class YouPornIE(InfoExtractor):
3724 """Information extractor for youporn.com."""
3725 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3727 def _print_formats(self, formats):
3728 """Print all available formats"""
3729 print(u'Available formats:')
3730 print(u'ext\t\tformat')
3731 print(u'---------------------------------')
3732 for format in formats:
3733 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single entry whose 'format' equals req_format (loop line and
# return are not visible in this chunk).
3735 def _specific(self, req_format, formats):
3737 if(x["format"]==req_format):
3741 def _real_extract(self, url):
3742 mobj = re.match(self._VALID_URL, url)
3744 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3747 video_id = mobj.group('videoid')
# age_verified cookie bypasses the age gate interstitial.
3749 req = compat_urllib_request.Request(url)
3750 req.add_header('Cookie', 'age_verified=1')
3751 webpage = self._download_webpage(req, video_id)
3753 # Get the video title
3754 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3756 raise ExtractorError(u'Unable to extract video title')
3757 video_title = result.group('title').strip()
3759 # Get the video date
3760 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3762 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3765 upload_date = result.group('date').strip()
3767 # Get the video uploader
3768 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3770 self._downloader.to_stderr(u'WARNING: unable to extract uploader')
3771 video_uploader = None
3773 video_uploader = result.group('uploader').strip()
3774 video_uploader = clean_html( video_uploader )
3776 # Get all of the formats available
3777 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3778 result = re.search(DOWNLOAD_LIST_RE, webpage)
3780 raise ExtractorError(u'Unable to extract download list')
3781 download_list_html = result.group('download_list').strip()
3783 # Get all of the links from the page
3784 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3785 links = re.findall(LINK_RE, download_list_html)
3786 if(len(links) == 0):
3787 raise ExtractorError(u'ERROR: no known formats available for video')
3789 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3794 # A link looks like this:
3795 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3796 # A path looks like this:
3797 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3798 video_url = unescapeHTML( link )
3799 path = compat_urllib_parse_urlparse( video_url ).path
3800 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes resolution_bitrate (e.g. '480p_370k_...').
3801 format = path.split('/')[4].split('_')[:2]
3804 format = "-".join( format )
# NOTE(review): size and bitrate are referenced here but their assignments
# are in lines not visible in this chunk.
3805 title = u'%s-%s-%s' % (video_title, size, bitrate)
3810 'uploader': video_uploader,
3811 'upload_date': upload_date,
3816 'description': None,
3820 if self._downloader.params.get('listformats', None):
3821 self._print_formats(formats)
3824 req_format = self._downloader.params.get('format', None)
3825 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3827 if req_format is None or req_format == 'best':
3829 elif req_format == 'worst':
3830 return [formats[-1]]
3831 elif req_format in ('-1', 'all'):
3834 format = self._specific( req_format, formats )
3836 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com. The flv URL and the "Added ..." date are
# scraped from the watch page; the title comes from the URL itself.
# NOTE(review): original numbering skips lines, so some guards/returns are
# not visible here.
3842 class PornotubeIE(InfoExtractor):
3843 """Information extractor for pornotube.com."""
3844 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3846 def _real_extract(self, url):
3847 mobj = re.match(self._VALID_URL, url)
3849 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3852 video_id = mobj.group('videoid')
3853 video_title = mobj.group('title')
3855 # Get webpage content
3856 webpage = self._download_webpage(url, video_id)
3859 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3860 result = re.search(VIDEO_URL_RE, webpage)
3862 self._downloader.trouble(u'ERROR: unable to extract video url')
3864 video_url = compat_urllib_parse.unquote(result.group('url'))
3866 #Get the uploaded date
3867 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3868 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this failure is about the upload date, but the message says
# "unable to extract video title" — misleading error text.
3870 self._downloader.trouble(u'ERROR: unable to extract video title')
3872 upload_date = result.group('date')
3874 info = {'id': video_id,
3877 'upload_date': upload_date,
3878 'title': video_title,
# Extractor for youjizz.com. Two-step scrape: the watch page yields the
# title and an embed-page URL; the embed page yields the actual video URL
# from an addVariable("file", ...) call.
# NOTE(review): original numbering skips lines, so some guards/returns are
# not visible here.
3885 class YouJizzIE(InfoExtractor):
3885 """Information extractor for youjizz.com."""
3886 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3888 def _real_extract(self, url):
3889 mobj = re.match(self._VALID_URL, url)
3891 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3894 video_id = mobj.group('videoid')
3896 # Get webpage content
3897 webpage = self._download_webpage(url, video_id)
3899 # Get the video title
3900 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3902 raise ExtractorError(u'ERROR: unable to extract video title')
3903 video_title = result.group('title').strip()
3905 # Get the embed page
3906 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3908 raise ExtractorError(u'ERROR: unable to extract embed page')
# Replace video_id with the numeric embed id for the second fetch.
3910 embed_page_url = result.group(0).strip()
3911 video_id = result.group('videoid')
3913 webpage = self._download_webpage(embed_page_url, video_id)
3916 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3918 raise ExtractorError(u'ERROR: unable to extract video url')
3919 video_url = result.group('source')
3921 info = {'id': video_id,
3923 'title': video_title,
3926 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes. Plays through the mix via the sets API:
# a random session id is created, then /play and repeated /next calls yield
# one track per iteration until at_last_track is set.
# NOTE(review): original numbering skips lines — e.g. the assignment of
# mix_id (used at 3951/3971) and the result-list handling are not visible.
3930 class EightTracksIE(InfoExtractor):
3932 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3934 def _real_extract(self, url):
3935 mobj = re.match(self._VALID_URL, url)
3937 raise ExtractorError(u'Invalid URL: %s' % url)
3938 playlist_id = mobj.group('id')
3940 webpage = self._download_webpage(url, playlist_id)
3942 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3944 raise ExtractorError(u'Cannot find trax information')
3945 json_like = m.group(1)
3946 data = json.loads(json_like)
# Random session id for the play API; not security-sensitive.
3948 session = str(random.randint(0, 1000000000))
3950 track_count = data['tracks_count']
3951 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3952 next_url = first_url
3954 for i in itertools.count():
3955 api_json = self._download_webpage(next_url, playlist_id,
3956 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3957 errnote=u'Failed to download song information')
3958 api_data = json.loads(api_json)
3959 track_data = api_data[u'set']['track']
3961 'id': track_data['id'],
3962 'url': track_data['track_file_stream_url'],
3963 'title': track_data['performer'] + u' - ' + track_data['name'],
3964 'raw_title': track_data['name'],
3965 'uploader_id': data['user']['login'],
# Stop once the API flags the final track; otherwise request the next one.
3969 if api_data['set']['at_last_track']:
3971 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com. Media and thumbnail URLs are deterministic from
# the video id; title and uploader are scraped from the page.
# NOTE(review): original numbering skips lines 3988-3992 (the result-dict
# opener with id/url/ext/title) — not visible here.
3974 class KeekIE(InfoExtractor):
3975 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3978 def _real_extract(self, url):
3979 m = re.match(self._VALID_URL, url)
3980 video_id = m.group('videoID')
3981 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3982 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3983 webpage = self._download_webpage(url, video_id)
3984 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3985 title = unescapeHTML(m.group('title'))
3986 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3987 uploader = unescapeHTML(m.group('uploader'))
3993 'thumbnail': thumbnail,
3994 'uploader': uploader
# Extractor for ted.com. Handles two URL shapes: a single talk (returns a
# one-element list via _talk_info) and a playlist (iterates talks via
# _playlist_videos_info). _VALID_URL is verbose-mode, so suitable() is
# overridden to pass re.VERBOSE.
# NOTE(review): original numbering skips lines, so some alternation lines
# of the patterns and some returns are not visible here.
3999 class TEDIE(InfoExtractor):
3999 _VALID_URL=r'''http://www.ted.com/
4001 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4003 ((?P<type_talk>talks)) # We have a simple talk
4005 /(?P<name>\w+) # Here goes the name and then ".html"
4008 def suitable(self, url):
4009 """Receives a URL and returns True if suitable for this IE."""
4010 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
4012 def _real_extract(self, url):
4013 m=re.match(self._VALID_URL, url, re.VERBOSE)
4014 if m.group('type_talk'):
4015 return [self._talk_info(url)]
# Playlist branch (the else line is not visible in this chunk).
4017 playlist_id=m.group('playlist_id')
4018 name=m.group('name')
4019 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4020 return self._playlist_videos_info(url,name,playlist_id)
4022 def _talk_video_link(self,mediaSlug):
4023 '''Returns the video link for that mediaSlug'''
4024 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4026 def _playlist_videos_info(self,url,name,playlist_id=0):
4027 '''Returns the videos of the playlist'''
# Verbose-mode pattern matching each talk <li> and its data- attributes
# (the pattern's opening line is not visible in this chunk).
4029 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4030 ([.\s]*?)data-playlist_item_id="(\d+)"
4031 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4033 video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
4034 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4035 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4036 m_names=re.finditer(video_name_RE,webpage)
# Pair each talk entry with its title in document order.
4038 for m_video, m_name in zip(m_videos,m_names):
4040 'id': m_video.group('video_id'),
4041 'url': self._talk_video_link(m_video.group('mediaSlug')),
4043 'title': m_name.group('fullname')
4045 info.append(video_dic)
4047 def _talk_info(self, url, video_id=0):
4048 """Return the video for the talk in the url"""
4049 m=re.match(self._VALID_URL, url,re.VERBOSE)
4050 videoName=m.group('name')
4051 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4052 # If the url includes the language we get the title translated
4053 title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
4054 title=re.search(title_RE, webpage).group('title')
# talkDetails JSON blob carries the numeric id and the download mediaSlug.
4055 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4056 "id":(?P<videoID>[\d]+).*?
4057 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4058 info_match=re.search(info_RE,webpage,re.VERBOSE)
4059 video_id=info_match.group('videoID')
4060 mediaSlug=info_match.group('mediaSlug')
4061 video_url=self._talk_video_link(mediaSlug)
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is taken from the URL path and used to fetch an XML
    metadata document, from which the flv URL, title, format, description
    and preview image are read.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            # trailing slash: the last element is empty, use the one before
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        # ElementTree wants bytes; the webpage helper returns unicode text
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            self._downloader.trouble(u'ERROR: unable to extract format id')
            return
        # renamed from `format` to avoid shadowing the builtin
        video_format = format_id_el.text
        # description and thumbnail are optional fields
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        # NOTE(review): result assembly reconstructed; the original tail of
        # this method was missing from the truncated source chunk.
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4126 def gen_extractors():
4127 """ Return a list of an instance of every supported extractor.
4128 The order does matter; the first extractor matched is the one handling the URL.
4131 YoutubePlaylistIE(),
4155 StanfordOpenClassroomIE(),