2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
# NOTE(review): this dump has many original lines elided (the closing
# docstring quotes, decorators, try:/else: scaffolding). Every comment
# below that depends on elided context is hedged — confirm against the
# complete file before acting on it.
# Constructor: stores the (optional) downloader via set_downloader().
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
# Presumably a @classmethod upstream (first parameter is `cls`; the
# decorator line is elided here) — TODO confirm.
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
# Orphaned docstring: the enclosing `def` line (a _WORKING getter,
# per the docstring) is elided from this dump.
84 """Getter method for _WORKING."""
# Public initialize entry point; its `def` line is elided. Delegates
# real work (authentication etc.) to the subclass hook.
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
# Public extraction entry point; delegates to the subclass hook.
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
# Subclass hooks: default implementations are no-ops (bodies elided).
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# Derives the IE name from the class name by stripping the trailing
# "IE" suffix (e.g. YoutubeIE -> "Youtube"). The surrounding def /
# property lines are elided here.
112 return type(self).__name__[:-2]
# Opens the URL and returns the raw response handle. Passing
# note=False suppresses the progress message (see the `is not False`
# test below); network errors are re-raised as ExtractorError.
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 note = u'Downloading video webpage'
118 if note is not False:
119 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
# Downloads a page and decodes it to text, sniffing the charset from
# the Content-Type header. The fallback branch for when the charset
# regex does not match (default encoding) is elided from this dump.
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 return webpage_bytes.decode(encoding, 'replace')
140 class YoutubeIE(InfoExtractor):
141 """Information extractor for youtube.com."""
145 (?:https?://)? # http(s):// (optional)
146 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
147 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
148 (?:.*?\#/)? # handle anchor (#/) redirect urls
149 (?: # the various things that can precede the ID:
150 (?:(?:v|embed|e)/) # v/ or embed/ or e/
151 |(?: # or the v= param in all its forms
152 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
153 (?:\?|\#!?) # the params delimiter ? or # or #!
154 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
157 )? # optional -> youtube.com/xxxx is OK
158 )? # all until now is optional -> you can pass the naked ID
159 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
160 (?(1).+)? # if we found the ID, everything can follow
162 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
163 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
164 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
165 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
166 _NETRC_MACHINE = 'youtube'
167 # Listed in order of quality
168 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
169 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
170 _video_extensions = {
176 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
182 _video_dimensions = {
# Presumably a @classmethod upstream (first parameter is `cls`; the
# decorator line is elided). Playlist URLs are explicitly handed off
# to YoutubePlaylistIE; note the regex here needs re.VERBOSE, unlike
# the base-class suitable().
201 def suitable(cls, url):
202 """Receives a URL and returns True if suitable for this IE."""
203 if YoutubePlaylistIE.suitable(url): return False
204 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# --- Progress-reporting helpers: each just prints one status line. ---
206 def report_lang(self):
207 """Report attempt to set language."""
208 self._downloader.to_screen(u'[youtube] Setting language')
210 def report_login(self):
211 """Report attempt to log in."""
212 self._downloader.to_screen(u'[youtube] Logging in')
214 def report_age_confirmation(self):
215 """Report attempt to confirm age."""
216 self._downloader.to_screen(u'[youtube] Confirming age')
218 def report_video_webpage_download(self, video_id):
219 """Report attempt to download video webpage."""
220 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
222 def report_video_info_webpage_download(self, video_id):
223 """Report attempt to download video info webpage."""
224 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
226 def report_video_subtitles_download(self, video_id):
227 """Report attempt to download video info webpage."""
228 self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
230 def report_video_subtitles_request(self, video_id, sub_lang, format):
231 """Report attempt to download video info webpage."""
232 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
234 def report_video_subtitles_available(self, video_id, sub_lang_list):
235 """Report available subtitles."""
236 sub_lang = ",".join(list(sub_lang_list.keys()))
237 self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
239 def report_information_extraction(self, video_id):
240 """Report attempt to extract video information."""
241 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
243 def report_unavailable_format(self, video_id, format):
244 """Report extracted video URL."""
245 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
247 def report_rtmp_download(self):
248 """Indicate the download will use the RTMP protocol."""
249 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Fetches the timedtext track list. NOTE: on failure it returns an
# (error_message, None) tuple rather than raising, while on success it
# presumably returns the {lang_code: name} dict built below (the
# success return line is elided) — callers distinguish the two cases
# with isinstance(..., tuple).
251 def _get_available_subtitles(self, video_id):
252 self.report_video_subtitles_download(video_id)
253 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
255 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
256 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
257 return (u'unable to download video subtitles: %s' % compat_str(err), None)
258 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
259 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
260 if not sub_lang_list:
261 return (u'video doesn\'t have subtitles', None)
264 def _list_available_subtitles(self, video_id):
265 sub_lang_list = self._get_available_subtitles(video_id)
266 self.report_video_subtitles_available(video_id, sub_lang_list)
# Downloads one subtitle track; returns an (error, sub_lang, sub)
# triple where exactly one of error / sub is None.
268 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
271 (error_message, sub_lang, sub)
273 self.report_video_subtitles_request(video_id, sub_lang, format)
274 params = compat_urllib_parse.urlencode({
280 url = 'http://www.youtube.com/api/timedtext?' + params
282 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
283 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
284 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
286 return (u'Did not fetch video subtitles', None, None)
287 return (None, sub_lang, sub)
# Picks one language (user-requested, 'en', or the first available)
# and fetches it. Always returns a one-element list of the triple.
289 def _extract_subtitle(self, video_id):
291 Return a list with a tuple:
292 [(error_message, sub_lang, sub)]
294 sub_lang_list = self._get_available_subtitles(video_id)
295 sub_format = self._downloader.params.get('subtitlesformat')
296 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
297 return [(sub_lang_list[0], None, None)]
298 if self._downloader.params.get('subtitleslang', False):
299 sub_lang = self._downloader.params.get('subtitleslang')
300 elif 'en' in sub_lang_list:
303 sub_lang = list(sub_lang_list.keys())[0]
304 if not sub_lang in sub_lang_list:
305 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
307 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Same as above but fetches every available language; returns a list
# of (error, sub_lang, sub) triples.
310 def _extract_all_subtitles(self, video_id):
311 sub_lang_list = self._get_available_subtitles(video_id)
312 sub_format = self._downloader.params.get('subtitlesformat')
313 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
314 return [(sub_lang_list[0], None, None)]
316 for sub_lang in sub_lang_list:
317 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
318 subtitles.append(subtitle)
# Prints the itag / extension / dimensions table for --list-formats.
321 def _print_formats(self, formats):
322 print('Available formats:')
324 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Initialization: set interface language, then authenticate (explicit
# username/password or .netrc), then confirm age. All network errors
# downgrade to warnings except age confirmation, which is an error.
# Many try:/return lines of this flow are elided from this dump.
326 def _real_initialize(self):
327 if self._downloader is None:
332 downloader_params = self._downloader.params
334 # Attempt to use provided username and password or .netrc data
335 if downloader_params.get('username', None) is not None:
336 username = downloader_params['username']
337 password = downloader_params['password']
338 elif downloader_params.get('usenetrc', False):
340 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
345 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
346 except (IOError, netrc.NetrcParseError) as err:
347 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
351 request = compat_urllib_request.Request(self._LANG_URL)
354 compat_urllib_request.urlopen(request).read()
355 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
356 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
359 # No authentication to be performed
363 request = compat_urllib_request.Request(self._LOGIN_URL)
365 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
366 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
367 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
372 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
374 galx = match.group(1)
376 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
382 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
386 u'PersistentCookie': u'yes',
388 u'bgresponse': u'js_disabled',
389 u'checkConnection': u'',
390 u'checkedDomains': u'youtube',
396 u'signIn': u'Sign in',
398 u'service': u'youtube',
402 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
404 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
405 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
406 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
409 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
410 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
411 self._downloader.report_warning(u'unable to log in: bad username or password')
413 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
414 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
420 'action_confirm': 'Confirm',
422 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
424 self.report_age_confirmation()
425 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
426 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
427 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
# Pulls the video id out of a URL via _VALID_URL. The id is group(2):
# group(1) is the earlier capturing group in the verbose regex whose
# presence gates the trailing (?(1).+)? conditional.
430 def _extract_id(self, url):
431 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
433 self._downloader.report_error(u'invalid URL: %s' % url)
435 video_id = mobj.group(2)
# Main extraction pipeline: resolve next_url redirects, download the
# watch page and get_video_info, then pull uploader/title/thumbnail/
# date/description/subtitles/duration and build the per-format result
# dicts. NOTE(review): many control-flow lines (try:, else:, return)
# are elided in this dump — confirm against the complete file.
438 def _real_extract(self, url):
439 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
440 mobj = re.search(self._NEXT_URL_RE, url)
442 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
443 video_id = self._extract_id(url)
446 self.report_video_webpage_download(video_id)
447 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
448 request = compat_urllib_request.Request(url)
450 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
451 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
452 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
455 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
457 # Attempt to extract SWF player URL
458 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
460 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
465 self.report_video_info_webpage_download(video_id)
466 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
467 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
468 % (video_id, el_type))
469 video_info_webpage = self._download_webpage(video_info_url, video_id,
471 errnote='unable to download video info webpage')
472 video_info = compat_parse_qs(video_info_webpage)
473 if 'token' in video_info:
475 if 'token' not in video_info:
476 if 'reason' in video_info:
477 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
479 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
482 # Check for "rental" videos
483 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
484 self._downloader.report_error(u'"rental" videos not supported')
487 # Start extracting information
488 self.report_information_extraction(video_id)
491 if 'author' not in video_info:
492 self._downloader.report_error(u'unable to extract uploader name')
494 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
497 video_uploader_id = None
498 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
500 video_uploader_id = mobj.group(1)
502 self._downloader.report_warning(u'unable to extract uploader nickname')
505 if 'title' not in video_info:
506 self._downloader.report_error(u'unable to extract video title')
508 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
511 if 'thumbnail_url' not in video_info:
512 self._downloader.report_warning(u'unable to extract video thumbnail')
514 else: # don't panic if we can't find it
515 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalized by trying
# several textual date formats until one parses.
519 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
521 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
522 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
523 for expression in format_expressions:
525 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
530 video_description = get_element_by_id("eow-description", video_webpage)
531 if video_description:
532 video_description = clean_html(video_description)
534 video_description = ''
537 video_subtitles = None
539 if self._downloader.params.get('writesubtitles', False):
540 video_subtitles = self._extract_subtitle(video_id)
542 (sub_error, sub_lang, sub) = video_subtitles[0]
544 self._downloader.report_error(sub_error)
546 if self._downloader.params.get('allsubtitles', False):
547 video_subtitles = self._extract_all_subtitles(video_id)
548 for video_subtitle in video_subtitles:
549 (sub_error, sub_lang, sub) = video_subtitle
551 self._downloader.report_error(sub_error)
553 if self._downloader.params.get('listsubtitles', False):
554 sub_lang_list = self._list_available_subtitles(video_id)
557 if 'length_seconds' not in video_info:
558 self._downloader.report_warning(u'unable to extract video duration')
561 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
564 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
566 # Decide which formats to download
567 req_format = self._downloader.params.get('format', None)
569 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
570 self.report_rtmp_download()
571 video_url_list = [(None, video_info['conn'][0])]
572 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
573 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
574 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
575 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
576 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
578 format_limit = self._downloader.params.get('format_limit', None)
579 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
580 if format_limit is not None and format_limit in available_formats:
581 format_list = available_formats[available_formats.index(format_limit):]
583 format_list = available_formats
584 existing_formats = [x for x in format_list if x in url_map]
585 if len(existing_formats) == 0:
586 self._downloader.report_error(u'no known formats available for video')
588 if self._downloader.params.get('listformats', None):
589 self._print_formats(existing_formats)
591 if req_format is None or req_format == 'best':
592 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
593 elif req_format == 'worst':
594 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
595 elif req_format in ('-1', 'all'):
596 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
598 # Specific formats. We pick the first in a slash-delimeted sequence.
599 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
600 req_formats = req_format.split('/')
601 video_url_list = None
602 for rf in req_formats:
604 video_url_list = [(rf, url_map[rf])]
606 if video_url_list is None:
607 self._downloader.report_error(u'requested format not available')
610 self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
# One result dict per selected (itag, url) pair; presumably appended
# to the returned results list (the surrounding collection lines are
# elided from this dump).
614 for format_param, video_real_url in video_url_list:
616 video_extension = self._video_extensions.get(format_param, 'flv')
618 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
619 self._video_dimensions.get(format_param, '???'))
623 'url': video_real_url,
624 'uploader': video_uploader,
625 'uploader_id': video_uploader_id,
626 'upload_date': upload_date,
627 'title': video_title,
628 'ext': video_extension,
629 'format': video_format,
630 'thumbnail': video_thumbnail,
631 'description': video_description,
632 'player_url': player_url,
633 'subtitles': video_subtitles,
634 'duration': video_duration
639 class MetacafeIE(InfoExtractor):
640 """Information Extractor for metacafe.com."""
642 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
643 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
644 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
645 IE_NAME = u'metacafe'
647 def __init__(self, downloader=None):
648 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers. ---
650 def report_disclaimer(self):
651 """Report disclaimer retrieval."""
652 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
654 def report_age_confirmation(self):
655 """Report attempt to confirm age."""
656 self._downloader.to_screen(u'[metacafe] Confirming age')
658 def report_download_webpage(self, video_id):
659 """Report webpage download."""
660 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
662 def report_extraction(self, video_id):
663 """Report information extraction."""
664 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the disclaimer page, then POSTs the family-filter form to
# disable the age gate for the session. NOTE(review): try: lines are
# elided from this dump.
666 def _real_initialize(self):
667 # Retrieve disclaimer
668 request = compat_urllib_request.Request(self._DISCLAIMER)
670 self.report_disclaimer()
671 disclaimer = compat_urllib_request.urlopen(request).read()
672 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
673 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
679 'submit': "Continue - I'm over 18",
681 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
683 self.report_age_confirmation()
684 disclaimer = compat_urllib_request.urlopen(request).read()
685 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
686 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
# Extraction: ids of the form "yt-<id>" are YouTube embeds and are
# delegated to the downloader as a youtube.com URL; otherwise the
# media URL is taken from &mediaURL= (plus &gdaKey=) or, failing
# that, from the flashvars "mediaData" JSON-ish blob.
# NOTE(review): the .decode('utf-8') calls on regex/match results
# indicate Python-2 bytes handling — confirm before porting.
689 def _real_extract(self, url):
690 # Extract id and simplified title from URL
691 mobj = re.match(self._VALID_URL, url)
693 self._downloader.report_error(u'invalid URL: %s' % url)
696 video_id = mobj.group(1)
698 # Check if video comes from YouTube
699 mobj2 = re.match(r'^yt-(.*)$', video_id)
700 if mobj2 is not None:
701 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
704 # Retrieve video webpage to extract further information
705 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
707 self.report_download_webpage(video_id)
708 webpage = compat_urllib_request.urlopen(request).read()
709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
710 self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
713 # Extract URL, uploader and title from webpage
714 self.report_extraction(video_id)
715 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
717 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
718 video_extension = mediaURL[-3:]
720 # Extract gdaKey if available
721 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
725 gdaKey = mobj.group(1)
726 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
728 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
730 self._downloader.report_error(u'unable to extract media URL')
732 vardict = compat_parse_qs(mobj.group(1))
733 if 'mediaData' not in vardict:
734 self._downloader.report_error(u'unable to extract media URL')
736 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
738 self._downloader.report_error(u'unable to extract media URL')
740 mediaURL = mobj.group(1).replace('\\/', '/')
741 video_extension = mediaURL[-3:]
742 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
744 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
746 self._downloader.report_error(u'unable to extract title')
748 video_title = mobj.group(1).decode('utf-8')
750 mobj = re.search(r'submitter=(.*?);', webpage)
752 self._downloader.report_error(u'unable to extract uploader nickname')
754 video_uploader = mobj.group(1)
757 'id': video_id.decode('utf-8'),
758 'url': video_url.decode('utf-8'),
759 'uploader': video_uploader.decode('utf-8'),
761 'title': video_title,
762 'ext': video_extension.decode('utf-8'),
766 class DailymotionIE(InfoExtractor):
767 """Information Extractor for Dailymotion"""
769 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
770 IE_NAME = u'dailymotion'
773 def __init__(self, downloader=None):
774 InfoExtractor.__init__(self, downloader)
776 def report_extraction(self, video_id):
777 """Report information extraction."""
778 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# Extraction: fetches the page with the family filter disabled via a
# cookie, parses `var flashvars`, and picks the best quality present
# from the ordered key list (hd1080 down to video_url). The uploader
# comes from the page markup, falling back to the rel="author" span.
# NOTE(review): several branch/assignment lines are elided from this
# dump (e.g. where max_quality is set inside the key loop).
780 def _real_extract(self, url):
781 # Extract id and simplified title from URL
782 mobj = re.match(self._VALID_URL, url)
784 self._downloader.report_error(u'invalid URL: %s' % url)
787 video_id = mobj.group(1).split('_')[0].split('?')[0]
789 video_extension = 'mp4'
791 # Retrieve video webpage to extract further information
792 request = compat_urllib_request.Request(url)
793 request.add_header('Cookie', 'family_filter=off')
794 webpage = self._download_webpage(request, video_id)
796 # Extract URL, uploader and title from webpage
797 self.report_extraction(video_id)
798 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
800 self._downloader.report_error(u'unable to extract media URL')
802 flashvars = compat_urllib_parse.unquote(mobj.group(1))
804 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
807 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
810 self._downloader.report_error(u'unable to extract video URL')
813 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
815 self._downloader.report_error(u'unable to extract video URL')
818 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
820 # TODO: support choosing qualities
822 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
824 self._downloader.report_error(u'unable to extract title')
826 video_title = unescapeHTML(mobj.group('title'))
828 video_uploader = None
829 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
831 # lookin for official user
832 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
833 if mobj_official is None:
834 self._downloader.report_warning(u'unable to extract uploader nickname')
836 video_uploader = mobj_official.group(1)
838 video_uploader = mobj.group(1)
# Upload date is scraped as DD-MM-YYYY and recomposed to YYYYMMDD.
840 video_upload_date = None
841 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
843 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
848 'uploader': video_uploader,
849 'upload_date': video_upload_date,
850 'title': video_title,
851 'ext': video_extension,
855 class PhotobucketIE(InfoExtractor):
856 """Information extractor for photobucket.com."""
858 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
859 IE_NAME = u'photobucket'
861 def __init__(self, downloader=None):
862 InfoExtractor.__init__(self, downloader)
# --- Progress-reporting helpers. ---
864 def report_download_webpage(self, video_id):
865 """Report webpage download."""
866 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
868 def report_extraction(self, video_id):
869 """Report information extraction."""
870 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# Extraction: the .flv name is captured from the URL itself; the
# media URL comes from the page's video_src <link>, and title /
# uploader from the <title> tag. NOTE(review): try: lines and the
# video_url assignment after the unquote are elided from this dump;
# .decode('utf-8') on str values indicates Python-2-era code.
872 def _real_extract(self, url):
873 # Extract id from URL
874 mobj = re.match(self._VALID_URL, url)
876 self._downloader.report_error(u'Invalid URL: %s' % url)
879 video_id = mobj.group(1)
881 video_extension = 'flv'
883 # Retrieve video webpage to extract further information
884 request = compat_urllib_request.Request(url)
886 self.report_download_webpage(video_id)
887 webpage = compat_urllib_request.urlopen(request).read()
888 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
889 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
892 # Extract URL, uploader, and title from webpage
893 self.report_extraction(video_id)
894 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
896 self._downloader.report_error(u'unable to extract media URL')
898 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
902 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
904 self._downloader.report_error(u'unable to extract title')
906 video_title = mobj.group(1).decode('utf-8')
908 video_uploader = mobj.group(2).decode('utf-8')
911 'id': video_id.decode('utf-8'),
912 'url': video_url.decode('utf-8'),
913 'uploader': video_uploader,
915 'title': video_title,
916 'ext': video_extension.decode('utf-8'),
920 class YahooIE(InfoExtractor):
921 """Information extractor for video.yahoo.com."""
924 # _VALID_URL matches all Yahoo! Video URLs
925 # _VPAGE_URL matches only the extractable '/watch/' URLs
926 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
927 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
928 IE_NAME = u'video.yahoo'
930 def __init__(self, downloader=None):
931 InfoExtractor.__init__(self, downloader)
933 def report_download_webpage(self, video_id):
934 """Report webpage download."""
935 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
937 def report_extraction(self, video_id):
938 """Report information extraction."""
939 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
941 def _real_extract(self, url, new_video=True):
942 # Extract ID from URL
943 mobj = re.match(self._VALID_URL, url)
945 self._downloader.report_error(u'Invalid URL: %s' % url)
948 video_id = mobj.group(2)
949 video_extension = 'flv'
951 # Rewrite valid but non-extractable URLs as
952 # extractable English language /watch/ URLs
953 if re.match(self._VPAGE_URL, url) is None:
954 request = compat_urllib_request.Request(url)
956 webpage = compat_urllib_request.urlopen(request).read()
957 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
958 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
961 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
963 self._downloader.report_error(u'Unable to extract id field')
965 yahoo_id = mobj.group(1)
967 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
969 self._downloader.report_error(u'Unable to extract vid field')
971 yahoo_vid = mobj.group(1)
973 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
974 return self._real_extract(url, new_video=False)
976 # Retrieve video webpage to extract further information
977 request = compat_urllib_request.Request(url)
979 self.report_download_webpage(video_id)
980 webpage = compat_urllib_request.urlopen(request).read()
981 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
982 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
985 # Extract uploader and title from webpage
986 self.report_extraction(video_id)
987 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
989 self._downloader.report_error(u'unable to extract video title')
991 video_title = mobj.group(1).decode('utf-8')
993 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
995 self._downloader.report_error(u'unable to extract video uploader')
997 video_uploader = mobj.group(1).decode('utf-8')
999 # Extract video thumbnail
1000 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1002 self._downloader.report_error(u'unable to extract video thumbnail')
1004 video_thumbnail = mobj.group(1).decode('utf-8')
1006 # Extract video description
1007 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1009 self._downloader.report_error(u'unable to extract video description')
1011 video_description = mobj.group(1).decode('utf-8')
1012 if not video_description:
1013 video_description = 'No description available.'
1015 # Extract video height and width
1016 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1018 self._downloader.report_error(u'unable to extract video height')
1020 yv_video_height = mobj.group(1)
1022 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1024 self._downloader.report_error(u'unable to extract video width')
1026 yv_video_width = mobj.group(1)
1028 # Retrieve video playlist to extract media URL
1029 # I'm not completely sure what all these options are, but we
1030 # seem to need most of them, otherwise the server sends a 401.
1031 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1032 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1033 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1034 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1035 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1037 self.report_download_webpage(video_id)
1038 webpage = compat_urllib_request.urlopen(request).read()
1039 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1040 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1043 # Extract media URL from playlist XML
1044 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1046 self._downloader.report_error(u'Unable to extract media URL')
1048 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1049 video_url = unescapeHTML(video_url)
1052 'id': video_id.decode('utf-8'),
1054 'uploader': video_uploader,
1055 'upload_date': None,
1056 'title': video_title,
1057 'ext': video_extension.decode('utf-8'),
1058 'thumbnail': video_thumbnail.decode('utf-8'),
1059 'description': video_description,
1063 class VimeoIE(InfoExtractor):
1064 """Information extractor for vimeo.com."""
1066 # _VALID_URL matches Vimeo URLs
1067 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1070 def __init__(self, downloader=None):
1071 InfoExtractor.__init__(self, downloader)
1073 def report_download_webpage(self, video_id):
1074 """Report webpage download."""
1075 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1077 def report_extraction(self, video_id):
1078 """Report information extraction."""
1079 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1081 def _real_extract(self, url, new_video=True):
1082 # Extract ID from URL
1083 mobj = re.match(self._VALID_URL, url)
1085 self._downloader.report_error(u'Invalid URL: %s' % url)
1088 video_id = mobj.group('id')
1089 if not mobj.group('proto'):
1090 url = 'https://' + url
1091 if mobj.group('direct_link'):
1092 url = 'https://vimeo.com/' + video_id
1094 # Retrieve video webpage to extract further information
1095 request = compat_urllib_request.Request(url, None, std_headers)
1097 self.report_download_webpage(video_id)
1098 webpage_bytes = compat_urllib_request.urlopen(request).read()
1099 webpage = webpage_bytes.decode('utf-8')
1100 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1101 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1104 # Now we begin extracting as much information as we can from what we
1105 # retrieved. First we extract the information common to all extractors,
1106 # and latter we extract those that are Vimeo specific.
1107 self.report_extraction(video_id)
1109 # Extract the config JSON
1111 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1112 config = json.loads(config)
1114 self._downloader.report_error(u'unable to extract info section')
1118 video_title = config["video"]["title"]
1120 # Extract uploader and uploader_id
1121 video_uploader = config["video"]["owner"]["name"]
1122 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1124 # Extract video thumbnail
1125 video_thumbnail = config["video"]["thumbnail"]
1127 # Extract video description
1128 video_description = get_element_by_attribute("itemprop", "description", webpage)
1129 if video_description: video_description = clean_html(video_description)
1130 else: video_description = u''
1132 # Extract upload date
1133 video_upload_date = None
1134 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1135 if mobj is not None:
1136 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1138 # Vimeo specific: extract request signature and timestamp
1139 sig = config['request']['signature']
1140 timestamp = config['request']['timestamp']
1142 # Vimeo specific: extract video codec and quality information
1143 # First consider quality, then codecs, then take everything
1144 # TODO bind to format param
1145 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1146 files = { 'hd': [], 'sd': [], 'other': []}
1147 for codec_name, codec_extension in codecs:
1148 if codec_name in config["video"]["files"]:
1149 if 'hd' in config["video"]["files"][codec_name]:
1150 files['hd'].append((codec_name, codec_extension, 'hd'))
1151 elif 'sd' in config["video"]["files"][codec_name]:
1152 files['sd'].append((codec_name, codec_extension, 'sd'))
1154 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1156 for quality in ('hd', 'sd', 'other'):
1157 if len(files[quality]) > 0:
1158 video_quality = files[quality][0][2]
1159 video_codec = files[quality][0][0]
1160 video_extension = files[quality][0][1]
1161 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1164 self._downloader.report_error(u'no known codec found')
1167 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1168 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1173 'uploader': video_uploader,
1174 'uploader_id': video_uploader_id,
1175 'upload_date': video_upload_date,
1176 'title': video_title,
1177 'ext': video_extension,
1178 'thumbnail': video_thumbnail,
1179 'description': video_description,
1183 class ArteTvIE(InfoExtractor):
1184 """arte.tv information extractor."""
1186 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1187 _LIVE_URL = r'index-[0-9]+\.html$'
1189 IE_NAME = u'arte.tv'
1191 def __init__(self, downloader=None):
1192 InfoExtractor.__init__(self, downloader)
1194 def report_download_webpage(self, video_id):
1195 """Report webpage download."""
1196 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1198 def report_extraction(self, video_id):
1199 """Report information extraction."""
1200 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1202 def fetch_webpage(self, url):
1203 request = compat_urllib_request.Request(url)
1205 self.report_download_webpage(url)
1206 webpage = compat_urllib_request.urlopen(request).read()
1207 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1208 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1210 except ValueError as err:
1211 self._downloader.report_error(u'Invalid URL: %s' % url)
1215 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1216 page = self.fetch_webpage(url)
1217 mobj = re.search(regex, page, regexFlags)
1221 self._downloader.report_error(u'Invalid URL: %s' % url)
1224 for (i, key, err) in matchTuples:
1225 if mobj.group(i) is None:
1226 self._downloader.trouble(err)
1229 info[key] = mobj.group(i)
1233 def extractLiveStream(self, url):
1234 video_lang = url.split('/')[-4]
1235 info = self.grep_webpage(
1237 r'src="(.*?/videothek_js.*?\.js)',
1240 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1243 http_host = url.split('/')[2]
1244 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1245 info = self.grep_webpage(
1247 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1248 '(http://.*?\.swf).*?' +
1252 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1253 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1254 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1257 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1259 def extractPlus7Stream(self, url):
1260 video_lang = url.split('/')[-3]
1261 info = self.grep_webpage(
1263 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1266 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1269 next_url = compat_urllib_parse.unquote(info.get('url'))
1270 info = self.grep_webpage(
1272 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1275 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1278 next_url = compat_urllib_parse.unquote(info.get('url'))
1280 info = self.grep_webpage(
1282 r'<video id="(.*?)".*?>.*?' +
1283 '<name>(.*?)</name>.*?' +
1284 '<dateVideo>(.*?)</dateVideo>.*?' +
1285 '<url quality="hd">(.*?)</url>',
1288 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1289 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1290 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1291 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1296 'id': info.get('id'),
1297 'url': compat_urllib_parse.unquote(info.get('url')),
1298 'uploader': u'arte.tv',
1299 'upload_date': info.get('date'),
1300 'title': info.get('title').decode('utf-8'),
1306 def _real_extract(self, url):
1307 video_id = url.split('/')[-1]
1308 self.report_extraction(video_id)
1310 if re.search(self._LIVE_URL, video_id) is not None:
1311 self.extractLiveStream(url)
1314 info = self.extractPlus7Stream(url)
1319 class GenericIE(InfoExtractor):
1320 """Generic last-resort information extractor."""
1323 IE_NAME = u'generic'
1325 def __init__(self, downloader=None):
1326 InfoExtractor.__init__(self, downloader)
1328 def report_download_webpage(self, video_id):
1329 """Report webpage download."""
1330 if not self._downloader.params.get('test', False):
1331 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1332 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1334 def report_extraction(self, video_id):
1335 """Report information extraction."""
1336 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1338 def report_following_redirect(self, new_url):
1339 """Report information extraction."""
1340 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1342 def _test_redirect(self, url):
1343 """Check if it is a redirect, like url shorteners, in case restart chain."""
1344 class HeadRequest(compat_urllib_request.Request):
1345 def get_method(self):
1348 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1350 Subclass the HTTPRedirectHandler to make it use our
1351 HeadRequest also on the redirected URL
1353 def redirect_request(self, req, fp, code, msg, headers, newurl):
1354 if code in (301, 302, 303, 307):
1355 newurl = newurl.replace(' ', '%20')
1356 newheaders = dict((k,v) for k,v in req.headers.items()
1357 if k.lower() not in ("content-length", "content-type"))
1358 return HeadRequest(newurl,
1360 origin_req_host=req.get_origin_req_host(),
1363 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1365 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1367 Fallback to GET if HEAD is not allowed (405 HTTP error)
1369 def http_error_405(self, req, fp, code, msg, headers):
1373 newheaders = dict((k,v) for k,v in req.headers.items()
1374 if k.lower() not in ("content-length", "content-type"))
1375 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1377 origin_req_host=req.get_origin_req_host(),
1381 opener = compat_urllib_request.OpenerDirector()
1382 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1383 HTTPMethodFallback, HEADRedirectHandler,
1384 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1385 opener.add_handler(handler())
1387 response = opener.open(HeadRequest(url))
1388 new_url = response.geturl()
1393 self.report_following_redirect(new_url)
1394 self._downloader.download([new_url])
1397 def _real_extract(self, url):
1398 if self._test_redirect(url): return
1400 video_id = url.split('/')[-1]
1402 webpage = self._download_webpage(url, video_id)
1403 except ValueError as err:
1404 # since this is the last-resort InfoExtractor, if
1405 # this error is thrown, it'll be thrown here
1406 self._downloader.report_error(u'Invalid URL: %s' % url)
1409 self.report_extraction(video_id)
1410 # Start with something easy: JW Player in SWFObject
1411 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1413 # Broaden the search a little bit
1414 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1416 # Broaden the search a little bit: JWPlayer JS loader
1417 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1419 self._downloader.report_error(u'Invalid URL: %s' % url)
1422 # It's possible that one of the regexes
1423 # matched, but returned an empty group:
1424 if mobj.group(1) is None:
1425 self._downloader.report_error(u'Invalid URL: %s' % url)
1428 video_url = compat_urllib_parse.unquote(mobj.group(1))
1429 video_id = os.path.basename(video_url)
1431 # here's a fun little line of code for you:
1432 video_extension = os.path.splitext(video_id)[1][1:]
1433 video_id = os.path.splitext(video_id)[0]
1435 # it's tempting to parse this further, but you would
1436 # have to take into account all the variations like
1437 # Video Title - Site Name
1438 # Site Name | Video Title
1439 # Video Title - Tagline | Site Name
1440 # and so on and so forth; it's just not practical
1441 mobj = re.search(r'<title>(.*)</title>', webpage)
1443 self._downloader.report_error(u'unable to extract title')
1445 video_title = mobj.group(1)
1447 # video uploader is domain name
1448 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1450 self._downloader.report_error(u'unable to extract title')
1452 video_uploader = mobj.group(1)
1457 'uploader': video_uploader,
1458 'upload_date': None,
1459 'title': video_title,
1460 'ext': video_extension,
1464 class YoutubeSearchIE(InfoExtractor):
1465 """Information Extractor for YouTube search queries."""
1466 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1467 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1468 _max_youtube_results = 1000
1469 IE_NAME = u'youtube:search'
1471 def __init__(self, downloader=None):
1472 InfoExtractor.__init__(self, downloader)
1474 def report_download_page(self, query, pagenum):
1475 """Report attempt to download search page with given number."""
1476 query = query.decode(preferredencoding())
1477 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1479 def _real_extract(self, query):
1480 mobj = re.match(self._VALID_URL, query)
1482 self._downloader.report_error(u'invalid search query "%s"' % query)
1485 prefix, query = query.split(':')
1487 query = query.encode('utf-8')
1489 self._download_n_results(query, 1)
1491 elif prefix == 'all':
1492 self._download_n_results(query, self._max_youtube_results)
1498 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1500 elif n > self._max_youtube_results:
1501 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1502 n = self._max_youtube_results
1503 self._download_n_results(query, n)
1505 except ValueError: # parsing prefix as integer fails
1506 self._download_n_results(query, 1)
1509 def _download_n_results(self, query, n):
1510 """Downloads a specified number of results for a query"""
1516 while (50 * pagenum) < limit:
1517 self.report_download_page(query, pagenum+1)
1518 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1519 request = compat_urllib_request.Request(result_url)
1521 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1522 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1523 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1525 api_response = json.loads(data)['data']
1527 if not 'items' in api_response:
1528 self._downloader.trouble(u'[youtube] No video results')
1531 new_ids = list(video['id'] for video in api_response['items'])
1532 video_ids += new_ids
1534 limit = min(n, api_response['totalItems'])
1537 if len(video_ids) > n:
1538 video_ids = video_ids[:n]
1539 for id in video_ids:
1540 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1544 class GoogleSearchIE(InfoExtractor):
1545 """Information Extractor for Google Video search queries."""
1546 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1547 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1548 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1549 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1550 _max_google_results = 1000
1551 IE_NAME = u'video.google:search'
1553 def __init__(self, downloader=None):
1554 InfoExtractor.__init__(self, downloader)
1556 def report_download_page(self, query, pagenum):
1557 """Report attempt to download playlist page with given number."""
1558 query = query.decode(preferredencoding())
1559 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1561 def _real_extract(self, query):
1562 mobj = re.match(self._VALID_URL, query)
1564 self._downloader.report_error(u'invalid search query "%s"' % query)
1567 prefix, query = query.split(':')
1569 query = query.encode('utf-8')
1571 self._download_n_results(query, 1)
1573 elif prefix == 'all':
1574 self._download_n_results(query, self._max_google_results)
1580 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1582 elif n > self._max_google_results:
1583 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1584 n = self._max_google_results
1585 self._download_n_results(query, n)
1587 except ValueError: # parsing prefix as integer fails
1588 self._download_n_results(query, 1)
1591 def _download_n_results(self, query, n):
1592 """Downloads a specified number of results for a query"""
1598 self.report_download_page(query, pagenum)
1599 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1600 request = compat_urllib_request.Request(result_url)
1602 page = compat_urllib_request.urlopen(request).read()
1603 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1604 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1607 # Extract video identifiers
1608 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1609 video_id = mobj.group(1)
1610 if video_id not in video_ids:
1611 video_ids.append(video_id)
1612 if len(video_ids) == n:
1613 # Specified n videos reached
1614 for id in video_ids:
1615 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1618 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1619 for id in video_ids:
1620 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1623 pagenum = pagenum + 1
1626 class YahooSearchIE(InfoExtractor):
1627 """Information Extractor for Yahoo! Video search queries."""
1630 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1631 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1632 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1633 _MORE_PAGES_INDICATOR = r'\s*Next'
1634 _max_yahoo_results = 1000
1635 IE_NAME = u'video.yahoo:search'
1637 def __init__(self, downloader=None):
1638 InfoExtractor.__init__(self, downloader)
1640 def report_download_page(self, query, pagenum):
1641 """Report attempt to download playlist page with given number."""
1642 query = query.decode(preferredencoding())
1643 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1645 def _real_extract(self, query):
1646 mobj = re.match(self._VALID_URL, query)
1648 self._downloader.report_error(u'invalid search query "%s"' % query)
1651 prefix, query = query.split(':')
1653 query = query.encode('utf-8')
1655 self._download_n_results(query, 1)
1657 elif prefix == 'all':
1658 self._download_n_results(query, self._max_yahoo_results)
1664 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1666 elif n > self._max_yahoo_results:
1667 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1668 n = self._max_yahoo_results
1669 self._download_n_results(query, n)
1671 except ValueError: # parsing prefix as integer fails
1672 self._download_n_results(query, 1)
1675 def _download_n_results(self, query, n):
1676 """Downloads a specified number of results for a query"""
1679 already_seen = set()
1683 self.report_download_page(query, pagenum)
1684 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1685 request = compat_urllib_request.Request(result_url)
1687 page = compat_urllib_request.urlopen(request).read()
1688 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1689 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1692 # Extract video identifiers
1693 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1694 video_id = mobj.group(1)
1695 if video_id not in already_seen:
1696 video_ids.append(video_id)
1697 already_seen.add(video_id)
1698 if len(video_ids) == n:
1699 # Specified n videos reached
1700 for id in video_ids:
1701 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1704 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1705 for id in video_ids:
1706 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1709 pagenum = pagenum + 1
1712 class YoutubePlaylistIE(InfoExtractor):
1713 """Information Extractor for YouTube playlists."""
1715 _VALID_URL = r"""(?:
1720 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1721 \? (?:.*?&)*? (?:p|a|list)=
1724 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1727 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1729 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1731 IE_NAME = u'youtube:playlist'
1733 def __init__(self, downloader=None):
1734 InfoExtractor.__init__(self, downloader)
1737 def suitable(cls, url):
1738 """Receives a URL and returns True if suitable for this IE."""
1739 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1741 def report_download_page(self, playlist_id, pagenum):
1742 """Report attempt to download playlist page with given number."""
1743 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1745 def _real_extract(self, url):
1746 # Extract playlist id
1747 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1749 self._downloader.report_error(u'invalid url: %s' % url)
1752 # Download playlist videos from API
1753 playlist_id = mobj.group(1) or mobj.group(2)
1758 self.report_download_page(playlist_id, page_num)
1760 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1762 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1763 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1764 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1768 response = json.loads(page)
1769 except ValueError as err:
1770 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1773 if not 'feed' in response or not 'entry' in response['feed']:
1774 self._downloader.report_error(u'Got a malformed response from YouTube API')
1776 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1777 for entry in response['feed']['entry']
1778 if 'content' in entry ]
1780 if len(response['feed']['entry']) < self._MAX_RESULTS:
1784 videos = [v[1] for v in sorted(videos)]
1787 playliststart = self._downloader.params.get('playliststart', 1) - 1
1788 playlistend = self._downloader.params.get('playlistend', -1)
1789 if playlistend == -1:
1790 videos = videos[playliststart:]
1792 videos = videos[playliststart:playlistend]
1794 if len(videos) == total:
1795 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1797 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1799 for video in videos:
1800 self._downloader.download([video])
1804 class YoutubeChannelIE(InfoExtractor):
1805 """Information Extractor for YouTube channels."""
1807 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1808 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1809 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1810 IE_NAME = u'youtube:channel'
1812 def report_download_page(self, channel_id, pagenum):
1813 """Report attempt to download channel page with given number."""
1814 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1816 def _real_extract(self, url):
1817 # Extract channel id
1818 mobj = re.match(self._VALID_URL, url)
1820 self._downloader.report_error(u'invalid url: %s' % url)
1823 # Download channel pages
1824 channel_id = mobj.group(1)
1829 self.report_download_page(channel_id, pagenum)
1830 url = self._TEMPLATE_URL % (channel_id, pagenum)
1831 request = compat_urllib_request.Request(url)
1833 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1834 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1835 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1838 # Extract video identifiers
1840 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1841 if mobj.group(1) not in ids_in_page:
1842 ids_in_page.append(mobj.group(1))
1843 video_ids.extend(ids_in_page)
1845 if self._MORE_PAGES_INDICATOR not in page:
1847 pagenum = pagenum + 1
1849 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1851 for id in video_ids:
1852 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1856 class YoutubeUserIE(InfoExtractor):
1857 """Information Extractor for YouTube users."""
1859 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1860 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1861 _GDATA_PAGE_SIZE = 50
1862 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1863 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1864 IE_NAME = u'youtube:user'
1866 def __init__(self, downloader=None):
1867 InfoExtractor.__init__(self, downloader)
1869 def report_download_page(self, username, start_index):
1870 """Report attempt to download user page."""
1871 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1872 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1874 def _real_extract(self, url):
1876 mobj = re.match(self._VALID_URL, url)
1878 self._downloader.report_error(u'invalid url: %s' % url)
1881 username = mobj.group(1)
1883 # Download video ids using YouTube Data API. Result size per
1884 # query is limited (currently to 50 videos) so we need to query
1885 # page by page until there are no video ids - it means we got
1892 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1893 self.report_download_page(username, start_index)
1895 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1898 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1899 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1900 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1903 # Extract video identifiers
1906 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1907 if mobj.group(1) not in ids_in_page:
1908 ids_in_page.append(mobj.group(1))
1910 video_ids.extend(ids_in_page)
1912 # A little optimization - if current page is not
1913 # "full", ie. does not contain PAGE_SIZE video ids then
1914 # we can assume that this page is the last one - there
1915 # are no more ids on further pages - no need to query
1918 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1923 all_ids_count = len(video_ids)
1924 playliststart = self._downloader.params.get('playliststart', 1) - 1
1925 playlistend = self._downloader.params.get('playlistend', -1)
1927 if playlistend == -1:
1928 video_ids = video_ids[playliststart:]
1930 video_ids = video_ids[playliststart:playlistend]
1932 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1933 (username, all_ids_count, len(video_ids)))
1935 for video_id in video_ids:
1936 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Collects all video ids from a user's page via the mobile
    full-episode-list endpoint and queues each one for download as a
    plain http://blip.tv/<id> URL.
    """

    # Matches http(s)://(sub.)blip.tv/<user> and the bliptvuser:<user> shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # NOTE(review): several control-flow lines (None-guards, try:
        # openers, the pagination while-loop header, returns) are missing
        # from this copy of the source; comments mark the apparent gaps.
        mobj = re.match(self._VALID_URL, url)
        # presumably under an elided `if mobj is None:` guard with early return
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        # presumably inside an elided try: paired with the except below
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): pagination loop header (while True / pagenum init) elided here
        self.report_download_page(username, pagenum)
        url = page_base + "&page=" + str(pagenum)
        request = compat_urllib_request.Request( url )
        # presumably inside an elided try: paired with the except below
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # NOTE(review): loop-break body under this condition is elided
        if len(ids_in_page) < self._PAGE_SIZE:

        all_ids_count = len(video_ids)
        # playliststart is 1-based on the command line; convert to 0-based slice index
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # presumably under an elided else:
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the 'Free download' button and scrapes the real
    fileshare URL (or the restriction message explaining why it failed).
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        # NOTE(review): a few control-flow lines (try: openers, None-guards,
        # returns, the result-list opener) are missing from this copy.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # presumably inside an elided try: paired with the except below
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            # presumably under an elided else:
            self._downloader.report_error(u'unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'unable to extract title')
        # NOTE(review): str.decode() fails on Python 3 — verify against callers
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): result-list/dict opener elided; fields below form the entry
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials from --username/--password or .netrc)
    during initialization, then scrapes the swf parameter blob from the
    video page to find the hd_src/sd_src media URLs.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        # NOTE(review): several lines (early returns, try: openers, the
        # credential assignments and login_form construction) are missing
        # from this copy of the source.
        if self._downloader is None:
            # presumably an elided early return

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # presumably inside an elided try: paired with the except below
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # presumably elided: useremail, password = info[0], info[2]; else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:
            # presumably elided: early return; login_form dict built below

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # presumably inside an elided try: paired with the except below
        login_results = compat_urllib_request.urlopen(request).read()
        # If the login form is still present in the response, login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf variables JSON array sits between these two literal markers.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # presumably under an elided `if not m:` guard
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer HD source; fall back to SD (fallback guards elided in this copy)
        video_url = params['hd_src']
        video_url = params['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # presumably under an elided `if not m:` guard
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # NOTE(review): result-dict opener elided; fields below form the entry
        'title': video_title,
        'duration': video_duration,
        'thumbnail': params['thumbnail_src'],
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Asks the page for its JSON representation (skin=json) using an iTunes
    User-Agent; if the server answers with the video bytes directly, a
    direct-download entry is produced instead.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        # NOTE(review): guards, try: openers and the result-dict openers
        # are missing from this copy of the source.
        mobj = re.match(self._VALID_URL, url)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect to the canonical page; follow the redirect,
        # pull the file id out of the URL fragment and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # NOTE(review): cchar selection elided ('&' if url has a query, else '?')
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # presumably inside an elided try: paired with the except below
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            # NOTE(review): str.decode() fails on Python 3 — verify
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # NOTE(review): direct-download info-dict construction elided
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # presumably inside an elided try: paired with the except below
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']
            # presumably elided: else: data = json_data

            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # presumably under an elided `if umobj is None:` guard
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # NOTE(review): result-dict opener elided; fields below form the entry
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
        except (ValueError,KeyError) as err:
            self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Derives the FLV media URL from the thumbnail server path embedded in
    the watch page (image_src link) plus the numeric video id.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.report_error(...)` — the downloader
            # attribute is `self._downloader` (cf. every sibling extractor);
            # the typo raised AttributeError on invalid URLs.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image_src link exposes the media server path; the FLV lives
        # alongside the thumbnails as <path>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing triple quote of this verbose regex is not
    # visible in this copy of the source.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Bitrates, best last; used to pick/list formats below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): dict contents elided in this copy of the source.
    _video_extensions = {
    _video_dimensions = {

    # NOTE(review): @classmethod decorator not visible in this copy (cf. the
    # base class' suitable()); overridden because _VALID_URL needs re.VERBOSE.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        # presumably an elided `for x in formats:` loop header above this line
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        # NOTE(review): many control-flow lines (guards, try: openers,
        # else-branches, returns, the info-dict opener) are missing from
        # this copy of the source; comments mark the apparent gaps.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'invalid URL: %s' % url)

        # Expand :tds / :colbert shorthands into the full-episodes page URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # presumably under an elided else:
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # presumably under an elided else:
            epTitle = mobj.group('cntitle')
        # presumably under an elided else:
        dlNewest = not mobj.group('episode')
        # presumably under elided if dlNewest: / else: branches
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        # presumably inside an elided try: paired with the except below
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Re-match against the (possibly redirected) final URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
            # presumably under an elided else:
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # presumably inside an elided try: paired with the except below
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))

        # One <item> per video part in the MRSS index.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            # presumably inside an elided try: paired with the except below
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs; turls init elided in this copy.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            # presumably under an elided `if len(turls) == 0:` guard
            self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # (elided: for f,v in turls: if f == req_format:)
            format, rtmp_video_url = f, v

            # Rewrite the RTMP URL to the equivalent progressive-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # presumably under an elided `if not m:` guard
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # NOTE(review): info-dict opener elided; fields below form the entry
            'upload_date': officialDate,
            'description': officialTitle,

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        # NOTE(review): guards, try: openers, returns and the result-dict
        # opener are missing from this copy of the source.
        mobj = re.match(self._VALID_URL, url)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # presumably inside an elided try: paired with the except below
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode using the charset from the Content-Type header, default UTF-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))

        # Pull description/thumbnail/player URL from the OpenGraph meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries a percent-encoded config= query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # presumably inside an elided try: paired with the except below
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # presumably inside an elided try: paired with the except below
        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        # NOTE(review): result-dict opener elided; fields below form the entry
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Reads the moogaloop metadata XML, then the Adobe f4m manifest, and
    rebuilds the direct segment URL from the manifest's media node.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): guards, try: openers, returns and the info-dict
        # opener are missing from this copy of the source.
        mobj = re.match(self._VALID_URL, url)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): info-dict opener elided; fields below seed the entry
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # presumably inside an elided try: paired with the except below
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # presumably inside an elided try: with an IndexError handler (below)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # presumably under an elided `except IndexError:` handler
        self._downloader.report_error(u'Invalid metadata XML file')

        # hdcore parameter is required for the manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # presumably inside an elided try: paired with the except below
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # presumably inside an elided try: paired with the except below
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        # Rebuild the direct segment URL from the manifest URL's host.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): None-guards, returns and the result-dict opener are
        # missing from this copy of the source.
        mobj = re.match(self._VALID_URL, url)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url page variable)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from <title>, dropping the " - XVID…" site suffix
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'unable to extract video thumbnail')
        video_thumbnail = mobj.group(0)

        # NOTE(review): result-dict opener elided; fields below form the entry
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): None-guard, try: openers, returns and the result-dict
        # opener are missing from this copy of the source.
        mobj = re.match(self._VALID_URL, url)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's API resource.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        # presumably inside an elided try: paired with the except below
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for the track's stream variants; we use the mp3 one.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        # presumably inside an elided try: paired with the except below
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # NOTE(review): result-dict opener elided; fields below form the entry
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    Resolves a /<uploader>/sets/<slug> page to its API resource and emits
    one mp3 entry per track in the set.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): duplicates SoundcloudIE's IE_NAME; kept for compatibility,
    # but u'soundcloud:set' would disambiguate --list-extractors output.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # CONSISTENCY FIX: this class still used the deprecated
            # self._downloader.trouble(u'ERROR: ...') API; every sibling
            # extractor (including SoundcloudIE above) uses report_error,
            # which supplies the error prefix itself.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the page URL to the set's API resource.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The resolver reports problems (private set, bad slug, ...) as
            # a list of error objects rather than an HTTP error.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Ask the CDN for each track's stream variants; use the mp3 one.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): None-guards, returns and the result-dict opener are
        # missing from this copy of the source.
        mobj = re.match(self._VALID_URL, url)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real media id is base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        # presumably under an elided `if mobj is None:` guard
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id/extension from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # NOTE(review): result-dict opener elided; fields below form the entry
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
2948 class MixcloudIE(InfoExtractor):
2949 """Information extractor for www.mixcloud.com"""
2951 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2952 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2953 IE_NAME = u'mixcloud'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (cf. InfoExtractor)."""
        InfoExtractor.__init__(self, downloader)
    def report_download_json(self, file_id):
        """Report JSON download."""
        # NOTE(review): file_id is accepted but not shown in the message.
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2962 def report_extraction(self, file_id):
2963 """Report information extraction."""
2964 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2966 def get_urls(self, jsonData, fmt, bitrate='best'):
2967 """Get urls from 'audio_formats' section in json"""
2970 bitrate_list = jsonData[fmt]
2971 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2972 bitrate = max(bitrate_list) # select highest
2974 url_list = jsonData[fmt][bitrate]
2975 except TypeError: # we have no bitrate info.
2976 url_list = jsonData[fmt]
2979 def check_urls(self, url_list):
2980 """Returns 1st active url from list"""
2981 for url in url_list:
2983 compat_urllib_request.urlopen(url)
2985 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2990 def _print_formats(self, formats):
2991 print('Available formats:')
2992 for fmt in formats.keys():
2993 for b in formats[fmt]:
2995 ext = formats[fmt][b][0]
2996 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2997 except TypeError: # we have no bitrate info
2998 ext = formats[fmt][0]
2999 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3002 def _real_extract(self, url):
3003 mobj = re.match(self._VALID_URL, url)
3005 self._downloader.report_error(u'invalid URL: %s' % url)
3007 # extract uploader & filename from url
3008 uploader = mobj.group(1).decode('utf-8')
3009 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3011 # construct API request
3012 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3013 # retrieve .json file with links to files
3014 request = compat_urllib_request.Request(file_url)
3016 self.report_download_json(file_url)
3017 jsonData = compat_urllib_request.urlopen(request).read()
3018 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3019 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3023 json_data = json.loads(jsonData)
3024 player_url = json_data['player_swf_url']
3025 formats = dict(json_data['audio_formats'])
3027 req_format = self._downloader.params.get('format', None)
3030 if self._downloader.params.get('listformats', None):
3031 self._print_formats(formats)
3034 if req_format is None or req_format == 'best':
3035 for format_param in formats.keys():
3036 url_list = self.get_urls(formats, format_param)
3038 file_url = self.check_urls(url_list)
3039 if file_url is not None:
3042 if req_format not in formats:
3043 self._downloader.report_error(u'format is not available')
3046 url_list = self.get_urls(formats, req_format)
3047 file_url = self.check_urls(url_list)
3048 format_param = req_format
3051 'id': file_id.decode('utf-8'),
3052 'url': file_url.decode('utf-8'),
3053 'uploader': uploader.decode('utf-8'),
3054 'upload_date': None,
3055 'title': json_data['name'],
3056 'ext': file_url.split('.')[-1].decode('utf-8'),
3057 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3058 'thumbnail': json_data['thumbnail_url'],
3059 'description': json_data['description'],
3060 'player_url': player_url.decode('utf-8'),
# NOTE(review): sampled dump — leading ordinals, elided lines; code tokens
# are byte-identical to the original.
# Three URL shapes are handled: a specific video (course+video groups), a
# course page (course group only), and the site root (neither) — the last two
# return lists of 'reference' entries that are resolved recursively via
# self.extract().
3063 class StanfordOpenClassroomIE(InfoExtractor):
3064 """Information extractor for Stanford's Open ClassRoom"""
3066 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3067 IE_NAME = u'stanfordoc'
3069 def report_download_webpage(self, objid):
3070 """Report information extraction."""
3071 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3073 def report_extraction(self, video_id):
3074 """Report information extraction."""
3075 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3077 def _real_extract(self, url):
3078 mobj = re.match(self._VALID_URL, url)
3080 raise ExtractorError(u'Invalid URL: %s' % url)
3082 if mobj.group('course') and mobj.group('video'): # A specific video
3083 course = mobj.group('course')
3084 video = mobj.group('video')
3086 'id': course + '_' + video,
3088 'upload_date': None,
3091 self.report_extraction(info['id'])
# Video metadata is served as per-video XML next to the media files.
3092 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3093 xmlUrl = baseUrl + video + '.xml'
3095 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3096 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3097 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3099 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3101 info['title'] = mdoc.findall('./title')[0].text
3102 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3104 self._downloader.report_error(u'Invalid metadata XML file')
3106 info['ext'] = info['url'].rpartition('.')[2]
3108 elif mobj.group('course'): # A course page
3109 course = mobj.group('course')
3114 'upload_date': None,
3117 coursepage = self._download_webpage(url, info['id'],
3118 note='Downloading course info page',
3119 errnote='Unable to download course info page')
3121 m = re.search('<h1>([^<]+)</h1>', coursepage)
3123 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page has no <h1> title.
3125 info['title'] = info['id']
3127 m = re.search('<description>([^<]+)</description>', coursepage)
3129 info['description'] = unescapeHTML(m.group(1))
# Collect every VideoPage link as a reference entry, de-duplicated in order.
3131 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3134 'type': 'reference',
3135 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse: each reference is resolved through the normal extract() entry point.
3139 for entry in info['list']:
3140 assert entry['type'] == 'reference'
3141 results += self.extract(entry['url'])
# Root page: enumerate all course pages the same way.
3145 'id': 'Stanford OpenClassroom',
3148 'upload_date': None,
3151 self.report_download_webpage(info['id'])
3152 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3154 rootpage = compat_urllib_request.urlopen(rootURL).read()
3155 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3156 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3159 info['title'] = info['id']
3161 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3164 'type': 'reference',
3165 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3170 for entry in info['list']:
3171 assert entry['type'] == 'reference'
3172 results += self.extract(entry['url'])
# NOTE(review): sampled dump — leading ordinals, elided lines; code tokens
# are byte-identical to the original.
3175 class MTVIE(InfoExtractor):
3176 """Information extractor for MTV.com"""
3178 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3181 def report_extraction(self, video_id):
3182 """Report information extraction."""
3183 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3185 def _real_extract(self, url):
3186 mobj = re.match(self._VALID_URL, url)
# (guard line elided) invalid-URL error path
3188 self._downloader.report_error(u'invalid URL: %s' % url)
# Normalize scheme-less URLs so the download below works.
3190 if not mobj.group('proto'):
3191 url = 'http://' + url
3192 video_id = mobj.group('videoid')
3194 webpage = self._download_webpage(url, video_id)
# Song name / performer come from MTV-specific meta tags.
# NOTE(review): .decode('iso-8859-1') on a match group is Python 2 only;
# on Python 3 groups are already str and this raises AttributeError.
3196 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3198 self._downloader.report_error(u'unable to extract song name')
3200 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3201 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3203 self._downloader.report_error(u'unable to extract performer')
3205 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3206 video_title = performer + ' - ' + song_name
3208 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message appears truncated — presumably meant
# "unable to extract mtvn_uri".
3210 self._downloader.report_error(u'unable to mtvn_uri')
3212 mtvn_uri = mobj.group(1)
3214 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3216 self._downloader.report_error(u'unable to extract content id')
3218 content_id = mobj.group(1)
# mediaGen returns an XML document listing the available renditions.
3220 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3221 self.report_extraction(video_id)
3222 request = compat_urllib_request.Request(videogen_url)
3224 metadataXml = compat_urllib_request.urlopen(request).read()
3225 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3226 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3229 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3230 renditions = mdoc.findall('.//rendition')
3232 # For now, always pick the highest quality.
3233 rendition = renditions[-1]
# Format string is built from the rendition's MIME subtype + geometry + bitrate.
3236 _,_,ext = rendition.attrib['type'].partition('/')
3237 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3238 video_url = rendition.find('./src').text
# NOTE(review): trouble() is used here while the rest of the class uses
# report_error() — inconsistent error reporting.
3240 self._downloader.trouble('Invalid rendition field.')
3246 'uploader': performer,
3247 'upload_date': None,
3248 'title': video_title,
# NOTE(review): sampled dump — leading ordinals, elided lines (including some
# def headers); code tokens are byte-identical to the original.
3256 class YoukuIE(InfoExtractor):
3257 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3259 def report_download_webpage(self, file_id):
3260 """Report webpage download."""
3261 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3263 def report_extraction(self, file_id):
3264 """Report information extraction."""
3265 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# (def header elided) session id: millisecond timestamp + two random numbers.
3268 nowTime = int(time.time() * 1000)
3269 random1 = random.randint(1000,1998)
3270 random2 = random.randint(1000,9999)
3272 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic keyed shuffle of the charset, driven by a linear-congruential
# update of `seed`; the result is the lookup table for _get_file_id.
3274 def _get_file_ID_mix_string(self, seed):
3276 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3278 for i in range(len(source)):
3279 seed = (seed * 211 + 30031 ) % 65536
3280 index = math.floor(seed / 65536 * len(source) )
3281 mixed.append(source[int(index)])
3282 source.remove(source[int(index)])
3283 #return ''.join(mixed)
# Map the '*'-separated index list in `fileId` through the shuffled charset.
3286 def _get_file_id(self, fileId, seed):
3287 mixed = self._get_file_ID_mix_string(seed)
3288 ids = fileId.split('*')
3292 realId.append(mixed[int(ch)])
3293 return ''.join(realId)
3295 def _real_extract(self, url):
3296 mobj = re.match(self._VALID_URL, url)
# (guard line elided) invalid-URL error path
3298 self._downloader.report_error(u'invalid URL: %s' % url)
3300 video_id = mobj.group('ID')
# getPlayList returns JSON with title, seed, stream file ids and segment keys.
3302 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3304 request = compat_urllib_request.Request(info_url, None, std_headers)
3306 self.report_download_webpage(video_id)
3307 jsondata = compat_urllib_request.urlopen(request).read()
3308 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3309 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3312 self.report_extraction(video_id)
3314 jsonstr = jsondata.decode('utf-8')
3315 config = json.loads(jsonstr)
3317 video_title = config['data'][0]['title']
3318 seed = config['data'][0]['seed']
3320 format = self._downloader.params.get('format', None)
3321 supported_format = list(config['data'][0]['streamfileids'].keys())
3323 if format is None or format == 'best':
3324 if 'hd2' in supported_format:
3329 elif format == 'worst':
3337 fileid = config['data'][0]['streamfileids'][format]
3338 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3339 except (UnicodeDecodeError, ValueError, KeyError):
3340 self._downloader.report_error(u'unable to extract info section')
3344 sid = self._gen_sid()
3345 fileid = self._get_file_id(fileid, seed)
3347 #column 8,9 of fileid represent the segment number
3348 #fileid[7:9] should be changed
# Build one download URL per segment; the segment index is spliced into the
# file id as a two-digit hex value.
3349 for index, key in enumerate(keys):
3351 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3352 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3355 'id': '%s_part%02d' % (video_id, index),
3356 'url': download_url,
3358 'upload_date': None,
3359 'title': video_title,
3362 files_info.append(info)
# NOTE(review): sampled dump — leading ordinals, elided lines; code tokens
# are byte-identical to the original.
3367 class XNXXIE(InfoExtractor):
3368 """Information extractor for xnxx.com"""
3370 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL and thumbnail are URL-quoted flashvars.
3372 VIDEO_URL_RE = r'flv_url=(.*?)&'
3373 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3374 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3376 def report_webpage(self, video_id):
3377 """Report information extraction"""
3378 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3380 def report_extraction(self, video_id):
3381 """Report information extraction"""
3382 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3384 def _real_extract(self, url):
3385 mobj = re.match(self._VALID_URL, url)
# (guard line elided) invalid-URL error path
3387 self._downloader.report_error(u'invalid URL: %s' % url)
3389 video_id = mobj.group(1)
3391 self.report_webpage(video_id)
3393 # Get webpage content
3395 webpage_bytes = compat_urllib_request.urlopen(url).read()
3396 webpage = webpage_bytes.decode('utf-8')
3397 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3398 self._downloader.report_error(u'unable to download video webpage: %s' % err)
3401 result = re.search(self.VIDEO_URL_RE, webpage)
3403 self._downloader.report_error(u'unable to extract video url')
# The flv URL is percent-encoded inside the flashvars; unquote it.
3405 video_url = compat_urllib_parse.unquote(result.group(1))
3407 result = re.search(self.VIDEO_TITLE_RE, webpage)
3409 self._downloader.report_error(u'unable to extract video title')
3411 video_title = result.group(1)
3413 result = re.search(self.VIDEO_THUMB_RE, webpage)
3415 self._downloader.report_error(u'unable to extract video thumbnail')
3417 video_thumbnail = result.group(1)
3423 'upload_date': None,
3424 'title': video_title,
3426 'thumbnail': video_thumbnail,
3427 'description': None,
# NOTE(review): sampled dump — leading ordinals, elided lines; code tokens
# are byte-identical to the original.
3431 class GooglePlusIE(InfoExtractor):
3432 """Information extractor for plus.google.com."""
3434 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3435 IE_NAME = u'plus.google'
# NOTE(review): redundant — only forwards to the base class constructor.
3437 def __init__(self, downloader=None):
3438 InfoExtractor.__init__(self, downloader)
3440 def report_extract_entry(self, url):
3441 """Report downloading extry"""
3442 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3444 def report_date(self, upload_date):
3445 """Report downloading extry"""
3446 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3448 def report_uploader(self, uploader):
3449 """Report downloading extry"""
3450 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3452 def report_title(self, video_title):
3453 """Report downloading extry"""
3454 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3456 def report_extract_vid_page(self, video_page):
3457 """Report information extraction."""
3458 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3460 def _real_extract(self, url):
3461 # Extract id from URL
3462 mobj = re.match(self._VALID_URL, url)
3464 self._downloader.report_error(u'Invalid URL: %s' % url)
3467 post_url = mobj.group(0)
3468 video_id = mobj.group(1)
3470 video_extension = 'flv'
3472 # Step 1, Retrieve post webpage to extract further information
3473 self.report_extract_entry(post_url)
3474 request = compat_urllib_request.Request(post_url)
3476 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3477 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3478 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3481 # Extract update date
3483 pattern = 'title="Timestamp">(.*?)</a>'
3484 mobj = re.search(pattern, webpage)
3486 upload_date = mobj.group(1)
3487 # Convert timestring to a format suitable for filename
# Page shows ISO-style dates; strptime/strftime normalizes to YYYYMMDD.
3488 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3489 upload_date = upload_date.strftime('%Y%m%d')
3490 self.report_date(upload_date)
3494 pattern = r'rel\="author".*?>(.*?)</a>'
3495 mobj = re.search(pattern, webpage)
3497 uploader = mobj.group(1)
3498 self.report_uploader(uploader)
3501 # Get the first line for title
3503 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3504 mobj = re.search(pattern, webpage)
3506 video_title = mobj.group(1)
3507 self.report_title(video_title)
3509 # Step 2, Stimulate clicking the image box to launch video
3510 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3511 mobj = re.search(pattern, webpage)
3513 self._downloader.report_error(u'unable to extract video page URL')
3515 video_page = mobj.group(1)
3516 request = compat_urllib_request.Request(video_page)
3518 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3519 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3520 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3522 self.report_extract_vid_page(video_page)
3525 # Extract video links on video page
3526 """Extract video links of all sizes"""
3527 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3528 mobj = re.findall(pattern, webpage)
3530 self._downloader.report_error(u'unable to extract video links')
3532 # Sort in resolution
# Tuples are (resolution, url); default tuple sort orders by resolution.
3533 links = sorted(mobj)
3535 # Choose the lowest of the sort, i.e. highest resolution
3536 video_url = links[-1]
3537 # Only get the url. The resolution part in the tuple has no use anymore
3538 video_url = video_url[-1]
3539 # Treat escaped \u0026 style hex
# Python 2 path uses str.decode; the AttributeError fallback handles Python 3.
3541 video_url = video_url.decode("unicode_escape")
3542 except AttributeError: # Python 3
3543 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3549 'uploader': uploader,
3550 'upload_date': upload_date,
3551 'title': video_title,
3552 'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages.

    The media URL is built directly from the path component of the page
    URL (Turner CDN); title, date and description are scraped from the
    page's metadata.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The actual media file lives at a predictable CDN path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First group of `rexp` in the page (HTML-unescaped), or
            # `default` when the pattern is absent.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date'; FileDownloader
            # only recognizes the documented field name 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
# NOTE(review): sampled dump — leading ordinals, elided lines; code tokens
# are byte-identical to the original.
3591 class JustinTVIE(InfoExtractor):
3592 """Information extractor for justin.tv and twitch.tv"""
3593 # TODO: One broadcast may be split into multiple videos. The key
3594 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3595 # starts at 1 and increases. Can we treat all parts as one video?
3597 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3598 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3599 _JUSTIN_PAGE_LIMIT = 100
3600 IE_NAME = u'justin.tv'
3602 def report_extraction(self, file_id):
3603 """Report information extraction."""
3604 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3606 def report_download_page(self, channel, offset):
3607 """Report attempt to download a single page of videos."""
3608 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3609 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3611 # Return count of items, list of *valid* items
# Fetch one API page of clips as JSON and map each clip to an info dict.
3612 def _parse_page(self, url):
3614 urlh = compat_urllib_request.urlopen(url)
3615 webpage_bytes = urlh.read()
3616 webpage = webpage_bytes.decode('utf-8', 'ignore')
3617 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3618 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3621 response = json.loads(webpage)
# A non-list response is the API's error envelope.
3622 if type(response) != list:
3623 error_text = response.get('error', 'unknown error')
3624 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3627 for clip in response:
3628 video_url = clip['video_file_url']
3630 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-dated; stripping dashes from the first 10 chars
# yields YYYYMMDD.
3631 video_date = re.sub('-', '', clip['start_time'][:10])
3632 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3633 video_id = clip['id']
3634 video_title = clip.get('title', video_id)
3638 'title': video_title,
3639 'uploader': clip.get('channel_name', video_uploader_id),
3640 'uploader_id': video_uploader_id,
3641 'upload_date': video_date,
3642 'ext': video_extension,
3644 return (len(response), info)
3646 def _real_extract(self, url):
3647 mobj = re.match(self._VALID_URL, url)
# (guard line elided) invalid-URL error path
3649 self._downloader.report_error(u'invalid URL: %s' % url)
3652 api = 'http://api.justin.tv'
# lastindex distinguishes a channel URL (group 1 only) from a single
# broadcast URL (/b/<id>, group 2).
3653 video_id = mobj.group(mobj.lastindex)
3655 if mobj.lastindex == 1:
3657 api += '/channel/archives/%s.json'
3659 api += '/broadcast/by_archive/%s.json'
3660 api = api % (video_id,)
3662 self.report_extraction(video_id)
3666 limit = self._JUSTIN_PAGE_LIMIT
# Page through the API until a short page signals the end (channel mode).
3669 self.report_download_page(video_id, offset)
3670 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3671 page_count, page_info = self._parse_page(page_url)
3672 info.extend(page_info)
3673 if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The media URL is the second <source> inside the <video> tag.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # CONSISTENCY FIX: was self._downloader.trouble(...) — the rest
            # of the file reports failures via report_error().
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
# NOTE(review): sampled dump — leading ordinals, elided lines (including part
# of the verbose _VALID_URL); code tokens are byte-identical to the original.
3715 class SteamIE(InfoExtractor):
3716 _VALID_URL = r"""http://store.steampowered.com/
3717 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3719 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is written with re.VERBOSE,
# which the base-class matcher does not apply.
3723 def suitable(cls, url):
3724 """Receives a URL and returns True if suitable for this IE."""
3725 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3727 def _real_extract(self, url):
3728 m = re.match(self._VALID_URL, url, re.VERBOSE)
# Per-movie flashvars blob on the game's video page.
3729 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3730 gameID = m.group('gameID')
3731 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3732 webpage = self._download_webpage(videourl, gameID)
3733 mweb = re.finditer(urlRE, webpage)
3734 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3735 titles = re.finditer(namesRE, webpage)
3736 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3737 thumbs = re.finditer(thumbsRE, webpage)
# Zip the three scans positionally; assumes the page lists urls, titles and
# thumbnails in the same order (TODO confirm).
3739 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3740 video_id = vid.group('videoID')
3741 title = vtitle.group('videoName')
3742 video_url = vid.group('videoURL')
3743 video_thumb = thumb.group('thumbnail')
3745 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3750 'title': unescapeHTML(title),
3751 'thumbnail': video_thumb
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos.

    The FLV is served from a predictable CDN path keyed on the numeric
    video id; title and uploader are scraped from the page markup.
    """
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # ROBUSTNESS FIX: both searches previously called .group() with no
        # None check, so a page-layout change raised a bare AttributeError
        # instead of a meaningful extraction error.
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        title = m.group('title')

        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        # Uploader is optional metadata; fall back to None rather than fail.
        uploader = m.group('uploader') if m is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
# NOTE(review): sampled dump — leading ordinals, elided lines; code tokens
# are byte-identical to the original.
3778 class WorldStarHipHopIE(InfoExtractor):
3779 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3780 IE_NAME = u'WorldStarHipHop'
3782 def _real_extract(self, url):
# Direct media URL (mp4 or flv) embedded in the page source.
3783 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3785 webpage_src = compat_urllib_request.urlopen(url).read()
3786 webpage_src = webpage_src.decode('utf-8')
3788 mobj = re.search(_src_url, webpage_src)
3790 m = re.match(self._VALID_URL, url)
3791 video_id = m.group('id')
3793 if mobj is not None:
3794 video_url = mobj.group()
# (ext assignment lines elided) branches on mp4 vs flv in the URL
3795 if 'mp4' in video_url:
# NOTE(review): trouble() is deprecated relative to report_error() used
# elsewhere in this file.
3800 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3803 _title = r"""<title>(.*)</title>"""
3805 mobj = re.search(_title, webpage_src)
3807 if mobj is not None:
3808 title = mobj.group(1)
# NOTE(review): 'World Start' in this fallback title looks like a typo for
# 'World Star', but it is a runtime string and is left untouched here.
3810 title = 'World Start Hip Hop - %s' % time.ctime()
3812 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3813 mobj = re.search(_thumbnail, webpage_src)
3815 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3816 if mobj is not None:
3817 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span instead.
3819 _title = r"""candytitles.*>(.*)</span>"""
3820 mobj = re.search(_title, webpage_src)
3821 if mobj is not None:
3822 title = mobj.group(1)
3829 'thumbnail' : thumbnail,
# NOTE(review): sampled dump — leading ordinals, elided lines; code tokens
# are byte-identical to the original.
3834 class RBMARadioIE(InfoExtractor):
3835 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3837 def _real_extract(self, url):
3838 m = re.match(self._VALID_URL, url)
3839 video_id = m.group('videoID')
3841 webpage = self._download_webpage(url, video_id)
# Show metadata is a JSON blob assigned to gon.show in an inline script.
3842 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3844 raise ExtractorError(u'Cannot find metadata')
3845 json_data = m.group(1)
3848 data = json.loads(json_data)
3849 except ValueError as e:
3850 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Append a fixed 256kbps constant-bitrate parameter to the Akamai URL;
# extension is taken from the URL path.
3852 video_url = data['akamai_url'] + '&cbr=256'
3853 url_parts = compat_urllib_parse_urlparse(video_url)
3854 video_ext = url_parts.path.rpartition('.')[2]
3859 'title': data['title'],
3860 'description': data.get('teaser_text'),
3861 'location': data.get('country_of_origin'),
3862 'uploader': data.get('host', {}).get('name'),
3863 'uploader_id': data.get('host', {}).get('slug'),
3864 'thumbnail': data.get('image', {}).get('large_url_2x'),
3865 'duration': data.get('duration'),
# NOTE(review): sampled dump — leading ordinals, elided lines; code tokens
# are byte-identical to the original.
3870 class YouPornIE(InfoExtractor):
3871 """Information extractor for youporn.com."""
3872 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3874 def _print_formats(self, formats):
3875 """Print all available formats"""
3876 print(u'Available formats:')
3877 print(u'ext\t\tformat')
3878 print(u'---------------------------------')
3879 for format in formats:
3880 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single format dict matching req_format (surrounding lines elided).
3882 def _specific(self, req_format, formats):
3884 if(x["format"]==req_format):
3888 def _real_extract(self, url):
3889 mobj = re.match(self._VALID_URL, url)
# (guard line elided) invalid-URL error path
3891 self._downloader.report_error(u'invalid URL: %s' % url)
3894 video_id = mobj.group('videoid')
# Age gate is bypassed by presenting the age_verified cookie up front.
3896 req = compat_urllib_request.Request(url)
3897 req.add_header('Cookie', 'age_verified=1')
3898 webpage = self._download_webpage(req, video_id)
3900 # Get the video title
3901 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3903 raise ExtractorError(u'Unable to extract video title')
3904 video_title = result.group('title').strip()
3906 # Get the video date
3907 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3909 self._downloader.report_warning(u'unable to extract video date')
3912 upload_date = result.group('date').strip()
3914 # Get the video uploader
3915 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3917 self._downloader.report_warning(u'unable to extract uploader')
3918 video_uploader = None
3920 video_uploader = result.group('uploader').strip()
3921 video_uploader = clean_html( video_uploader )
3923 # Get all of the formats available
3924 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3925 result = re.search(DOWNLOAD_LIST_RE, webpage)
3927 raise ExtractorError(u'Unable to extract download list')
3928 download_list_html = result.group('download_list').strip()
3930 # Get all of the links from the page
3931 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3932 links = re.findall(LINK_RE, download_list_html)
3933 if(len(links) == 0):
3934 raise ExtractorError(u'ERROR: no known formats available for video')
3936 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3941 # A link looks like this:
3942 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3943 # A path looks like this:
3944 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3945 video_url = unescapeHTML( link )
3946 path = compat_urllib_parse_urlparse( video_url ).path
3947 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_..."; the first two pieces are
# kept as the format (size/bitrate unpacking line elided in this dump).
3948 format = path.split('/')[4].split('_')[:2]
3951 format = "-".join( format )
3952 title = u'%s-%s-%s' % (video_title, size, bitrate)
3957 'uploader': video_uploader,
3958 'upload_date': upload_date,
3963 'description': None,
3967 if self._downloader.params.get('listformats', None):
3968 self._print_formats(formats)
# Format selection: formats are ordered best-first, so best is the head and
# worst is the tail of the list.
3971 req_format = self._downloader.params.get('format', None)
3972 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3974 if req_format is None or req_format == 'best':
3976 elif req_format == 'worst':
3977 return [formats[-1]]
3978 elif req_format in ('-1', 'all'):
3981 format = self._specific( req_format, formats )
3983 self._downloader.report_error(u'requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # id and title both come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (flv address embedded in the player config)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: this failure previously reported
            # 'unable to extract video title', which misdescribes what
            # actually failed (the upload-date scrape).
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
# NOTE(review): sampled dump — leading ordinals, elided lines; code tokens
# are byte-identical to the original.
4031 class YouJizzIE(InfoExtractor):
4032 """Information extractor for youjizz.com."""
4033 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4035 def _real_extract(self, url):
4036 mobj = re.match(self._VALID_URL, url)
# (guard line elided) invalid-URL error path
4038 self._downloader.report_error(u'invalid URL: %s' % url)
4041 video_id = mobj.group('videoid')
4043 # Get webpage content
4044 webpage = self._download_webpage(url, video_id)
4046 # Get the video title
4047 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4049 raise ExtractorError(u'ERROR: unable to extract video title')
4050 video_title = result.group('title').strip()
4052 # Get the embed page
# The watch page links to a numeric-id embed page; the real stream URL only
# appears on the embed page, so a second fetch follows.
4053 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4055 raise ExtractorError(u'ERROR: unable to extract embed page')
4057 embed_page_url = result.group(0).strip()
4058 video_id = result.group('videoid')
4060 webpage = self._download_webpage(embed_page_url, video_id)
# The media URL is passed to the flash player via an addVariable call.
4063 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4065 raise ExtractorError(u'ERROR: unable to extract video url')
4066 video_url = result.group('source')
4068 info = {'id': video_id,
4070 'title': video_title,
4073 'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Walks the play-session API track by track and returns one info
    dict per song in the mix.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JS object literal.
        json_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not json_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(json_match.group(1))

        # A fresh (random) session id is enough to start a play session.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        res = []
        next_url = first_url
        # Each API response describes one track and whether it is the last.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        # The CDN URLs are fully determined by the video id.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        # The page is only needed for the title and the uploader name.
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so the default
        # InfoExtractor.suitable (plain re.match) cannot be reused.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # A talk URL yields a single video; a playlist URL fans out.
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # The two finditer streams are assumed to run in lockstep: the
        # n-th <li> talk entry corresponds to the n-th talk-title link.
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)
        info = []
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url, video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        # Prefer the id embedded in the talkDetails blob over the caller's.
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    Derives the video id from the URL path and reads all metadata
    (download URL, title, format, description, thumbnail) from the
    site's XML metadata endpoint.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        # Renamed from "format": the old local shadowed the builtin format().
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format_id = 'mp4'
        else:
            format_id = format_id_el.text
        # description and thumbnail are optional in the metadata XML
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format_id,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Stream data lives in a per-video XML document on the flash host.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry in the document is the best quality.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com.

    Scrapes the player's "file" variable for the video URL and the
    OpenGraph meta tags for title/description; description and
    uploader are optional and default to None.
    """

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Consistency fix: use report_error instead of the deprecated
            # trouble() (which required a hand-written "ERROR: " prefix).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek.

    Picks the video id from a documentId= query parameter when present
    (falling back to the last URL path element), then chooses the
    highest-quality direct-download stream, or an RTMP stream when
    that is all the page offers.
    """
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # Fixed: previously a bare `assert '"fsk"' in html` (stripped
            # under -O, and an AssertionError crash otherwise) guarded this
            # path; now both cases report a proper error.
            if '"fsk"' in html:
                self._downloader.report_error(u'this video is only available after 8:00 pm')
            else:
                self._downloader.report_error(u'unable to extract media streams')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            # rtmpdump needs the play path relative to the stream URL;
            # fail loudly (but catchably) if the page layout changed.
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'unexpected download URL: %s' % stream["video_url"])
            info["url"] = stream["video_url"]
        return [info]
4397 def gen_extractors():
4398 """ Return a list of an instance of every supported extractor.
4399 The order does matter; the first extractor matched is the one handling the URL.
4402 YoutubePlaylistIE(),
4427 StanfordOpenClassroomIE(),
4437 WorldStarHipHopIE(),