2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this view of the file is elided — interior lines are
    # missing throughout this class (e.g. the `_WORKING`/`_ready` class
    # attributes). Gaps are flagged below; confirm against upstream.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): upstream also initializes a readiness flag here;
        # that line appears elided in this view.
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): takes `cls` — the @classmethod decorator is not
        # visible in this view; confirm it exists upstream.
        return re.match(cls._VALID_URL, url) is not None

    """Getter method for _WORKING."""
    # NOTE(review): the enclosing `def working(cls):` and its body appear
    # elided; only the docstring survives.

    """Initializes an instance (authentication, etc)."""
    # NOTE(review): the enclosing `def initialize(self):` and its
    # ready-check appear elided; only the delegation below survives.
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates to the subclass hook; initialization is expected to
        # have happened already.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the `IE_NAME` property definition appears elided; the
    # line below derives the name from the class name minus trailing "IE".
    return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): the `if note is None:` guard appears elided — as
        # written this unconditionally overwrites the caller's note.
        note = u'Downloading video webpage'
        if note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        # NOTE(review): the `try:` opening this handler appears elided.
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): an `if errnote is None:` guard appears elided.
            errnote = u'Unable to download webpage'
            # Re-raise with the original traceback attached.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): the `if m:` guard and utf-8 fallback appear elided.
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # NOTE(review): the `try:` and the plain-string fallback for
            # `url` appear elided around these lines.
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps binary responses printable on the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction alive on mis-declared charsets.
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): the `return video_info` appears elided.

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        # NOTE(review): the remainder of this dict literal and the return
        # appear elided.
        video_info = {'_type': 'url',

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # NOTE(review): the rest of this dict literal, the `if` guards on
        # playlist_id/playlist_title, and the return appear elided.
        video_info = {'_type': 'playlist',
        video_info['id'] = playlist_id
        video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this view of the class is elided — the `_VALID_URL =
    # r"""^(` opener and closing of the verbose regex below, several dict
    # bodies, `try:` lines and `return` statements are missing. Gaps are
    # flagged inline; confirm against upstream before relying on this.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                     tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; most entries of this dict appear elided.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display string; the dict body appears elided.
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs are handled by YoutubePlaylistIE, not here.
        # NOTE(review): takes `cls` — @classmethod decorator not visible.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available video subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle-track list; on error return (message, None)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the `try:` opening this handler appears elided.
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): the success-path `return sub_lang_list` appears elided.

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Request a single subtitle track. Return a tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        # NOTE(review): the body and closing of this urlencode dict appear
        # elided (lang/name/v/fmt parameters).
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): the `try:` opening this handler appears elided.
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # NOTE(review): the `if not sub:` guard appears elided.
        return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # NOTE(review): the `sub_lang = 'en'` body and the trailing
            # `else:` appear elided; the fallback below picks any language.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # NOTE(review): the `return [subtitle]` appears elided.

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): the `subtitles = []` initializer and the final
        # `return subtitles` appear elided.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)

    def _print_formats(self, formats):
        """Print one line per itag: code, extension, dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header appears elided.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Set language, then (optionally) log in and confirm age.
        if self._downloader is None:
        # NOTE(review): the body of the guard above (early return) and the
        # username/password initializers appear elided.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the `try:` and the `if info is not None:`
            # success branch appear elided.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the `try:` and report_lang() call appear elided.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the `if username is None: return` short-circuit
        # appears elided.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        # NOTE(review): the `try:` opening this handler appears elided.
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX / dsh tokens out of the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the `if match:` guard appears elided.
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the `dsh = ...`, `login_form_strs = {` opener and
        # most of the form fields appear elided below.
        u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not touch unicode strings.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # NOTE(review): the `try:` and report_login() call appear elided.
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # The login form being present again means authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): the `age_form = {` opener appears elided.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): the `try:` opening this handler appears elided.
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the video id from a matching URL (group 2 of _VALID_URL)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard and the final
        # `return video_id` appear elided.
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): the `if mobj:` guard appears elided.
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): the `try:` opening this handler appears elided.
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the `if mobj is not None:` / `else: player_url =
        # None` branches appear elided. The sub() unescapes \/ sequences.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several `el` values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            # NOTE(review): the `break` under the guard above appears elided.
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): the `else:` for the branch below appears elided.
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # NOTE(review): the `if mobj is not None:` / `else:` branch lines
        # appear elided around the two statements below.
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            # NOTE(review): the `video_thumbnail = ''` fallback appears elided.
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        # NOTE(review): the `upload_date = None` initializer and the
        # `if mobj is not None:` guard appear elided.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # NOTE(review): the `try:`/`except ValueError: pass` around
                # the parse attempt appears elided.
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): the `else:` for the fallback below appears elided.
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # NOTE(review): the `if video_subtitles:` and `if sub_error:`
            # guards appear elided around the two lines below.
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                # NOTE(review): the `if sub_error:` guard appears elided.
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            # NOTE(review): the early `return` appears elided.

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            # NOTE(review): the `video_duration = ''` fallback and `else:`
            # appear elided.
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): the `else:` for the fallback below appears elided.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # NOTE(review): the early `return` appears elided.
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): the `else:` opening this branch appears elided.
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): the `if rf in url_map:` guard and the
                    # `break` appear elided.
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
        # NOTE(review): the final `else:` appears elided.
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # NOTE(review): the `results = []` initializer appears elided.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the `results.append({` opener, the `'id'` entry,
            # the closing `})` and `return results` appear elided.
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this view is elided — `if mobj is None:` guards,
    # `try:` openers and the final result-list wrapper are missing below.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): the `try:` opening this handler appears elided.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): the `disclaimer_form = {` opener and `filters` field
        # appear elided.
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): the `try:` opening this handler appears elided.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and its `return`
        # appear elided.
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): the `if mobj is not None:` branch opener appears
        # elided before the direct-URL path below.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # NOTE(review): the no-key `video_url = mediaURL` branch
            # appears elided.
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # NOTE(review): the `else:` opening the flashvars fallback path
        # appears elided.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # NOTE(review): `if mobj is None:` guards appear elided before
            # each report_error below.
                self._downloader.report_error(u'unable to extract media URL')

            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                self._downloader.report_error(u'unable to extract media URL')
            # Unescape the \/-encoded slashes in the JSON-ish value.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided.
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided.
            self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opener and closing `}]` around the
        # result fields below appear elided.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this view is elided — guards, the flashvars key-probe
    # body and the final result-list wrapper are missing below.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears elided.
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated pages still resolve.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided.
            self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): the `if key in flashvars:` guard, the
            # `max_quality = key` assignment, the `break`, and the `else:`
            # of the loop appear elided.
                self.to_screen(u'Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): the `if mobj is None:` guard appears elided.
            self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided.
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # NOTE(review): the `if mobj is None:` / `else:` branch lines appear
        # elided around the block below.
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            # NOTE(review): the `else:` for the branch below appears elided.
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # NOTE(review): the `if mobj is not None:` guard appears elided.
        # Reassemble DD-MM-YYYY into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opener, the `'id'`/`'url'` entries
        # and closing `}]` appear elided.
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this view is elided — guards, a `try:` opener, the
    # `video_url = mediaURL` assignment and the result-list wrapper are
    # missing below.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears elided.
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): the `try:` opening this handler appears elided.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided.
            self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # NOTE(review): the `video_url = mediaURL` assignment appears elided.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided.
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opener and closing `}]` around the
        # result fields below appear elided.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
940 class YahooIE(InfoExtractor):
941 """Information extractor for video.yahoo.com."""
944 # _VALID_URL matches all Yahoo! Video URLs
945 # _VPAGE_URL matches only the extractable '/watch/' URLs
946 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
947 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
948 IE_NAME = u'video.yahoo'
950 def __init__(self, downloader=None):
951 InfoExtractor.__init__(self, downloader)
953 def report_download_webpage(self, video_id):
954 """Report webpage download."""
955 self.to_screen(u'%s: Downloading webpage' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video page.

        Non-canonical URLs are first resolved to the English-language
        /watch/ form, after which this method recurses once with
        new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

            # The page embeds its canonical ids as ("id", "...") / ("vid", "...") JS calls.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.report_error(u'Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.report_error(u'Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once on the canonical watch URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.report_error(u'unable to extract video uploader')
        # NOTE(review): group(1) captures the 'people'/'profile' path
        # discriminator, while the uploader name is in group(2) -- confirm
        # the intended capture group here.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.report_error(u'unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.report_error(u'unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.report_error(u'unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.report_error(u'unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.report_error(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result info-dict fields (see class docstring for field meanings).
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Extraction is driven by the JSON `config` object embedded in the
    watch page, plus a couple of HTML meta tags.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo page via its embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize: force https, and turn direct player links into the
        # canonical page URL for the same clip.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (substring between ' = {config:' and ',assets:')
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.report_error(u'unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (meta tag is ISO 8601; reformatted as YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best bucket that has entries: hd, then sd, then other.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        self._downloader.report_error(u'no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Result info-dict fields (see class docstring for field meanings).
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both live streams (index-N.html pages) and regular '+7'
    catch-up videos, which are resolved through a chain of XML refs.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognized by their index-N.html page name.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page contents."""
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and collect the
        (group-index, key, error-message) *matchTuples* into a dict."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve the stream URL for a live event page."""
        # URL layout: .../<lang>/... -- language code sits 4 segments from the end.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of XML refs for a '+7' catch-up video."""
        # Language code sits 3 segments from the end for +7 URLs.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)

        # Result info-dict fields.
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live events and regular '+7' videos take different extraction paths.
        # NOTE(review): the live branch discards extractLiveStream's result
        # while the +7 branch binds it -- confirm intended.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tries HEAD-based redirect resolution first, then scrapes the page for
    common embedded-player patterns (JW Player et al.).
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn only in real runs; the test harness sets params['test'].
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched.
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 origin_req_host=req.get_origin_req_host(),

        # Build a bespoke opener wired with the HEAD-preserving handlers above.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        self.report_following_redirect(new_url)

    def _real_extract(self, url):
        # Shortener/redirect URLs are delegated to the target's extractor.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.report_error(u'unable to extract title')
        video_uploader = mobj.group(1)

        # Result info-dict fields.
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Queries look like 'ytsearch:foo', 'ytsearchN:foo' or 'ytsearchall:foo';
    results are fetched from the GData API in pages of 50.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): .decode() requires a byte string -- assumes the
        # Python 2 str type here; confirm before running under Python 3.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        self._downloader.report_error(u'invalid search query "%s"' % query)

        # Split 'ytsearchN' prefix from the actual query text.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            # NOTE(review): unlike the branch above, this result is not
            # returned -- confirm whether a 'return' is intended here.
            self._get_n_results(query, self._max_youtube_results)
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            return self._get_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Page through the API, 50 results at a time, until n (or the
        # server-reported total) is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exist.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Queries look like 'gvsearch:foo', 'gvsearchN:foo' or 'gvsearchall:foo';
    results are scraped from the HTML search pages, 10 per page.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): .decode() assumes a Python 2 byte string -- confirm.
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        self._downloader.report_error(u'invalid search query "%s"' % query)

        # Split 'gvsearchN' prefix from the actual query text.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
            if len(video_ids) == n:
                # Specified n videos reached
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No next-page link means we exhausted the results.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Mirrors GoogleSearchIE: 'yvsearch[N|all]:query', scraping the HTML
    result pages and dispatching each hit straight to the downloader.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): .decode() assumes a Python 2 byte string -- confirm.
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        self._downloader.report_error(u'invalid search query "%s"' % query)

        # Split 'yvsearchN' prefix from the actual query text.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Dedup across pages; Yahoo repeats hits between result pages.
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
            if len(video_ids) == n:
                # Specified n videos reached
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No next-page link means we exhausted the results.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Playlist entries are fetched from the GData API (JSON), sorted by
    their yt$position, and returned as a playlist of watch-page URLs.
    """

    # Verbose regex; matched with re.VERBOSE (see suitable()).
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a re.VERBOSE pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'invalid url: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        self.report_download_page(playlist_id, page_num)
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = compat_urllib_request.urlopen(url).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        response = json.loads(page)
        except ValueError as err:
            self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            self._downloader.report_error(u'Got a malformed response from YouTube API')
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
        playlist_title = response['feed']['title']['$t']

        # Keep (position, url) pairs so videos can be sorted into playlist order.
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        # A short page means this was the last one.
        if len(response['feed']['entry']) < self._MAX_RESULTS:

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    The first page is plain HTML; subsequent pages come from the
    channel_ajax JSON endpoint.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Collect the watch-page video ids found in *page*, in order,
        without duplicates."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            pagenum = pagenum + 1
            self.report_download_page(channel_id, pagenum)
            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            request = compat_urllib_request.Request(url)
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            # Stop when the load-more widget disappears from the response.
            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Uploads are listed via the GData API in pages of _GDATA_PAGE_SIZE.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    First resolves the numeric users_id from the profile page, then pages
    through the mobile episode-list AJAX endpoint.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        # Extract username from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        # Resolve the numeric users_id embedded in the profile page.
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): str(err) here differs from the compat_str(err)
            # used everywhere else in this file -- confirm intended.
            self._downloader.report_error(u'unable to download webpage: %s' % str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        self.report_download_page(username, pagenum)
        url = page_base + "&page=" + str(pagenum)
        request = compat_urllib_request.Request( url )
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
# Extractor for depositfiles.com file pages: forces the English locale URL,
# POSTs the "Free download" form flag, then scrapes the real fileshare URL
# and the file title out of the returned HTML.
2021 class DepositFilesIE(InfoExtractor):
2022 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment; the "../" segment matches an optional 2-char locale path
2024 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2026 def report_download_webpage(self, file_id):
2027 """Report webpage download."""
2028 self.to_screen(u'%s: Downloading webpage' % file_id)
2030 def _real_extract(self, url):
2031 file_id = url.split('/')[-1]
2032 # Rebuild url in english locale
2033 url = 'http://depositfiles.com/en/files/' + file_id
2035 # Retrieve file webpage with 'Free download' button pressed
2036 free_download_indication = { 'gateway_result' : '1' }
# POST body makes the server respond as if the free-download button was clicked
2037 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2039 self.report_download_webpage(file_id)
2040 webpage = compat_urllib_request.urlopen(request).read()
2041 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2042 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2045 # Search for the real file URL
2046 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2047 if (mobj is None) or (mobj.group(1) is None):
2048 # Try to figure out reason of the error.
2049 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2050 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' is a non-raw string -- works, but r'\s+' is the safe spelling
2051 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2052 self._downloader.report_error(u'%s' % restriction_message)
2054 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2057 file_url = mobj.group(1)
2058 file_extension = os.path.splitext(file_url)[1][1:]
2060 # Search for file title
2061 mobj = re.search(r'<b title="(.*?)">', webpage)
2063 self._downloader.report_error(u'unable to extract title')
# NOTE(review): .decode('utf-8') on these values is Python-2-only (str has no
# .decode on Python 3); confirm which interpreter this module targets
2065 file_title = mobj.group(1).decode('utf-8')
2068 'id': file_id.decode('utf-8'),
2069 'url': file_url.decode('utf-8'),
2071 'upload_date': None,
2072 'title': file_title,
2073 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Optionally logs in (credentials from
# --username/--password or .netrc), then parses the video page's embedded
# SWF parameter JSON for hd_src/sd_src stream URLs.
2077 class FacebookIE(InfoExtractor):
2078 """Information Extractor for Facebook"""
# named group ID captures the numeric video id from video.php/photo.php URLs
2080 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2081 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2082 _NETRC_MACHINE = 'facebook'
2083 IE_NAME = u'facebook'
2085 def report_login(self):
2086 """Report attempt to log in."""
2087 self.to_screen(u'Logging in')
# Login is best-effort: failures emit warnings, extraction proceeds anonymously.
2089 def _real_initialize(self):
2090 if self._downloader is None:
2095 downloader_params = self._downloader.params
2097 # Attempt to use provided username and password or .netrc data
2098 if downloader_params.get('username', None) is not None:
2099 useremail = downloader_params['username']
2100 password = downloader_params['password']
2101 elif downloader_params.get('usenetrc', False):
2103 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2104 if info is not None:
2108 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2109 except (IOError, netrc.NetrcParseError) as err:
2110 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# no credentials available: skip login entirely
2113 if useremail is None:
2122 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2125 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
# NOTE(review): login_results is bytes; matching it with a str pattern only
# works on Python 2 -- confirm target interpreter
2126 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2127 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2129 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2130 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2133 def _real_extract(self, url):
2134 mobj = re.match(self._VALID_URL, url)
2136 self._downloader.report_error(u'invalid URL: %s' % url)
2138 video_id = mobj.group('ID')
2140 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2141 webpage = self._download_webpage(url, video_id)
# The page inlines the flash player setup; the video metadata sits between
# these two literal JavaScript fragments.
2143 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2144 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2145 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2147 raise ExtractorError(u'Cannot parse data')
2148 data = dict(json.loads(m.group(1)))
2149 params_raw = compat_urllib_parse.unquote(data['params'])
2150 params = json.loads(params_raw)
2151 video_data = params['video_data'][0]
# prefer the HD stream, fall back to SD
2152 video_url = video_data.get('hd_src')
2154 video_url = video_data['sd_src']
2156 raise ExtractorError(u'Cannot find video URL')
2157 video_duration = int(video_data['video_duration'])
2158 thumbnail = video_data['thumbnail_src']
2160 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2162 raise ExtractorError(u'Cannot find title in webpage')
2163 video_title = unescapeHTML(m.group(1))
2167 'title': video_title,
2170 'duration': video_duration,
2171 'thumbnail': thumbnail,
# Extractor for individual blip.tv videos. Handles three shapes:
# /play/ embed URLs (resolved via redirect fragment), direct media responses
# (Content-Type video/*), and the regular JSON API ("skin=json").
2176 class BlipTVIE(InfoExtractor):
2177 """Information extractor for blip.tv"""
2179 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# used to pull the filename extension out of the media URL
2180 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2181 IE_NAME = u'blip.tv'
2183 def report_direct_download(self, title):
2184 """Report information extraction."""
2185 self.to_screen(u'%s: Direct download detected' % title)
2187 def _real_extract(self, url):
2188 mobj = re.match(self._VALID_URL, url)
2190 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the real file id;
# rewrite to the canonical /a/a-<id> URL and recurse once.
2193 urlp = compat_urllib_parse_urlparse(url)
2194 if urlp.path.startswith('/play/'):
2195 request = compat_urllib_request.Request(url)
2196 response = compat_urllib_request.urlopen(request)
2197 redirecturl = response.geturl()
2198 rurlp = compat_urllib_parse_urlparse(redirecturl)
2199 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2200 url = 'http://blip.tv/a/a-' + file_id
2201 return self._real_extract(url)
# cchar ('?' or '&') is chosen in an elided branch depending on the URL
2208 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2209 request = compat_urllib_request.Request(json_url)
# blip.tv serves different (JSON-capable) responses to the iTunes UA
2210 request.add_header('User-Agent', 'iTunes/10.6.1')
2211 self.report_extraction(mobj.group(1))
2214 urlh = compat_urllib_request.urlopen(request)
2215 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2216 basename = url.split('/')[-1]
2217 title,ext = os.path.splitext(basename)
# NOTE(review): title.decode('UTF-8') is Python-2-only (str has no .decode on py3)
2218 title = title.decode('UTF-8')
2219 ext = ext.replace('.', '')
2220 self.report_direct_download(title)
2225 'upload_date': None,
2230 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2231 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2232 if info is None: # Regular URL
2234 json_code_bytes = urlh.read()
2235 json_code = json_code_bytes.decode('utf-8')
2236 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2237 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2241 json_data = json.loads(json_code)
# the API wraps single items in a "Post" key; otherwise the object is the item
2242 if 'Post' in json_data:
2243 data = json_data['Post']
# datestamp format observed from the API, e.g. "12-31-12 11:05PM"
2247 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2248 video_url = data['media']['url']
2249 umobj = re.match(self._URL_EXT, video_url)
2251 raise ValueError('Can not determine filename extension')
2252 ext = umobj.group(1)
2255 'id': data['item_id'],
2257 'uploader': data['display_name'],
2258 'upload_date': upload_date,
2259 'title': data['title'],
2261 'format': data['media']['mimeType'],
2262 'thumbnail': data['thumbnailUrl'],
2263 'description': data['description'],
2264 'player_url': data['embedUrl'],
# downstream downloader must keep using the iTunes UA or the CDN refuses
2265 'user_agent': 'iTunes/10.6.1',
2267 except (ValueError,KeyError) as err:
2268 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Downloads the watch page for a video id, scrapes the media-server base
    URL from the image_src <link> tag and the title from <title>, and
    returns a single-entry result list pointing at the derived .flv URL.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was "self._download.report_error" (attribute typo),
            # which raised AttributeError instead of reporting the bad URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail <link> encodes the media-server base URL; the flv
        # lives alongside the thumbs directory under the same movie hash.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
# Extractor for The Daily Show / Colbert Report. Resolves shortname URLs to
# the full-episode index, locates the mtvnservices media URI in the page,
# downloads the MRSS show index, then a per-item config XML listing RTMP
# renditions, and rewrites the RTMP URL to an HTTP mirror.
2319 class ComedyCentralIE(InfoExtractor):
2320 """Information extractor for The Daily Show and Colbert Report """
2322 # urls can be abbreviations like :thedailyshow or :colbert
2323 # urls for episodes like:
2324 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2325 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2326 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# verbose-mode regex: every re.match against it below passes re.VERBOSE
2327 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2328 |(https?://)?(www\.)?
2329 (?P<showname>thedailyshow|colbertnation)\.com/
2330 (full-episodes/(?P<episode>.*)|
2332 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2333 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# bitrates in ascending order; the last (highest) is picked by default
2336 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2338 _video_extensions = {
2346 _video_dimensions = {
# presumably decorated with @classmethod in an elided line -- it takes cls
2356 def suitable(cls, url):
2357 """Receives a URL and returns True if suitable for this IE."""
# overridden (vs. base class) because _VALID_URL needs re.VERBOSE
2358 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2360 def report_config_download(self, episode_id, media_id):
2361 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2363 def report_index_download(self, episode_id):
2364 self.to_screen(u'%s: Downloading show index' % episode_id)
2366 def _print_formats(self, formats):
2367 print('Available formats:')
2369 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2372 def _real_extract(self, url):
2373 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2375 self._downloader.report_error(u'invalid URL: %s' % url)
# shortname forms (":tds", ":colbert") are rewritten to the full-episodes URL
2378 if mobj.group('shortname'):
2379 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2380 url = u'http://www.thedailyshow.com/full-episodes/'
2382 url = u'http://www.colbertnation.com/full-episodes/'
2383 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2384 assert mobj is not None
2386 if mobj.group('clip'):
2387 if mobj.group('showname') == 'thedailyshow':
2388 epTitle = mobj.group('tdstitle')
2390 epTitle = mobj.group('cntitle')
2393 dlNewest = not mobj.group('episode')
2395 epTitle = mobj.group('showname')
2397 epTitle = mobj.group('episode')
2399 req = compat_urllib_request.Request(url)
2400 self.report_extraction(epTitle)
2402 htmlHandle = compat_urllib_request.urlopen(req)
2403 html = htmlHandle.read()
2404 webpage = html.decode('utf-8')
2405 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2406 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# the index page redirects to the newest episode; re-match on the final URL
2409 url = htmlHandle.geturl()
2410 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2412 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2414 if mobj.group('episode') == '':
2415 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2417 epTitle = mobj.group('episode')
2419 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2421 if len(mMovieParams) == 0:
2422 # The Colbert Report embeds the information in a without
2423 # a URL prefix; so extract the alternate reference
2424 # and then add the URL prefix manually.
2426 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2427 if len(altMovieParams) == 0:
2428 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2431 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2433 uri = mMovieParams[0][1]
2434 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2435 self.report_index_download(epTitle)
2437 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2438 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2439 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
# one MRSS <item> per episode part; each is fetched and extracted separately
2444 idoc = xml.etree.ElementTree.fromstring(indexXml)
2445 itemEls = idoc.findall('.//item')
2446 for partNum,itemEl in enumerate(itemEls):
2447 mediaId = itemEl.findall('./guid')[0].text
2448 shortMediaId = mediaId.split(':')[-1]
2449 showId = mediaId.split(':')[-2].replace('.com', '')
2450 officialTitle = itemEl.findall('./title')[0].text
2451 officialDate = itemEl.findall('./pubDate')[0].text
2453 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2454 compat_urllib_parse.urlencode({'uri': mediaId}))
2455 configReq = compat_urllib_request.Request(configUrl)
2456 self.report_config_download(epTitle, shortMediaId)
2458 configXml = compat_urllib_request.urlopen(configReq).read()
2459 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2460 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2463 cdoc = xml.etree.ElementTree.fromstring(configXml)
2465 for rendition in cdoc.findall('.//rendition'):
2466 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2470 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2473 if self._downloader.params.get('listformats', None):
2474 self._print_formats([i[0] for i in turls])
2477 # For now, just pick the highest bitrate
2478 format,rtmp_video_url = turls[-1]
2480 # Get the format arg from the arg stream
2481 req_format = self._downloader.params.get('format', None)
2483 # Select format if we can find one
2486 format, rtmp_video_url = f, v
# RTMP streams are mirrored over plain HTTP at this llnwd.net base URL
2489 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2491 raise ExtractorError(u'Cannot transform RTMP url')
2492 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2493 video_url = base + m.group('finalid')
2495 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2500 'upload_date': officialDate,
2505 'description': officialTitle,
2507 results.append(info)
# Extractor for escapistmagazine.com: reads og:* meta tags from the page,
# follows the player's "config=" query parameter to a JSON(ish) config file,
# and takes the second playlist entry as the video URL.
2512 class EscapistIE(InfoExtractor):
2513 """Information extractor for The Escapist """
2515 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2516 IE_NAME = u'escapist'
2518 def report_config_download(self, showName):
2519 self.to_screen(u'%s: Downloading configuration' % showName)
2521 def _real_extract(self, url):
2522 mobj = re.match(self._VALID_URL, url)
2524 self._downloader.report_error(u'invalid URL: %s' % url)
2526 showName = mobj.group('showname')
2527 videoId = mobj.group('episode')
2529 self.report_extraction(showName)
2531 webPage = compat_urllib_request.urlopen(url)
2532 webPageBytes = webPage.read()
# honor the declared charset, defaulting to utf-8 when absent
2533 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2534 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2535 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2536 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# NOTE(review): the four matches below are used without None checks -- a layout
# change would raise AttributeError on .group(1); confirm acceptable
2539 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2540 description = unescapeHTML(descMatch.group(1))
2541 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2542 imgUrl = unescapeHTML(imgMatch.group(1))
2543 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2544 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# the SWF player URL carries the config file location in its query string
2545 configUrlMatch = re.search('config=(.*)$', playerUrl)
2546 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2548 self.report_config_download(showName)
2550 configJSON = compat_urllib_request.urlopen(configUrl)
2551 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2552 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2553 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2554 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2557 # Technically, it's JavaScript, not JSON
# naive quote swap to coerce the JS object literal into parseable JSON
2558 configJSON = configJSON.replace("'", '"')
2561 config = json.loads(configJSON)
2562 except (ValueError,) as err:
2563 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2566 playlist = config['playlist']
# index 1: entry 0 is presumably an ad/intro -- TODO confirm
2567 videoUrl = playlist[1]['url']
2572 'uploader': showName,
2573 'upload_date': None,
2576 'thumbnail': imgUrl,
2577 'description': description,
2578 'player_url': playerUrl,
# Extractor for collegehumor.com: reads the moogaloop metadata XML for
# title/description/thumbnail and a manifest URL, then reads the f4m
# manifest to assemble the first-segment media URL.
2583 class CollegeHumorIE(InfoExtractor):
2584 """Information extractor for collegehumor.com"""
2587 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2588 IE_NAME = u'collegehumor'
2590 def report_manifest(self, video_id):
2591 """Report information extraction."""
2592 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2594 def _real_extract(self, url):
2595 mobj = re.match(self._VALID_URL, url)
2597 self._downloader.report_error(u'invalid URL: %s' % url)
2599 video_id = mobj.group('videoid')
2604 'upload_date': None,
2607 self.report_extraction(video_id)
2608 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2610 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2611 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2612 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# metadata XML: <video> holds description/caption/thumbnail/file children
2615 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2617 videoNode = mdoc.findall('./video')[0]
2618 info['description'] = videoNode.findall('./description')[0].text
2619 info['title'] = videoNode.findall('./caption')[0].text
2620 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2621 manifest_url = videoNode.findall('./file')[0].text
2623 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore param required for the Adobe HTTP Dynamic Streaming manifest
2626 manifest_url += '?hdcore=2.10.3'
2627 self.report_manifest(video_id)
2629 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2630 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2631 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# f4m manifest (Adobe namespace): media/@url plus the document id build the
# final Seg1-Frag1 fragment URL
2634 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2636 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2637 node_id = media_node.attrib['url']
2638 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2639 except IndexError as err:
2640 self._downloader.report_error(u'Invalid manifest file')
2643 url_pr = compat_urllib_parse_urlparse(manifest_url)
2644 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: scrapes the flv URL from the "flv_url=" player
# parameter, the title from <title>, and the thumbnail from an inline image URL.
2651 class XVideosIE(InfoExtractor):
2652 """Information extractor for xvideos.com"""
2654 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2655 IE_NAME = u'xvideos'
2657 def _real_extract(self, url):
2658 mobj = re.match(self._VALID_URL, url)
2660 self._downloader.report_error(u'invalid URL: %s' % url)
2662 video_id = mobj.group(1)
2664 webpage = self._download_webpage(url, video_id)
2666 self.report_extraction(video_id)
# flv_url is URL-encoded inside the flash player query string
2670 mobj = re.search(r'flv_url=(.+?)&', webpage)
2672 self._downloader.report_error(u'unable to extract video url')
2674 video_url = compat_urllib_parse.unquote(mobj.group(1))
2678 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2680 self._downloader.report_error(u'unable to extract video title')
2682 video_title = mobj.group(1)
2685 # Extract video thumbnail
# NOTE(review): dots in "img.*?" / "xvideos.com" are unescaped regex dots --
# over-matches slightly; group(0) (the whole match) is used as the URL
2686 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2688 self._downloader.report_error(u'unable to extract video thumbnail')
2690 video_thumbnail = mobj.group(0)
2696 'upload_date': None,
2697 'title': video_title,
2699 'thumbnail': video_thumbnail,
2700 'description': None,
# Extractor for a single soundcloud.com track: resolves the page URL to a
# track object via the public resolve API, then fetches the streams endpoint
# for the 128kbps MP3 URL.
2706 class SoundcloudIE(InfoExtractor):
2707 """Information extractor for soundcloud.com
2708 To access the media, the uid of the song and a stream token
2709 must be extracted from the page source and the script must make
2710 a request to media.soundcloud.com/crossdomain.xml. Then
2711 the media can be grabbed by requesting from an url composed
2712 of the stream token and uid
# group(1) = uploader slug, group(2) = track slug
2715 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2716 IE_NAME = u'soundcloud'
2718 def __init__(self, downloader=None):
2719 InfoExtractor.__init__(self, downloader)
2721 def report_resolve(self, video_id):
2722 """Report information extraction."""
2723 self.to_screen(u'%s: Resolving id' % video_id)
2725 def _real_extract(self, url):
2726 mobj = re.match(self._VALID_URL, url)
2728 self._downloader.report_error(u'invalid URL: %s' % url)
2731 # extract uploader (which is in the url)
2732 uploader = mobj.group(1)
2733 # extract simple title (uploader + slug of song title)
2734 slug_title = mobj.group(2)
2735 simple_title = uploader + u'-' + slug_title
2737 self.report_resolve('%s/%s' % (uploader, slug_title))
2739 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
# NOTE(review): hard-coded API client_id -- subject to revocation by SoundCloud
2740 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2741 request = compat_urllib_request.Request(resolv_url)
2743 info_json_bytes = compat_urllib_request.urlopen(request).read()
2744 info_json = info_json_bytes.decode('utf-8')
2745 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2746 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2749 info = json.loads(info_json)
2750 video_id = info['id']
2751 self.report_extraction('%s/%s' % (uploader, slug_title))
2753 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2754 request = compat_urllib_request.Request(streams_url)
2756 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2757 stream_json = stream_json_bytes.decode('utf-8')
2758 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2759 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2762 streams = json.loads(stream_json)
2763 mediaURL = streams['http_mp3_128_url']
2768 'uploader': info['user']['username'],
2769 'upload_date': info['created_at'],
2770 'title': info['title'],
2772 'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set URL to its
# track list, then fetches the streams endpoint per track. Structured as a
# per-track copy of SoundcloudIE's logic.
2775 class SoundcloudSetIE(InfoExtractor):
2776 """Information extractor for soundcloud.com sets
2777 To access the media, the uid of the song and a stream token
2778 must be extracted from the page source and the script must make
2779 a request to media.soundcloud.com/crossdomain.xml. Then
2780 the media can be grabbed by requesting from an url composed
2781 of the stream token and uid
2784 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
# NOTE(review): IE_NAME duplicates SoundcloudIE's u'soundcloud'; something
# like u'soundcloud:set' would disambiguate -- confirm before changing
2785 IE_NAME = u'soundcloud'
2787 def __init__(self, downloader=None):
2788 InfoExtractor.__init__(self, downloader)
2790 def report_resolve(self, video_id):
2791 """Report information extraction."""
2792 self.to_screen(u'%s: Resolving id' % video_id)
2794 def _real_extract(self, url):
2795 mobj = re.match(self._VALID_URL, url)
# NOTE(review): this class still calls the older trouble() API while sibling
# extractors use report_error() -- inconsistent; confirm trouble() still exists
2797 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2800 # extract uploader (which is in the url)
2801 uploader = mobj.group(1)
2802 # extract simple title (uploader + slug of song title)
2803 slug_title = mobj.group(2)
2804 simple_title = uploader + u'-' + slug_title
2806 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2808 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2809 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2810 request = compat_urllib_request.Request(resolv_url)
2812 info_json_bytes = compat_urllib_request.urlopen(request).read()
2813 info_json = info_json_bytes.decode('utf-8')
2814 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2815 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2819 info = json.loads(info_json)
# the resolve API reports per-set errors inline rather than via HTTP status
2820 if 'errors' in info:
2821 for err in info['errors']:
2822 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2825 for track in info['tracks']:
2826 video_id = track['id']
2827 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2829 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2830 request = compat_urllib_request.Request(streams_url)
2832 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2833 stream_json = stream_json_bytes.decode('utf-8')
2834 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2835 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2838 streams = json.loads(stream_json)
2839 mediaURL = streams['http_mp3_128_url']
2844 'uploader': track['user']['username'],
2845 'upload_date': track['created_at'],
2846 'title': track['title'],
2848 'description': track['description'],
# Extractor for infoq.com talks: the real media id is base64-encoded in the
# page's jsclassref attribute and is appended to a fixed rtmpe base URL.
2853 class InfoQIE(InfoExtractor):
2854 """Information extractor for infoq.com"""
2855 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2857 def _real_extract(self, url):
2858 mobj = re.match(self._VALID_URL, url)
2860 self._downloader.report_error(u'invalid URL: %s' % url)
# the URL itself doubles as the display id for progress messages
2863 webpage = self._download_webpage(url, video_id=url)
2864 self.report_extraction(url)
2867 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2869 self._downloader.report_error(u'unable to extract video url')
# base64-decode, then URL-unquote: yields the path fragment of the stream
2871 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2872 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2875 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2877 self._downloader.report_error(u'unable to extract video title')
2879 video_title = mobj.group(1)
2881 # Extract description
2882 video_description = u'No description available.'
2883 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2884 if mobj is not None:
2885 video_description = mobj.group(1)
# derive the video id and extension from the final path component
2887 video_filename = video_url.split('/')[-1]
2888 video_id, extension = video_filename.split('.')
2894 'upload_date': None,
2895 'title': video_title,
2896 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2898 'description': video_description,
# Extractor for mixcloud.com (marked broken via _WORKING = False): fetches the
# old v1 cloudcast JSON, picks a format/bitrate from 'audio_formats', and
# probes candidate URLs until one responds.
2903 class MixcloudIE(InfoExtractor):
2904 """Information extractor for www.mixcloud.com"""
# disabled: skipped by tests and users are warned (see _WORKING contract)
2906 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2907 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2908 IE_NAME = u'mixcloud'
2910 def __init__(self, downloader=None):
2911 InfoExtractor.__init__(self, downloader)
2913 def report_download_json(self, file_id):
2914 """Report JSON download."""
2915 self.to_screen(u'Downloading json')
2917 def get_urls(self, jsonData, fmt, bitrate='best'):
2918 """Get urls from 'audio_formats' section in json"""
2921 bitrate_list = jsonData[fmt]
2922 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2923 bitrate = max(bitrate_list) # select highest
2925 url_list = jsonData[fmt][bitrate]
# some entries map format directly to a url list with no bitrate level
2926 except TypeError: # we have no bitrate info.
2927 url_list = jsonData[fmt]
2930 def check_urls(self, url_list):
2931 """Returns 1st active url from list"""
# probes each candidate with a live request; swallows network errors and
# moves on to the next candidate
2932 for url in url_list:
2934 compat_urllib_request.urlopen(url)
2936 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2941 def _print_formats(self, formats):
2942 print('Available formats:')
2943 for fmt in formats.keys():
2944 for b in formats[fmt]:
2946 ext = formats[fmt][b][0]
2947 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2948 except TypeError: # we have no bitrate info
2949 ext = formats[fmt][0]
2950 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2953 def _real_extract(self, url):
2954 mobj = re.match(self._VALID_URL, url)
2956 self._downloader.report_error(u'invalid URL: %s' % url)
2958 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups is Python-2-only (str has no
# .decode on Python 3); same applies to the decode calls in the result dict below
2959 uploader = mobj.group(1).decode('utf-8')
2960 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2962 # construct API request
2963 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2964 # retrieve .json file with links to files
2965 request = compat_urllib_request.Request(file_url)
2967 self.report_download_json(file_url)
2968 jsonData = compat_urllib_request.urlopen(request).read()
2969 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2970 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2974 json_data = json.loads(jsonData)
2975 player_url = json_data['player_swf_url']
2976 formats = dict(json_data['audio_formats'])
2978 req_format = self._downloader.params.get('format', None)
2981 if self._downloader.params.get('listformats', None):
2982 self._print_formats(formats)
# 'best' (or no) requested format: take the first format whose URL probe succeeds
2985 if req_format is None or req_format == 'best':
2986 for format_param in formats.keys():
2987 url_list = self.get_urls(formats, format_param)
2989 file_url = self.check_urls(url_list)
2990 if file_url is not None:
2993 if req_format not in formats:
2994 self._downloader.report_error(u'format is not available')
2997 url_list = self.get_urls(formats, req_format)
2998 file_url = self.check_urls(url_list)
2999 format_param = req_format
3002 'id': file_id.decode('utf-8'),
3003 'url': file_url.decode('utf-8'),
3004 'uploader': uploader.decode('utf-8'),
3005 'upload_date': None,
3006 'title': json_data['name'],
3007 'ext': file_url.split('.')[-1].decode('utf-8'),
3008 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3009 'thumbnail': json_data['thumbnail_url'],
3010 'description': json_data['description'],
3011 'player_url': player_url.decode('utf-8'),
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing (info-dict openers, "try:"
# lines, "if mobj is None:" guards, "return results"). Kept byte-identical.
# Purpose (from visible code): extractor for Stanford Open ClassRoom. Three
# visible branches of _real_extract: a specific video (course+video groups;
# fetches per-video XML metadata via xml.etree.ElementTree), a course page
# (scrapes VideoPage.php links and recurses via self.extract), and the root
# HomePage.php (scrapes CoursePage.php links and recurses likewise).
3014 class StanfordOpenClassroomIE(InfoExtractor):
3015 """Information extractor for Stanford's Open ClassRoom"""
3017 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3018 IE_NAME = u'stanfordoc'
3020 def report_download_webpage(self, objid):
3021 """Report information extraction."""
3022 self.to_screen(u'%s: Downloading webpage' % objid)
3024 def _real_extract(self, url):
3025 mobj = re.match(self._VALID_URL, url)
3027 raise ExtractorError(u'Invalid URL: %s' % url)
3029 if mobj.group('course') and mobj.group('video'): # A specific video
3030 course = mobj.group('course')
3031 video = mobj.group('video')
3033 'id': course + '_' + video,
3035 'upload_date': None,
3038 self.report_extraction(info['id'])
3039 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3040 xmlUrl = baseUrl + video + '.xml'
3042 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3043 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3044 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3046 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3048 info['title'] = mdoc.findall('./title')[0].text
3049 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3051 self._downloader.report_error(u'Invalid metadata XML file')
3053 info['ext'] = info['url'].rpartition('.')[2]
3055 elif mobj.group('course'): # A course page
3056 course = mobj.group('course')
3061 'upload_date': None,
3064 coursepage = self._download_webpage(url, info['id'],
3065 note='Downloading course info page',
3066 errnote='Unable to download course info page')
3068 m = re.search('<h1>([^<]+)</h1>', coursepage)
3070 info['title'] = unescapeHTML(m.group(1))
3072 info['title'] = info['id']
3074 m = re.search('<description>([^<]+)</description>', coursepage)
3076 info['description'] = unescapeHTML(m.group(1))
3078 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3081 'type': 'reference',
3082 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3086 for entry in info['list']:
3087 assert entry['type'] == 'reference'
3088 results += self.extract(entry['url'])
# Root branch: enumerate all course pages from the HomePage.
3092 'id': 'Stanford OpenClassroom',
3095 'upload_date': None,
3098 self.report_download_webpage(info['id'])
3099 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3101 rootpage = compat_urllib_request.urlopen(rootURL).read()
3102 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3103 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3106 info['title'] = info['id']
3108 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3111 'type': 'reference',
3112 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3117 for entry in info['list']:
3118 assert entry['type'] == 'reference'
3119 results += self.extract(entry['url'])
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing ("if mobj is None:" guards,
# "try:" lines, the "info = {" opener and "return [info]"). Kept
# byte-identical; restore from pristine source before editing.
# Purpose (from visible code): MTV.com extractor -- scrapes mtv_vt (song
# name), mtv_an (performer) and mtvn_uri meta tags plus the default playlist
# id, fetches the mediaGen XML, and picks the last <rendition> (comment says
# highest quality) for the video URL and format string.
3122 class MTVIE(InfoExtractor):
3123 """Information extractor for MTV.com"""
3125 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3128 def _real_extract(self, url):
3129 mobj = re.match(self._VALID_URL, url)
3131 self._downloader.report_error(u'invalid URL: %s' % url)
3133 if not mobj.group('proto'):
3134 url = 'http://' + url
3135 video_id = mobj.group('videoid')
3137 webpage = self._download_webpage(url, video_id)
3139 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3141 self._downloader.report_error(u'unable to extract song name')
3143 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3144 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3146 self._downloader.report_error(u'unable to extract performer')
3148 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3149 video_title = performer + ' - ' + song_name
3151 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3153 self._downloader.report_error(u'unable to mtvn_uri')
3155 mtvn_uri = mobj.group(1)
3157 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3159 self._downloader.report_error(u'unable to extract content id')
3161 content_id = mobj.group(1)
3163 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3164 self.report_extraction(video_id)
3165 request = compat_urllib_request.Request(videogen_url)
3167 metadataXml = compat_urllib_request.urlopen(request).read()
3168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3169 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3172 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3173 renditions = mdoc.findall('.//rendition')
3175 # For now, always pick the highest quality.
3176 rendition = renditions[-1]
3179 _,_,ext = rendition.attrib['type'].partition('/')
3180 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3181 video_url = rendition.find('./src').text
3183 self._downloader.trouble('Invalid rendition field.')
# Fragment of the returned info dictionary (opener and return missing).
3189 'uploader': performer,
3190 'upload_date': None,
3191 'title': video_title,
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing (the _gen_sid "def" line,
# accumulator initialisers like "mixed = []"/"realId = []", "try:" lines,
# format-selection branches, and "return files_info"). Kept byte-identical.
# Purpose (from visible code): Youku extractor -- fetches getPlayList JSON,
# derives the obfuscated file id via a seeded pseudo-random shuffle of a
# fixed alphabet (_get_file_ID_mix_string / _get_file_id), then builds one
# getFlvPath download URL per segment, patching the segment number into
# fileid[8:10] as two uppercase hex digits.
3199 class YoukuIE(InfoExtractor):
3200 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3202 def report_download_webpage(self, file_id):
3203 """Report webpage download."""
3204 self.to_screen(u'%s: Downloading webpage' % file_id)
# Session id: millisecond timestamp + two bounded random ints (def line missing).
3207 nowTime = int(time.time() * 1000)
3208 random1 = random.randint(1000,1998)
3209 random2 = random.randint(1000,9999)
3211 return "%d%d%d" %(nowTime,random1,random2)
3213 def _get_file_ID_mix_string(self, seed):
3215 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3217 for i in range(len(source)):
3218 seed = (seed * 211 + 30031 ) % 65536
3219 index = math.floor(seed / 65536 * len(source) )
3220 mixed.append(source[int(index)])
3221 source.remove(source[int(index)])
3222 #return ''.join(mixed)
3225 def _get_file_id(self, fileId, seed):
3226 mixed = self._get_file_ID_mix_string(seed)
3227 ids = fileId.split('*')
3231 realId.append(mixed[int(ch)])
3232 return ''.join(realId)
3234 def _real_extract(self, url):
3235 mobj = re.match(self._VALID_URL, url)
3237 self._downloader.report_error(u'invalid URL: %s' % url)
3239 video_id = mobj.group('ID')
3241 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3243 request = compat_urllib_request.Request(info_url, None, std_headers)
3245 self.report_download_webpage(video_id)
3246 jsondata = compat_urllib_request.urlopen(request).read()
3247 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3248 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3251 self.report_extraction(video_id)
3253 jsonstr = jsondata.decode('utf-8')
3254 config = json.loads(jsonstr)
3256 video_title = config['data'][0]['title']
3257 seed = config['data'][0]['seed']
3259 format = self._downloader.params.get('format', None)
3260 supported_format = list(config['data'][0]['streamfileids'].keys())
3262 if format is None or format == 'best':
3263 if 'hd2' in supported_format:
3268 elif format == 'worst':
3276 fileid = config['data'][0]['streamfileids'][format]
3277 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3278 except (UnicodeDecodeError, ValueError, KeyError):
3279 self._downloader.report_error(u'unable to extract info section')
3283 sid = self._gen_sid()
3284 fileid = self._get_file_id(fileid, seed)
3286 #column 8,9 of fileid represent the segment number
3287 #fileid[7:9] should be changed
3288 for index, key in enumerate(keys):
3290 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3291 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# Per-segment info-dict fragment (opener missing); one entry per segment.
3294 'id': '%s_part%02d' % (video_id, index),
3295 'url': download_url,
3297 'upload_date': None,
3298 'title': video_title,
3301 files_info.append(info)
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing ("if mobj is None:"/"if
# result is None:" guards, "try:", and the "return [{ ... }]" opener of the
# trailing dict fragment). Kept byte-identical.
# Purpose (from visible code): xnxx.com extractor -- downloads the page and
# pulls flv_url (percent-decoded via compat_urllib_parse.unquote), the
# <title>-based video title, and url_bigthumb via the three class regexes.
3306 class XNXXIE(InfoExtractor):
3307 """Information extractor for xnxx.com"""
3309 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3311 VIDEO_URL_RE = r'flv_url=(.*?)&'
3312 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3313 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3315 def report_webpage(self, video_id):
3316 """Report information extraction"""
3317 self.to_screen(u'%s: Downloading webpage' % video_id)
3319 def _real_extract(self, url):
3320 mobj = re.match(self._VALID_URL, url)
3322 self._downloader.report_error(u'invalid URL: %s' % url)
3324 video_id = mobj.group(1)
3326 self.report_webpage(video_id)
3328 # Get webpage content
3330 webpage_bytes = compat_urllib_request.urlopen(url).read()
3331 webpage = webpage_bytes.decode('utf-8')
3332 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3333 self._downloader.report_error(u'unable to download video webpage: %s' % err)
3336 result = re.search(self.VIDEO_URL_RE, webpage)
3338 self._downloader.report_error(u'unable to extract video url')
3340 video_url = compat_urllib_parse.unquote(result.group(1))
3342 result = re.search(self.VIDEO_TITLE_RE, webpage)
3344 self._downloader.report_error(u'unable to extract video title')
3346 video_title = result.group(1)
3348 result = re.search(self.VIDEO_THUMB_RE, webpage)
3350 self._downloader.report_error(u'unable to extract video thumbnail')
3352 video_thumbnail = result.group(1)
# Returned info-dict fragment (opener and id/url/ext keys missing).
3358 'upload_date': None,
3359 'title': video_title,
3361 'thumbnail': video_thumbnail,
3362 'description': None,
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing ("if mobj is None:" guards,
# "try:" lines, upload_date/uploader/title failure branches, and the
# "return [{ ... }]" opener of the trailing dict fragment). Kept
# byte-identical; restore from pristine source before editing.
# Purpose (from visible code): Google+ post extractor -- step 1 downloads
# the post page and scrapes timestamp (reformatted to YYYYMMDD), author and
# first Description line as the title; step 2 follows the photos page and
# collects redirector.googlevideo.com links, sorting and taking the last
# tuple's URL (comment: highest resolution), then unicode-unescapes it with
# a py2/py3 compatibility fallback.
3366 class GooglePlusIE(InfoExtractor):
3367 """Information extractor for plus.google.com."""
3369 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3370 IE_NAME = u'plus.google'
3372 def __init__(self, downloader=None):
3373 InfoExtractor.__init__(self, downloader)
3375 def report_extract_entry(self, url):
3376 """Report downloading extry"""
3377 self.to_screen(u'Downloading entry: %s' % url)
3379 def report_date(self, upload_date):
3380 """Report downloading extry"""
3381 self.to_screen(u'Entry date: %s' % upload_date)
3383 def report_uploader(self, uploader):
3384 """Report downloading extry"""
3385 self.to_screen(u'Uploader: %s' % uploader)
3387 def report_title(self, video_title):
3388 """Report downloading extry"""
3389 self.to_screen(u'Title: %s' % video_title)
3391 def report_extract_vid_page(self, video_page):
3392 """Report information extraction."""
3393 self.to_screen(u'Extracting video page: %s' % video_page)
3395 def _real_extract(self, url):
3396 # Extract id from URL
3397 mobj = re.match(self._VALID_URL, url)
3399 self._downloader.report_error(u'Invalid URL: %s' % url)
3402 post_url = mobj.group(0)
3403 video_id = mobj.group(1)
3405 video_extension = 'flv'
3407 # Step 1, Retrieve post webpage to extract further information
3408 self.report_extract_entry(post_url)
3409 request = compat_urllib_request.Request(post_url)
3411 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3412 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3413 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3416 # Extract update date
3418 pattern = 'title="Timestamp">(.*?)</a>'
3419 mobj = re.search(pattern, webpage)
3421 upload_date = mobj.group(1)
3422 # Convert timestring to a format suitable for filename
3423 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3424 upload_date = upload_date.strftime('%Y%m%d')
3425 self.report_date(upload_date)
3429 pattern = r'rel\="author".*?>(.*?)</a>'
3430 mobj = re.search(pattern, webpage)
3432 uploader = mobj.group(1)
3433 self.report_uploader(uploader)
3436 # Get the first line for title
3438 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3439 mobj = re.search(pattern, webpage)
3441 video_title = mobj.group(1)
3442 self.report_title(video_title)
3444 # Step 2, Stimulate clicking the image box to launch video
3445 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3446 mobj = re.search(pattern, webpage)
3448 self._downloader.report_error(u'unable to extract video page URL')
3450 video_page = mobj.group(1)
3451 request = compat_urllib_request.Request(video_page)
3453 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3454 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3455 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3457 self.report_extract_vid_page(video_page)
3460 # Extract video links on video page
3461 """Extract video links of all sizes"""
3462 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3463 mobj = re.findall(pattern, webpage)
3465 self._downloader.report_error(u'unable to extract video links')
3467 # Sort in resolution
3468 links = sorted(mobj)
3470 # Choose the lowest of the sort, i.e. highest resolution
3471 video_url = links[-1]
3472 # Only get the url. The resolution part in the tuple has no use anymore
3473 video_url = video_url[-1]
3474 # Treat escaped \u0026 style hex
3476 video_url = video_url.decode("unicode_escape")
3477 except AttributeError: # Python 3
3478 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Returned info-dict fragment (opener and id/url keys missing).
3484 'uploader': uploader,
3485 'upload_date': upload_date,
3486 'title': video_title,
3487 'ext': video_extension,
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing ("if mobj is None:", the "if
# m is None: return default" half of _findProp, the "info = { ... return
# [info]" surroundings of the trailing fragment). Kept byte-identical.
# Purpose (from visible code): NBA.com extractor -- derives the CDN mp4 URL
# directly from the path-based video id and scrapes title/date/description
# from meta tags via the local _findProp helper.
3490 class NBAIE(InfoExtractor):
3491 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3494 def _real_extract(self, url):
3495 mobj = re.match(self._VALID_URL, url)
3497 self._downloader.report_error(u'invalid URL: %s' % url)
3500 video_id = mobj.group(1)
3501 if video_id.endswith('/index.html'):
3502 video_id = video_id[:-len('/index.html')]
3504 webpage = self._download_webpage(url, video_id)
3506 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3507 def _findProp(rexp, default=None):
3508 m = re.search(rexp, webpage)
3510 return unescapeHTML(m.group(1))
3514 shortened_video_id = video_id.rpartition('/')[2]
3515 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
# Returned info-dict fragment (opener, url/ext/title keys and return missing).
3517 'id': shortened_video_id,
3521 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3522 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing ("try:" lines, the per-clip
# "info = {" opener, "if mobj is None:", the paged/offset loop header, and
# "return info"). Kept byte-identical; restore before editing.
# Purpose (from visible code): justin.tv / twitch.tv extractor -- queries
# the api.justin.tv JSON endpoints (channel archives when the URL names a
# channel, by_archive for a single broadcast) and pages through results
# _JUSTIN_PAGE_LIMIT at a time via _parse_page, which converts each clip
# record into an info dict.
3526 class JustinTVIE(InfoExtractor):
3527 """Information extractor for justin.tv and twitch.tv"""
3528 # TODO: One broadcast may be split into multiple videos. The key
3529 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3530 # starts at 1 and increases. Can we treat all parts as one video?
3532 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3533 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3534 _JUSTIN_PAGE_LIMIT = 100
3535 IE_NAME = u'justin.tv'
3537 def report_download_page(self, channel, offset):
3538 """Report attempt to download a single page of videos."""
3539 self.to_screen(u'%s: Downloading video information from %d to %d' %
3540 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3542 # Return count of items, list of *valid* items
3543 def _parse_page(self, url):
3545 urlh = compat_urllib_request.urlopen(url)
3546 webpage_bytes = urlh.read()
3547 webpage = webpage_bytes.decode('utf-8', 'ignore')
3548 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3549 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3552 response = json.loads(webpage)
3553 if type(response) != list:
3554 error_text = response.get('error', 'unknown error')
3555 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3558 for clip in response:
3559 video_url = clip['video_file_url']
3561 video_extension = os.path.splitext(video_url)[1][1:]
3562 video_date = re.sub('-', '', clip['start_time'][:10])
3563 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3564 video_id = clip['id']
3565 video_title = clip.get('title', video_id)
# Per-clip info-dict fragment (opener and id/url keys missing).
3569 'title': video_title,
3570 'uploader': clip.get('channel_name', video_uploader_id),
3571 'uploader_id': video_uploader_id,
3572 'upload_date': video_date,
3573 'ext': video_extension,
3575 return (len(response), info)
3577 def _real_extract(self, url):
3578 mobj = re.match(self._VALID_URL, url)
3580 self._downloader.report_error(u'invalid URL: %s' % url)
3583 api = 'http://api.justin.tv'
3584 video_id = mobj.group(mobj.lastindex)
3586 if mobj.lastindex == 1:
3588 api += '/channel/archives/%s.json'
3590 api += '/broadcast/by_archive/%s.json'
3591 api = api % (video_id,)
3593 self.report_extraction(video_id)
3597 limit = self._JUSTIN_PAGE_LIMIT
# Paging loop body (loop header and offset bookkeeping lines missing).
3600 self.report_download_page(video_id, offset)
3601 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3602 page_count, page_info = self._parse_page(page_url)
3603 info.extend(page_info)
3604 if not paged or page_count != limit:
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing ("if mobj is None:"/"if m is
# None:" guards and the "info = { ... return [info]" surroundings of the
# trailing 'description' fragment). Kept byte-identical.
# Purpose (from visible code): funnyordie.com extractor -- scrapes the
# second <source> URL from the <video> tag, the h1 player title (with a
# <title>-tag fallback), and the og:description meta content.
3609 class FunnyOrDieIE(InfoExtractor):
3610 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3612 def _real_extract(self, url):
3613 mobj = re.match(self._VALID_URL, url)
3615 self._downloader.report_error(u'invalid URL: %s' % url)
3618 video_id = mobj.group('id')
3619 webpage = self._download_webpage(url, video_id)
3621 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3623 self._downloader.report_error(u'unable to find video information')
3624 video_url = unescapeHTML(m.group('url'))
3626 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3628 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3630 self._downloader.trouble(u'Cannot find video title')
3631 title = clean_html(m.group('title'))
3633 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3635 desc = unescapeHTML(m.group('desc'))
3644 'description': desc,
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing (the (?P<gameID>...) part of
# _VALID_URL, the @classmethod decorator, the per-video "info = {" opener
# and the final return of the collected videos). Kept byte-identical.
# Purpose (from visible code): Steam store extractor -- overrides suitable()
# to match the verbose-mode _VALID_URL, then zips movie_<id> JS entries with
# <span class="title"> names and movie_thumb images into one info dict per
# video on the game's /video/ page.
3648 class SteamIE(InfoExtractor):
3649 _VALID_URL = r"""http://store.steampowered.com/
3650 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3652 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3656 def suitable(cls, url):
3657 """Receives a URL and returns True if suitable for this IE."""
3658 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3660 def _real_extract(self, url):
3661 m = re.match(self._VALID_URL, url, re.VERBOSE)
3662 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3663 gameID = m.group('gameID')
3664 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3665 webpage = self._download_webpage(videourl, gameID)
3666 mweb = re.finditer(urlRE, webpage)
3667 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3668 titles = re.finditer(namesRE, webpage)
3669 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3670 thumbs = re.finditer(thumbsRE, webpage)
3672 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3673 video_id = vid.group('videoID')
3674 title = vtitle.group('videoName')
3675 video_url = vid.group('videoURL')
3676 video_thumb = thumb.group('thumbnail')
3678 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
# Per-video info-dict fragment (opener and id/url/ext keys missing).
3683 'title': unescapeHTML(title),
3684 'thumbnail': video_thumb
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing (the "info = {" opener with
# id/url/title keys and the return). Kept byte-identical.
# Purpose (from visible code): ustream.tv recorded-video extractor -- builds
# the tcdn.ustream.tv video URL from the numeric id and scrapes data-title
# and the channel data-content-id as uploader.
3689 class UstreamIE(InfoExtractor):
3690 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3691 IE_NAME = u'ustream'
3693 def _real_extract(self, url):
3694 m = re.match(self._VALID_URL, url)
3695 video_id = m.group('videoID')
3696 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3697 webpage = self._download_webpage(url, video_id)
3698 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3699 title = m.group('title')
3700 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3701 uploader = m.group('uploader')
3707 'uploader': uploader
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing (the ext-selection branches
# after "if 'mp4' in video_url:", the thumbnail else-branch, and the
# "results = [{ ... }]"/return surroundings of the trailing fragment).
# Kept byte-identical; restore from pristine source before editing.
# Purpose (from visible code): WorldStarHipHop extractor -- regex-scrapes
# the hw-videos mp4/flv source URL, the <title> text (with a time.ctime()
# fallback title), and the rel="image_src" thumbnail; when no thumbnail is
# found it retries the title from the candytitles span (candy variant).
3711 class WorldStarHipHopIE(InfoExtractor):
3712 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3713 IE_NAME = u'WorldStarHipHop'
3715 def _real_extract(self, url):
3716 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3718 webpage_src = compat_urllib_request.urlopen(url).read()
3719 webpage_src = webpage_src.decode('utf-8')
3721 mobj = re.search(_src_url, webpage_src)
3723 m = re.match(self._VALID_URL, url)
3724 video_id = m.group('id')
3726 if mobj is not None:
3727 video_url = mobj.group()
3728 if 'mp4' in video_url:
3733 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3736 _title = r"""<title>(.*)</title>"""
3738 mobj = re.search(_title, webpage_src)
3740 if mobj is not None:
3741 title = mobj.group(1)
3743 title = 'World Start Hip Hop - %s' % time.ctime()
3745 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3746 mobj = re.search(_thumbnail, webpage_src)
3748 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3749 if mobj is not None:
3750 thumbnail = mobj.group(1)
3752 _title = r"""candytitles.*>(.*)</span>"""
3753 mobj = re.search(_title, webpage_src)
3754 if mobj is not None:
3755 title = mobj.group(1)
# Returned results fragment (opener and id/url/title/ext keys missing).
3762 'thumbnail' : thumbnail,
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing ("if m is None:", "try:",
# and the "return [{" opener with id/url/ext keys of the trailing dict
# fragment). Kept byte-identical.
# Purpose (from visible code): rbmaradio.com show extractor -- parses the
# embedded gon.show JSON blob, appends &cbr=256 to the akamai_url, and
# derives the extension from the URL path; fills optional metadata with
# dict.get so missing keys degrade to None.
3767 class RBMARadioIE(InfoExtractor):
3768 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3770 def _real_extract(self, url):
3771 m = re.match(self._VALID_URL, url)
3772 video_id = m.group('videoID')
3774 webpage = self._download_webpage(url, video_id)
3775 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3777 raise ExtractorError(u'Cannot find metadata')
3778 json_data = m.group(1)
3781 data = json.loads(json_data)
3782 except ValueError as e:
3783 raise ExtractorError(u'Invalid JSON: ' + str(e))
3785 video_url = data['akamai_url'] + '&cbr=256'
3786 url_parts = compat_urllib_parse_urlparse(video_url)
3787 video_ext = url_parts.path.rpartition('.')[2]
3792 'title': data['title'],
3793 'description': data.get('teaser_text'),
3794 'location': data.get('country_of_origin'),
3795 'uploader': data.get('host', {}).get('name'),
3796 'uploader_id': data.get('host', {}).get('slug'),
3797 'thumbnail': data.get('image', {}).get('large_url_2x'),
3798 'duration': data.get('duration'),
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing (_specific's "for x in
# formats:"/return lines, "if result is None:" guards, the per-link
# formats.append / size-bitrate unpacking, sorting, and the return
# statements of the format-selection branches). Kept byte-identical.
# Purpose (from visible code): youporn.com extractor -- sets the
# age_verified cookie, scrapes title/date/uploader, collects every link in
# the downloadList, derives size/bitrate format strings from the URL path
# segments, and honours --listformats / --format (best, worst, -1/all, or a
# specific format via _specific).
3803 class YouPornIE(InfoExtractor):
3804 """Information extractor for youporn.com."""
3805 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3807 def _print_formats(self, formats):
3808 """Print all available formats"""
3809 print(u'Available formats:')
3810 print(u'ext\t\tformat')
3811 print(u'---------------------------------')
3812 for format in formats:
3813 print(u'%s\t\t%s' % (format['ext'], format['format']))
3815 def _specific(self, req_format, formats):
3817 if(x["format"]==req_format):
3821 def _real_extract(self, url):
3822 mobj = re.match(self._VALID_URL, url)
3824 self._downloader.report_error(u'invalid URL: %s' % url)
3827 video_id = mobj.group('videoid')
3829 req = compat_urllib_request.Request(url)
3830 req.add_header('Cookie', 'age_verified=1')
3831 webpage = self._download_webpage(req, video_id)
3833 # Get the video title
3834 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3836 raise ExtractorError(u'Unable to extract video title')
3837 video_title = result.group('title').strip()
3839 # Get the video date
3840 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3842 self._downloader.report_warning(u'unable to extract video date')
3845 upload_date = result.group('date').strip()
3847 # Get the video uploader
3848 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3850 self._downloader.report_warning(u'unable to extract uploader')
3851 video_uploader = None
3853 video_uploader = result.group('uploader').strip()
3854 video_uploader = clean_html( video_uploader )
3856 # Get all of the formats available
3857 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3858 result = re.search(DOWNLOAD_LIST_RE, webpage)
3860 raise ExtractorError(u'Unable to extract download list')
3861 download_list_html = result.group('download_list').strip()
3863 # Get all of the links from the page
3864 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3865 links = re.findall(LINK_RE, download_list_html)
3866 if(len(links) == 0):
3867 raise ExtractorError(u'ERROR: no known formats available for video')
3869 self.to_screen(u'Links found: %d' % len(links))
3874 # A link looks like this:
3875 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3876 # A path looks like this:
3877 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3878 video_url = unescapeHTML( link )
3879 path = compat_urllib_parse_urlparse( video_url ).path
3880 extension = os.path.splitext( path )[1][1:]
3881 format = path.split('/')[4].split('_')[:2]
3884 format = "-".join( format )
3885 title = u'%s-%s-%s' % (video_title, size, bitrate)
# Per-link info-dict fragment (opener, id/url/ext keys and append missing).
3890 'uploader': video_uploader,
3891 'upload_date': upload_date,
3896 'description': None,
3900 if self._downloader.params.get('listformats', None):
3901 self._print_formats(formats)
3904 req_format = self._downloader.params.get('format', None)
3905 self.to_screen(u'Format: %s' % req_format)
3907 if req_format is None or req_format == 'best':
3909 elif req_format == 'worst':
3910 return [formats[-1]]
3911 elif req_format in ('-1', 'all'):
3914 format = self._specific( req_format, formats )
3916 self._downloader.report_error(u'requested format not available')
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing ("if mobj is None:"/"if
# result is None:" guards and the url/ext/format keys plus return of the
# trailing info dict). Kept byte-identical.
# Purpose (from visible code): pornotube.com extractor -- takes video id
# and title from the URL itself, scrapes the flv url from the player JS
# (percent-decoded), and scrapes the "Added <date> by" line as upload_date.
3922 class PornotubeIE(InfoExtractor):
3923 """Information extractor for pornotube.com."""
3924 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3926 def _real_extract(self, url):
3927 mobj = re.match(self._VALID_URL, url)
3929 self._downloader.report_error(u'invalid URL: %s' % url)
3932 video_id = mobj.group('videoid')
3933 video_title = mobj.group('title')
3935 # Get webpage content
3936 webpage = self._download_webpage(url, video_id)
3939 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3940 result = re.search(VIDEO_URL_RE, webpage)
3942 self._downloader.report_error(u'unable to extract video url')
3944 video_url = compat_urllib_parse.unquote(result.group('url'))
3946 #Get the uploaded date
3947 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3948 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error message says "title" but this regex extracts the
# upload date -- likely a copy-paste slip in the original; confirm against
# pristine source before fixing.
3950 self._downloader.report_error(u'unable to extract video title')
3952 upload_date = result.group('date')
3954 info = {'id': video_id,
3957 'upload_date': upload_date,
3958 'title': video_title,
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing ("if mobj is None:"/"if
# result is None:" guards, the 'url'/'ext'/format keys of the info dict,
# and the return). Kept byte-identical.
# Purpose (from visible code): youjizz.com extractor -- scrapes the page
# <title>, follows the /videos/embed/<id> page, and pulls the flash
# "file" variable as the video URL, returning the embed page as player_url.
3964 class YouJizzIE(InfoExtractor):
3965 """Information extractor for youjizz.com."""
3966 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3968 def _real_extract(self, url):
3969 mobj = re.match(self._VALID_URL, url)
3971 self._downloader.report_error(u'invalid URL: %s' % url)
3974 video_id = mobj.group('videoid')
3976 # Get webpage content
3977 webpage = self._download_webpage(url, video_id)
3979 # Get the video title
3980 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3982 raise ExtractorError(u'ERROR: unable to extract video title')
3983 video_title = result.group('title').strip()
3985 # Get the embed page
3986 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3988 raise ExtractorError(u'ERROR: unable to extract embed page')
3990 embed_page_url = result.group(0).strip()
3991 video_id = result.group('videoid')
3993 webpage = self._download_webpage(embed_page_url, video_id)
3996 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3998 raise ExtractorError(u'ERROR: unable to extract video url')
3999 video_url = result.group('source')
4001 info = {'id': video_id,
4003 'title': video_title,
4006 'player_url': embed_page_url}
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing (mix_id assignment, the
# "res = []" accumulator, the per-track "info = {" opener with ext/url
# keys, the append, and "return res"). Kept byte-identical.
# Purpose (from visible code): 8tracks.com mix extractor -- parses the
# PAGE.mix JSON blob, opens a play session with a random session id, then
# iterates the /sets/.../play and /next endpoints collecting one track per
# response until at_last_track is true.
4010 class EightTracksIE(InfoExtractor):
4012 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4014 def _real_extract(self, url):
4015 mobj = re.match(self._VALID_URL, url)
4017 raise ExtractorError(u'Invalid URL: %s' % url)
4018 playlist_id = mobj.group('id')
4020 webpage = self._download_webpage(url, playlist_id)
4022 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4024 raise ExtractorError(u'Cannot find trax information')
4025 json_like = m.group(1)
4026 data = json.loads(json_like)
4028 session = str(random.randint(0, 1000000000))
4030 track_count = data['tracks_count']
4031 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4032 next_url = first_url
4034 for i in itertools.count():
4035 api_json = self._download_webpage(next_url, playlist_id,
4036 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4037 errnote=u'Failed to download song information')
4038 api_data = json.loads(api_json)
4039 track_data = api_data[u'set']['track']
4041 'id': track_data['id'],
4042 'url': track_data['track_file_stream_url'],
4043 'title': track_data['performer'] + u' - ' + track_data['name'],
4044 'raw_title': track_data['name'],
4045 'uploader_id': data['user']['login'],
4049 if api_data['set']['at_last_track']:
4051 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing (IE_NAME, the "info = {"
# opener with id/url/ext/title keys, and the return). Kept byte-identical.
# Purpose (from visible code): keek.com extractor -- builds video and
# thumbnail CDN URLs directly from the id, then scrapes og:title and the
# user-name-and-bio h2 as uploader.
4054 class KeekIE(InfoExtractor):
4055 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4058 def _real_extract(self, url):
4059 m = re.match(self._VALID_URL, url)
4060 video_id = m.group('videoID')
4061 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4062 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4063 webpage = self._download_webpage(url, video_id)
4064 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4065 title = unescapeHTML(m.group('title'))
4066 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4067 uploader = clean_html(m.group('uploader'))
4073 'thumbnail': thumbnail,
4074 'uploader': uploader
# NOTE(review): corrupted extraction -- line numbers fused onto each line,
# indentation stripped, interior lines missing (alternation/closing parts
# of the verbose _VALID_URL, the "else:" of _real_extract, video_RE's
# opening r''' line, and the info-dict opener/return of _talk_info).
# Kept byte-identical; restore from pristine source before editing.
# Purpose (from visible code): ted.com extractor handling both single talks
# and playlists. suitable() re-matches with re.VERBOSE; playlists are
# expanded into url_result entries per talk; a talk page yields the
# download.ted.com mp4 built from the scraped mediaSlug, plus title and
# thumbnail scraped from the page.
4078 class TEDIE(InfoExtractor):
4079 _VALID_URL=r'''http://www.ted.com/
4081 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4083 ((?P<type_talk>talks)) # We have a simple talk
4085 /(?P<name>\w+) # Here goes the name and then ".html"
4089 def suitable(cls, url):
4090 """Receives a URL and returns True if suitable for this IE."""
4091 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4093 def _real_extract(self, url):
4094 m=re.match(self._VALID_URL, url, re.VERBOSE)
4095 if m.group('type_talk'):
4096 return [self._talk_info(url)]
4098 playlist_id=m.group('playlist_id')
4099 name=m.group('name')
4100 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4101 return [self._playlist_videos_info(url,name,playlist_id)]
4103 def _talk_video_link(self,mediaSlug):
4104 '''Returns the video link for that mediaSlug'''
4105 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4107 def _playlist_videos_info(self,url,name,playlist_id=0):
4108 '''Returns the videos of the playlist'''
4110 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4111 ([.\s]*?)data-playlist_item_id="(\d+)"
4112 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4114 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4115 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4116 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4117 m_names=re.finditer(video_name_RE,webpage)
4119 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4120 m_playlist = re.search(playlist_RE, webpage)
4121 playlist_title = m_playlist.group('playlist_title')
4123 playlist_entries = []
4124 for m_video, m_name in zip(m_videos,m_names):
4125 video_id=m_video.group('video_id')
4126 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4127 playlist_entries.append(self.url_result(talk_url, 'TED'))
4128 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4130 def _talk_info(self, url, video_id=0):
4131 """Return the video for the talk in the url"""
4132 m=re.match(self._VALID_URL, url,re.VERBOSE)
4133 videoName=m.group('name')
4134 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4135 # If the url includes the language we get the title translated
4136 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4137 title=re.search(title_RE, webpage).group('title')
4138 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4139 "id":(?P<videoID>[\d]+).*?
4140 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4141 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4142 thumb_match=re.search(thumb_RE,webpage)
4143 info_match=re.search(info_RE,webpage,re.VERBOSE)
4144 video_id=info_match.group('videoID')
4145 mediaSlug=info_match.group('mediaSlug')
4146 video_url=self._talk_video_link(mediaSlug)
# Returned info-dict fragment (opener and id/url/ext/title keys missing).
4152 'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    Resolves the video id from the URL path, then reads all metadata from
    the site's XML metadata endpoint.

    NOTE(review): several guard bodies and the final info dictionary are
    elided in this view (e.g. the body of `if format_id_el is None:`).
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        # XML endpoint that returns all metadata for a given video id.
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # Fallback for trailing-slash URLs (an elided guard likely wraps this).
        _, video_id = os.path.split(url_parent_path)

        # Fetch and parse the XML metadata document.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
        video_url = url_flv_el.text
        # File extension is derived from the download URL's suffix.
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
        # NOTE(review): the fallback body of the guard above is elided.
        format = format_id_el.text
        # Description and thumbnail are optional in the metadata document.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # NOTE(review): the surrounding info-dict lines are elided here.
        'thumbnail': thumbnail,
        'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos.

    Title is scraped from the HTML page; the actual stream filename and
    duration come from a per-video XML document on video2.spiegel.de.

    NOTE(review): a guard (`if not m:`) and the opening of the returned
    info dictionary are elided in this view.
    """
    # Video pages end in "-<numeric id>.html"; the id is the capture group.
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Title lives in a dedicated div on the video page.
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        # This raise is guarded by an elided `if not m:` in the original.
        raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # Per-video XML document describing the available flash streams.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> element holds the best/default variant.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # NOTE(review): the surrounding info-dict lines ('id', 'url', 'ext'
        # entries and the return) are elided in this view.
        'title': video_title,
        'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages.

    The direct video URL is embedded in the page's JS player config; the
    title/description come from Open Graph metadata.

    NOTE(review): the error-guard `if` lines and the opening of the
    returned info dictionary are elided in this view — each trouble/
    report_error call below sits under such an elided guard.
    """
    # The video id is the `i=` query parameter of /view URLs.
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guarded by an elided `if mobj is None:` in the original.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Direct media URL from the embedded player configuration.
        m = re.search(r'file: "(.*?)",', webpage)
        self._downloader.report_error(u'unable to find video url')
        video_url = m.group(1)

        # Title from OG metadata, with the site prefix stripped.
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description is optional OG metadata.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        # Uploader name appears after a "By:" label in the page markup.
        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1))
        # NOTE(review): the surrounding `return [{ ... }]` lines are elided.
        'description': desc,
        'uploader': uploader
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek / Das Erste Mediathek.

    Parses the mediaCollection JS calls on the page to enumerate the
    available streams, then picks the default media type at the highest
    quality. The result is either an RTMP stream or a plain HTTP mp4.

    NOTE(review): several control-flow lines are elided in this view
    (the `if numid:`/`else:` pair, the `if not streams:` guard, the
    `else:` of the RTMP/HTTP branch, and the final `return [info]`).
    """
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline; used as the video title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # One mediaCollection.addMediaStream(...) call per available stream.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        # Prefer the numeric documentId query parameter when present
        # (an elided `if numid:`/`else:` chooses between these two).
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1)
        video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # Guarded by an elided `if not streams:` — pages with no streams
        # carry an "fsk" (age-restriction) marker instead.
        assert '"fsk"' in html
        self._downloader.report_error(u'this video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        # Plain HTTP branch (under an elided `else:` in the original).
        assert stream["video_url"].endswith('.mp4')
        info["url"] = stream["video_url"]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    # NOTE(review): the docstring terminator, the `return [` line, and the
    # vast majority of the extractor instances in this list are elided in
    # this view; only three representative entries are visible.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    The convention in this module is that every extractor class is named
    ``<ShortName>IE`` (e.g. ``'Youtube'`` -> ``YoutubeIE``); this resolves
    that class from the module namespace. Raises KeyError when no class
    with that name exists.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]