2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    # NOTE(review): this numbered listing is missing many interleaved source
    # lines (decorators, guards, try:/else: headers, return statements).
    # Statements below that look orphaned belong to such elided constructs;
    # confirm against the full file before editing.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): a `self._ready = False` initialization appears to be
        # elided here -- confirm against the full file.
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): the @classmethod decorator line is elided from this
        # view but implied by the `cls` parameter.
        return re.match(cls._VALID_URL, url) is not None

    # NOTE(review): the enclosing `def working(self):` line and its return
    # are elided from this view; only the docstring is visible.
        """Getter method for _WORKING."""

    # NOTE(review): the enclosing `def initialize(self):` line and its
    # `_ready` guard are elided from this view.
        """Initializes an instance (authentication, etc)."""
            self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): an `self.initialize()` call appears to be elided here.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the `IE_NAME` property definition enclosing the next
    # line is elided from this view.  Strips the trailing "IE" from the
    # class name (e.g. YoutubeIE -> Youtube).
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): `if note is None:` guard elided here.
            note = u'Downloading video webpage'
        # note=False lets callers suppress the status line entirely.
        if note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        # NOTE(review): `try:` elided here.
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): `if errnote is None:` guard elided here.
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): the `if m:` header and the `else:` default-encoding
        # fallback are elided here.
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # NOTE(review): `try:` elided here.
                url = url_or_request.get_full_url()
            except AttributeError:
            # NOTE(review): the plain-string fallback assignment is elided.
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps the dump printable even for binary responses.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): `return video_info` elided from this view.

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): the remaining dict items and the return statement
        # are elided from this view.

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): the 'entries' item, the closing brace, the
        # `if playlist_id is not None:` / `if playlist_title is not None:`
        # guards around the next two assignments, and the final return are
        # all elided from this view.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): this numbered listing is missing many interleaved source
    # lines (guards, try:/else: headers, returns, dict entries).  Statements
    # below that look orphaned belong to such elided constructs; confirm
    # against the full file before editing.

    # NOTE(review): the `_VALID_URL = r"""^(` wrapper, and the line matching
    # the `v=`/`&v=` parameter name inside the "v= param" group, are elided
    # from this view.  The lines below are runtime regex text (re.VERBOSE)
    # -- do not edit the pattern itself.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container-extension map.  NOTE(review): all entries except
    # '38' are elided from this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): the closing brace of _video_extensions and the entire
    # body of the itag -> "WxH" _video_dimensions map are elided.
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): the @classmethod decorator line is elided from this
        # view but implied by the `cls` parameter.
        # Playlist URLs are handled by YoutubePlaylistIE instead.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict {lang_code: lang_name} of available subtitles.

        On failure returns an (error_message, None) tuple instead; callers
        distinguish the two cases with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): `try:` elided here.
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): `return sub_lang_list` elided from this view.

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Download a single subtitle track.

        Returns a tuple:
        (error_message, sub_lang, sub)
        where error_message is None on success.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        # NOTE(review): the query-parameter entries (lang, name, v, fmt) and
        # closing brace are elided from this view.
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): `try:` elided here.
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # NOTE(review): `if not sub:` guard elided here.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """Download the single requested (or default) subtitle track.

        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
        # NOTE(review): `sub_lang = 'en'` and the following `else:` header
        # are elided here; the next line is the no-English fallback.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # NOTE(review): `return [subtitle]` elided from this view.

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): `subtitles = []` initialization elided here.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # NOTE(review): `return subtitles` elided from this view.

    def _print_formats(self, formats):
        """Pretty-print each itag with its extension and dimensions."""
        print('Available formats:')
        # NOTE(review): `for x in formats:` header elided here.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:
        # NOTE(review): `return` elided here.

        # NOTE(review): username/password initialization lines elided here.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): `try:` elided here.
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # NOTE(review): `if info is not None:` success branch
                # (username/password unpacking) and `else:` elided here.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                # NOTE(review): `return` elided here.

        # Set language (best effort -- failure only warns)
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): `try:` and `self.report_lang()` elided here.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            # NOTE(review): `return` elided here.

        # No authentication to be performed
        # NOTE(review): `if username is None: return` elided here.

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # NOTE(review): `try:` elided here.
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            # NOTE(review): `return` elided here.

        # Scrape the anti-forgery tokens the login form requires.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): `if match:` header elided here.
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the `dsh = match.group(1)` branch and the opening of
        # the `login_form_strs = {` dict (with several entries) are elided.
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # NOTE(review): remaining form entries and closing brace elided.
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # NOTE(review): second half of this comment elided here.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # NOTE(review): `try:` and `self.report_login()` elided here.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                # NOTE(review): `return` elided here.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # NOTE(review): `return` elided here.

        # Confirm age.  NOTE(review): the `age_form = {` opening and its
        # `next_url` entry are elided here.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): `try:` elided here.
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            # NOTE(review): `return` elided here.

    def _extract_id(self, url):
        """Return the 11-character video id matched by _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'invalid URL: %s' % url)
            # NOTE(review): `return` elided here.
        video_id = mobj.group(2)
        # NOTE(review): `return video_id` elided from this view.

    def _real_extract(self, url):
        """Extract metadata and format URLs for a YouTube video."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): `if mobj:` header elided here.
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` elided here.
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            # NOTE(review): `return` elided here.

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): `if mobj is not None:` header elided here; the
        # re.sub un-escapes the JSON backslash escapes in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        # NOTE(review): `else: player_url = None` elided here.

        # Get video info: try several `el` variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            # NOTE(review): `note=False` argument line elided here.
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): `break` elided here.
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): `else:` elided here.
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            # NOTE(review): `return` elided here.

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            # NOTE(review): `return` elided here.

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            # NOTE(review): `return` elided here.
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # NOTE(review): `if mobj is not None:` header elided here.
            video_uploader_id = mobj.group(1)
        # NOTE(review): `else:` elided here.
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            # NOTE(review): `return` elided here.
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            # NOTE(review): `video_thumbnail = ''` elided here.
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape the human-readable date and normalize to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # NOTE(review): `upload_date = None` and `if mobj is not None:`
        # elided here.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # NOTE(review): `try:` elided here; a following
                # `except ValueError: pass` is also elided.
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): `else:` elided here.
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # NOTE(review): `if video_subtitles:` header elided here.
                (sub_error, sub_lang, sub) = video_subtitles[0]
                # NOTE(review): `if sub_error:` header elided here.
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                # NOTE(review): `if sub_error:` header elided here.
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            # NOTE(review): `return` elided here (list-only mode stops).

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            # NOTE(review): empty-string fallback and `else:` elided here.
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Each stream URL needs its 'sig' appended as &signature=.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): `else:` elided here.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                # NOTE(review): `return` elided here.
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # NOTE(review): `return` elided here.
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): `else:` elided here.
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): `if rf in url_map:` header elided here,
                    # as is the `break` after the assignment.
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    # NOTE(review): `return` elided here.
        # NOTE(review): `else:` elided here.
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            # NOTE(review): `return` and `results = []` elided here.

        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): `results.append({` and the 'id' entry are elided
            # here, as are the closing `})` and `return results` at the end.
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""
    # NOTE(review): this numbered listing is missing interleaved source lines
    # (guards, try:/else: headers, returns); statements below that look
    # orphaned belong to such elided constructs.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): `try:` elided here.
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            # NOTE(review): `return` elided here.

        # Confirm age.  NOTE(review): the `disclaimer_form = {` opening and
        # its 'filters' entry are elided here.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): `try:` elided here.
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            # NOTE(review): `return` elided here.

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'invalid URL: %s' % url)
            # NOTE(review): `return` elided here.

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted Metacafe videos to the YouTube IE.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): `if mobj is not None:` header elided here (direct
        # mediaURL case).
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # NOTE(review): the `if mobj is None:` no-key branch and `else:`
            # header are elided here.
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # NOTE(review): `else:` (flashvars fallback) header elided here.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # NOTE(review): `if mobj is None:` guard elided here.
                self._downloader.report_error(u'unable to extract media URL')
                # NOTE(review): `return` elided here.
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                # NOTE(review): `return` elided here.
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            # NOTE(review): `if mobj is None:` guard elided here.
                self._downloader.report_error(u'unable to extract media URL')
                # NOTE(review): `return` elided here.
            # Un-escape the JSON-style "\/" in the URL.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract title')
            # NOTE(review): `return` elided here.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract uploader nickname')
            # NOTE(review): `return` elided here.
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opening, the closing `}]` and a
        # 'upload_date' entry are elided here.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""
    # NOTE(review): this numbered listing is missing interleaved source lines
    # (guards, else: headers, returns); statements below that look orphaned
    # belong to such elided constructs.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        """Extract the best-quality flashvars URL, title and uploader."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'invalid URL: %s' % url)
            # NOTE(review): `return` elided here.

        # Strip the trailing title slug and query string from the captured id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # The cookie disables the family filter so restricted videos load.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract media URL')
            # NOTE(review): `return` elided here.
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; the first key present wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): `if key in flashvars:` header and the
            # `max_quality = key` assignment are elided here, as are the
            # `break` and the no-match `else:` error branch below.
                self.to_screen(u'Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')
            # NOTE(review): `return` elided here.

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract video URL')
            # NOTE(review): `return` elided here.

        # Un-escape the JSON-style "\/" in the URL.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract title')
            # NOTE(review): `return` elided here.
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # NOTE(review): `if mobj is None:` header elided here.
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            # NOTE(review): `else:` elided here.
                video_uploader = mobj_official.group(1)
        # NOTE(review): `else:` elided here.
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # NOTE(review): `if mobj is not None:` header elided here.
            # Page shows DD-MM-YYYY; store as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opening with its 'id' and 'url'
        # entries, and the closing `}]`, are elided here.
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""
    # NOTE(review): this numbered listing is missing interleaved source lines
    # (guards, try: headers, returns); statements below that look orphaned
    # belong to such elided constructs.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'Invalid URL: %s' % url)
            # NOTE(review): `return` elided here.

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` elided here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            # NOTE(review): `return` elided here.

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract media URL')
            # NOTE(review): `return` elided here.
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # NOTE(review): the `video_url = mediaURL` assignment is elided here.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract title')
            # NOTE(review): `return` elided here.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opening, an 'upload_date' entry and
        # the closing `}]` are elided here.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""
    # NOTE(review): this numbered listing is missing interleaved source lines
    # (guards, try: headers, returns), and the body of _real_extract
    # continues beyond the end of this view.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata for a Yahoo! video.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted with new_video=False to avoid infinite recursion.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'Invalid URL: %s' % url)
            # NOTE(review): `return` elided here.

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): `try:` elided here.
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                # NOTE(review): `return` elided here.

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # NOTE(review): `if mobj is None:` guard elided here.
                self._downloader.report_error(u'Unable to extract id field')
                # NOTE(review): `return` elided here.
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # NOTE(review): `if mobj is None:` guard elided here.
                self._downloader.report_error(u'Unable to extract vid field')
                # NOTE(review): `return` elided here.
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` elided here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            # NOTE(review): `return` elided here.

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract video title')
            # NOTE(review): `return` elided here.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract video uploader')
            # NOTE(review): `return` elided here.
        # NOTE(review): group(1) captures the 'people|profile' path segment,
        # while the display name is group(2) -- possibly a latent bug;
        # confirm against the full file / upstream history.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract video thumbnail')
            # NOTE(review): `return` elided here.
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract video description')
            # NOTE(review): `return` elided here.
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract video height')
            # NOTE(review): `return` elided here.
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # NOTE(review): `if mobj is None:` guard elided here.
            self._downloader.report_error(u'unable to extract video width')
            # NOTE(review): `return` elided here.
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): this method continues beyond the visible portion of
        # the file; the remainder is unchanged.
1049 self.report_download_webpage(video_id)
1050 webpage = compat_urllib_request.urlopen(request).read()
1051 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1052 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1055 # Extract media URL from playlist XML
1056 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1058 self._downloader.report_error(u'Unable to extract media URL')
1060 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1061 video_url = unescapeHTML(video_url)
1064 'id': video_id.decode('utf-8'),
1066 'uploader': video_uploader,
1067 'upload_date': None,
1068 'title': video_title,
1069 'ext': video_extension.decode('utf-8'),
1070 'thumbnail': video_thumbnail.decode('utf-8'),
1071 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from the embedded config JSON of a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize HLS redirect links to the plain video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (ValueError, IndexError, KeyError):
            # split() misses (IndexError) or the JSON is malformed (ValueError)
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title':       video_title,
            'ext':         video_extension,
            'thumbnail':   video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, reporting errors."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and collect the groups listed in
        *matchTuples* (index, key, error-message) into a dict."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the live-stream player URL for a live index page."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of ref documents for a +7 page and return an
        info dict for the HD stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':          info.get('id'),
            'url':         compat_urllib_parse.unquote(info.get('url')),
            'uploader':    u'arte.tv',
            'upload_date': info.get('date'),
            'title':       info.get('title').decode('utf-8'),
            'ext':         u'mp4',
            'format':      u'NA',
            'player_url':  None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build our opener with only the handlers we need
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fix: this branch previously reported 'unable to extract title'
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split only on the first ':' so search terms may contain colons
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # Fix: the result list was previously dropped on this branch
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError:  # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never request past it
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split only on the first ':' so search terms may contain colons
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No more result pages; download what we found
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split only on the first ':' so search terms may contain colons
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No more result pages; download what we found
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace/comments, so re.VERBOSE is required
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # Fix: initialize so an empty first page cannot leave this unbound
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            videos += [(entry['yt$position']['$t'], entry['content']['src'])
                       for entry in response['feed']['entry']
                       if 'content' in entry]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the watch-page video ids found in *page*, first-seen order,
        without duplicates."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # NOTE(review): if the page carries no data-users-id attribute this
            # raises AttributeError on mobj.group — not caught below; confirm
            # whether that can happen for valid user pages.
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fix: use compat_str like every other handler in this file
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUG FIX (py3): decode the response to text once, here. The
            # page was previously kept as bytes and later fed to str
            # regexes and str.decode() calls, which raise on Python 3
            # (bytes cannot be searched with a str pattern, and str has
            # no .decode method).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string so \s is a regex class rather than a
                # (deprecated) string escape.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        # All values are already text after the single decode above; the
        # old per-field .decode('utf-8') calls were removed (py3 fix).
        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook video pages.

    Performs an optional login in _real_initialize (credentials from
    --username/--password or a 'facebook' entry in ~/.netrc), then parses
    the swf parameters embedded in the video page.
    """

    # NOTE(review): several control-flow lines (try:/else:/return and the
    # login_form construction) are not visible in this excerpt; the gaps
    # are marked with comments below.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine key looked up in the user's ~/.netrc file.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Without a downloader there is nowhere to get credentials from.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (try: elided)
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # (success branch elided: useremail/password taken from info)

                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # No credentials found: proceed anonymously. (return elided)
        if useremail is None:

        # (login_form construction elided)
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # (try: elided)
        login_results = compat_urllib_request.urlopen(request).read()
        # A login form in the response means the credentials were rejected.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')

        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (if mobj is None: elided)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf parameters are embedded as a JS array between these
        # two literal markers in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # (if m is None: elided)
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream and fall back to SD.
        video_url = video_data.get('hd_src')
        # (if not video_url: elided)
            video_url = video_data['sd_src']
        # (if not video_url: elided)
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # (if m is None: elided)
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # (info dict construction and return elided)
            'title': video_title,
            'duration': video_duration,
            'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv single videos.

    Handles three cases: /play/ URLs (resolved via redirect to the
    canonical URL and re-extracted), direct downloads (Content-Type
    video/*), and regular pages described by the site's JSON API.
    """

    # NOTE(review): several control-flow lines (try:/if/return and dict
    # literals) are not visible in this excerpt; gaps are marked below.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (if mobj is None: elided)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose URL fragment carries the
        # real file id; build the canonical URL and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Request the JSON description of the video. (cchar selection
        # elided here - presumably '?' or '&' depending on the URL.)
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # (info = None and try: elided)
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            # NOTE(review): str.decode is py2-only; breaks on Python 3.
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # (info dict construction elided)
                'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # (try: elided)
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

            # (try: elided)
            json_data = json.loads(json_code)
            # The API wraps the payload in a 'Post' key on some responses.
            if 'Post' in json_data:
                data = json_data['Post']
            # (else branch elided)

            # datestamp format example: '11-05-12 03:15PM' -> '20121105'
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # (if umobj is None: elided)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # (info dict construction elided)
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl'],
                'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this called self._download.report_error, but no
            # '_download' attribute exists - invalid URLs raised
            # AttributeError instead of printing the intended message.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media-server base URL; the .flv
        # file lives under the same movie directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # NOTE(review): this excerpt is missing many lines (dict bodies,
    # try:/if:/else:/return statements); the gaps are marked with
    # comments below.

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, lowest to highest; turls is ordered the same way.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        # (mapping body elided in this excerpt)
    _video_dimensions = {
        # (mapping body elided in this excerpt)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        # (for x in formats: elided)
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (if mobj is None: elided)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # Shortcut forms (:tds, :colbert, ...) expand to the show's
        # full-episodes page and are re-matched below.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # (else: elided)
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # (else: elided)
                epTitle = mobj.group('cntitle')
        # (else branch elided)
            # No specific episode given: follow the redirect to the newest.
            dlNewest = not mobj.group('episode')
            # (if dlNewest: elided)
                epTitle = mobj.group('showname')
            # (else: elided)
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        # (try: elided)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # (if dlNewest: elided) re-match against the redirected URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (if mobj is None: elided)
            self._downloader.report_error(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
            # (else: elided)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # (try: elided)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))

        # (results = [] elided)

        # One <item> per episode part; each part becomes one info dict.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            # (try: elided)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (turls = [] elided)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # (turls.append(finfo) elided)

            # (if len(turls) == 0: elided)
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # (return elided)

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # (for f,v in turls: / if f == req_format: elided)
                format, rtmp_video_url = f, v

            # The RTMP URL maps onto a plain HTTP mirror; extract the
            # path after the comedystor prefix and re-root it.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # (if m is None: elided)
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # (info dict construction elided)
                'upload_date': officialDate,
                'description': officialTitle,

            results.append(info)

        # (return results elided)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # NOTE(review): try:/if/return lines are missing from this excerpt;
    # gaps are marked with comments below.
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (if mobj is None: elided)
            self._downloader.report_error(u'invalid URL: %s' % url)

        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # (try: elided)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Honour the charset declared in the Content-Type header,
        # defaulting to UTF-8 when none is given.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))

        # Metadata comes from OpenGraph/meta tags; the player URL carries
        # the config location in its query string.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # (try: elided)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # (try: elided)
        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # NOTE(review): index 1 skips playlist entry 0 - presumably an
        # intro clip; confirm against the site's config format.
        videoUrl = playlist[1]['url']

        # (info dict construction and return elided)
            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): try:/if/return lines and the info-dict literals are
    # missing from this excerpt; gaps are marked with comments below.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (if mobj is None: elided)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # (info dict initialisation elided)
            'upload_date': None,

        # First fetch the metadata XML for title/description/thumbnail
        # and the manifest location.
        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # (try: elided)
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # (try: elided)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # (except IndexError: elided)
            self._downloader.report_error(u'Invalid metadata XML file')

        # NOTE(review): hdcore query parameter - presumably required by
        # the Adobe HDS (f4m) server; confirm.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # (try: elided)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        # The f4m manifest (Adobe namespace) yields the media node id and
        # the canonical video id used to build the fragment URL.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # (try: elided)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        # (info['url']/info['ext'] assignment and return elided)
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # NOTE(review): the if-None guards and return statement are missing
    # from this excerpt; gaps are marked with comments below.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (if mobj is None: elided)
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL: percent-encoded in the page's flv_url variable.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        # (if mobj is None: elided)
            self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title: page <title> minus the trailing site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        # (if mobj is None: elided)
            self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        # (if mobj is None: elided)
            self._downloader.report_error(u'unable to extract video thumbnail')
        # group(0): the whole matched URL is the thumbnail address.
        video_thumbnail = mobj.group(0)

        # (info dict construction and return elided)
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # NOTE(review): try:/if/return lines and the return-dict opening are
    # missing from this excerpt; gaps are marked with comments below.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # Identical to the inherited constructor; kept as-is.
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (if mobj is None: elided)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint maps the public page URL to track JSON.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        # (try: elided)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # A second request resolves the track id to actual stream URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        # (try: elided)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # (return dict opening elided)
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # TODO(review): this duplicates SoundcloudIE.IE_NAME; consider
    # renaming to u'soundcloud:set' so the two IEs are distinguishable.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # CONSISTENCY FIX: this class used the deprecated
            # self._downloader.trouble(u'ERROR: ...') form; converted to
            # report_error to match SoundcloudIE and the rest of the file.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # The resolve endpoint maps the public set URL to a JSON document
        # containing every track of the set.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        # Resolve each track of the set to its stream URL.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    # NOTE(review): the if-None guards and the return statement are
    # missing from this excerpt; gaps are marked with comments below.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (if mobj is None: elided)
            self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the rtmp path is base64-encoded in the
        # page's jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        # (if mobj is None: elided)
            self._downloader.report_error(u'unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        # (if mobj is None: elided)
            self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # The id/extension come from the last path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # (info dict construction and return elided)
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # NOTE(review): try:/return/break lines are missing from this
    # excerpt; gaps are marked with comments below. The IE is disabled
    # (_WORKING = False), and the .decode() calls on str below are
    # py2-only.
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # (try: elided)
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # (return url_list elided)

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # (try: elided) probe each URL; first reachable one wins.
            compat_urllib_request.urlopen(url)
            # (return url elided)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # (continue / fall-through return elided)

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # (try: elided)
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (if mobj is None: elided)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # (try: elided)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # (return elided)

        # 'best' mode: take the first format whose URL list contains a
        # live URL; otherwise honour the explicitly requested format.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # (break elided)
        # (else: elided)
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # (return [{...}] opening elided)
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
3010 class StanfordOpenClassroomIE(InfoExtractor):
3011 """Information extractor for Stanford's Open ClassRoom"""
3013 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3014 IE_NAME = u'stanfordoc'
3016 def report_download_webpage(self, objid):
3017 """Report information extraction."""
3018 self.to_screen(u'%s: Downloading webpage' % objid)
# Three cases, distinguished by the URL's query parameters:
#   1) course + video  -> a single video (metadata fetched from a per-video XML)
#   2) course only     -> a course page: recurse into every linked VideoPage
#   3) neither         -> the site root: recurse into every linked CoursePage
3020 def _real_extract(self, url):
3021 mobj = re.match(self._VALID_URL, url)
3023 raise ExtractorError(u'Invalid URL: %s' % url)
3025 if mobj.group('course') and mobj.group('video'): # A specific video
3026 course = mobj.group('course')
3027 video = mobj.group('video')
3029 'id': course + '_' + video,
3031 'upload_date': None,
3034 self.report_extraction(info['id'])
3035 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3036 xmlUrl = baseUrl + video + '.xml'
3038 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3039 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3040 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# The per-video XML carries <title> and <videoFile> (relative to baseUrl).
3042 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3044 info['title'] = mdoc.findall('./title')[0].text
3045 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3047 self._downloader.report_error(u'Invalid metadata XML file')
3049 info['ext'] = info['url'].rpartition('.')[2]
3051 elif mobj.group('course'): # A course page
3052 course = mobj.group('course')
3057 'upload_date': None,
3060 coursepage = self._download_webpage(url, info['id'],
3061 note='Downloading course info page',
3062 errnote='Unable to download course info page')
# Course title falls back to the course id when no <h1> is found.
3064 m = re.search('<h1>([^<]+)</h1>', coursepage)
3066 info['title'] = unescapeHTML(m.group(1))
3068 info['title'] = info['id']
3070 m = re.search('<description>([^<]+)</description>', coursepage)
3072 info['description'] = unescapeHTML(m.group(1))
# Collect the course's video page links, deduplicated in order.
3074 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3077 'type': 'reference',
3078 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse: extract each referenced video page and aggregate the results.
3082 for entry in info['list']:
3083 assert entry['type'] == 'reference'
3084 results += self.extract(entry['url'])
3088 'id': 'Stanford OpenClassroom',
3091 'upload_date': None,
3094 self.report_download_webpage(info['id'])
3095 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3097 rootpage = compat_urllib_request.urlopen(rootURL).read()
3098 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3099 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3102 info['title'] = info['id']
# Site root: collect all course page links and recurse into each.
3104 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3107 'type': 'reference',
3108 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3113 for entry in info['list']:
3114 assert entry['type'] == 'reference'
3115 results += self.extract(entry['url'])
3118 class MTVIE(InfoExtractor):
3119 """Information extractor for MTV.com"""
3121 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# Flow: scrape song/performer/uri/content-id meta tags from the video page,
# build a mediaGen request, then pick a rendition from the returned XML.
3124 def _real_extract(self, url):
3125 mobj = re.match(self._VALID_URL, url)
3127 self._downloader.report_error(u'invalid URL: %s' % url)
# The protocol part is optional in _VALID_URL; default to http.
3129 if not mobj.group('proto'):
3130 url = 'http://' + url
3131 video_id = mobj.group('videoid')
3133 webpage = self._download_webpage(url, video_id)
3135 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3137 self._downloader.report_error(u'unable to extract song name')
# Meta tag content is latin-1 encoded on this site (hence iso-8859-1).
3139 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3140 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3142 self._downloader.report_error(u'unable to extract performer')
3144 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3145 video_title = performer + ' - ' + song_name
3147 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3149 self._downloader.report_error(u'unable to mtvn_uri')
3151 mtvn_uri = mobj.group(1)
3153 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3155 self._downloader.report_error(u'unable to extract content id')
3157 content_id = mobj.group(1)
# mediaGen endpoint returns an XML playlist of renditions for this video.
3159 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3160 self.report_extraction(video_id)
3161 request = compat_urllib_request.Request(videogen_url)
3163 metadataXml = compat_urllib_request.urlopen(request).read()
3164 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3165 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3168 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3169 renditions = mdoc.findall('.//rendition')
3171 # For now, always pick the highest quality.
3172 rendition = renditions[-1]
# Format label: "<ext>-<width>x<height>_<bitrate>" derived from rendition attrs.
3175 _,_,ext = rendition.attrib['type'].partition('/')
3176 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3177 video_url = rendition.find('./src').text
3179 self._downloader.trouble('Invalid rendition field.')
3185 'uploader': performer,
3186 'upload_date': None,
3187 'title': video_title,
3195 class YoukuIE(InfoExtractor):
3196 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3198 def report_download_webpage(self, file_id):
3199 """Report webpage download."""
3200 self.to_screen(u'%s: Downloading webpage' % file_id)
# Session id: millisecond timestamp followed by two random components,
# as expected by Youku's getFlvPath endpoint.
3203 nowTime = int(time.time() * 1000)
3204 random1 = random.randint(1000,1998)
3205 random2 = random.randint(1000,9999)
3207 return "%d%d%d" %(nowTime,random1,random2)
# Youku obfuscation: derive a character permutation of `source` from `seed`
# via a linear-congruential-style shuffle; used to decode file ids.
3209 def _get_file_ID_mix_string(self, seed):
3211 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3213 for i in range(len(source)):
3214 seed = (seed * 211 + 30031 ) % 65536
3215 index = math.floor(seed / 65536 * len(source) )
3216 mixed.append(source[int(index)])
3217 source.remove(source[int(index)])
3218 #return ''.join(mixed)
# Decode the real file id: each '*'-separated token indexes into the
# seed-derived mix string.
3221 def _get_file_id(self, fileId, seed):
3222 mixed = self._get_file_ID_mix_string(seed)
3223 ids = fileId.split('*')
3227 realId.append(mixed[int(ch)])
3228 return ''.join(realId)
3230 def _real_extract(self, url):
3231 mobj = re.match(self._VALID_URL, url)
3233 self._downloader.report_error(u'invalid URL: %s' % url)
3235 video_id = mobj.group('ID')
3237 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3239 request = compat_urllib_request.Request(info_url, None, std_headers)
3241 self.report_download_webpage(video_id)
3242 jsondata = compat_urllib_request.urlopen(request).read()
3243 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3244 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3247 self.report_extraction(video_id)
3249 jsonstr = jsondata.decode('utf-8')
3250 config = json.loads(jsonstr)
3252 video_title = config['data'][0]['title']
3253 seed = config['data'][0]['seed']
# Format selection among the stream ids offered in the playlist config
# (e.g. 'hd2' when available for 'best').
3255 format = self._downloader.params.get('format', None)
3256 supported_format = list(config['data'][0]['streamfileids'].keys())
3258 if format is None or format == 'best':
3259 if 'hd2' in supported_format:
3264 elif format == 'worst':
3272 fileid = config['data'][0]['streamfileids'][format]
3273 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3274 except (UnicodeDecodeError, ValueError, KeyError):
3275 self._downloader.report_error(u'unable to extract info section')
3279 sid = self._gen_sid()
3280 fileid = self._get_file_id(fileid, seed)
3282 #column 8,9 of fileid represent the segment number
3283 #fileid[7:9] should be changed
# Videos are split into segments; build one download URL (and one info
# dict) per segment key, patching the segment number into the file id.
3284 for index, key in enumerate(keys):
3286 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3287 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3290 'id': '%s_part%02d' % (video_id, index),
3291 'url': download_url,
3293 'upload_date': None,
3294 'title': video_title,
3297 files_info.append(info)
3302 class XNXXIE(InfoExtractor):
3303 """Information extractor for xnxx.com"""
3305 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping regexes: flash video URL, page title, and thumbnail URL.
3307 VIDEO_URL_RE = r'flv_url=(.*?)&'
3308 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3309 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3311 def report_webpage(self, video_id):
3312 """Report information extraction"""
3313 self.to_screen(u'%s: Downloading webpage' % video_id)
3315 def _real_extract(self, url):
3316 mobj = re.match(self._VALID_URL, url)
3318 self._downloader.report_error(u'invalid URL: %s' % url)
3320 video_id = mobj.group(1)
3322 self.report_webpage(video_id)
3324 # Get webpage content
3326 webpage_bytes = compat_urllib_request.urlopen(url).read()
3327 webpage = webpage_bytes.decode('utf-8')
3328 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3329 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# The flv URL embedded in the page is percent-encoded; unquote it.
3332 result = re.search(self.VIDEO_URL_RE, webpage)
3334 self._downloader.report_error(u'unable to extract video url')
3336 video_url = compat_urllib_parse.unquote(result.group(1))
3338 result = re.search(self.VIDEO_TITLE_RE, webpage)
3340 self._downloader.report_error(u'unable to extract video title')
3342 video_title = result.group(1)
3344 result = re.search(self.VIDEO_THUMB_RE, webpage)
3346 self._downloader.report_error(u'unable to extract video thumbnail')
3348 video_thumbnail = result.group(1)
3354 'upload_date': None,
3355 'title': video_title,
3357 'thumbnail': video_thumbnail,
3358 'description': None,
3362 class GooglePlusIE(InfoExtractor):
3363 """Information extractor for plus.google.com."""
3365 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3366 IE_NAME = u'plus.google'
3368 def __init__(self, downloader=None):
3369 InfoExtractor.__init__(self, downloader)
# Progress reporters for each scraping stage (entry, date, uploader,
# title, video page). "extry" is a long-standing typo kept as-is.
3371 def report_extract_entry(self, url):
3372 """Report downloading extry"""
3373 self.to_screen(u'Downloading entry: %s' % url)
3375 def report_date(self, upload_date):
3376 """Report downloading extry"""
3377 self.to_screen(u'Entry date: %s' % upload_date)
3379 def report_uploader(self, uploader):
3380 """Report downloading extry"""
3381 self.to_screen(u'Uploader: %s' % uploader)
3383 def report_title(self, video_title):
3384 """Report downloading extry"""
3385 self.to_screen(u'Title: %s' % video_title)
3387 def report_extract_vid_page(self, video_page):
3388 """Report information extraction."""
3389 self.to_screen(u'Extracting video page: %s' % video_page)
# Two-step extraction: (1) scrape the post page for metadata and the photo
# viewer URL, (2) fetch that viewer page and pick the highest-resolution
# googlevideo redirector link.
3391 def _real_extract(self, url):
3392 # Extract id from URL
3393 mobj = re.match(self._VALID_URL, url)
3395 self._downloader.report_error(u'Invalid URL: %s' % url)
3398 post_url = mobj.group(0)
3399 video_id = mobj.group(1)
3401 video_extension = 'flv'
3403 # Step 1, Retrieve post webpage to extract further information
3404 self.report_extract_entry(post_url)
3405 request = compat_urllib_request.Request(post_url)
3407 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3408 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3409 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3412 # Extract update date
3414 pattern = 'title="Timestamp">(.*?)</a>'
3415 mobj = re.search(pattern, webpage)
3417 upload_date = mobj.group(1)
3418 # Convert timestring to a format suitable for filename
3419 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3420 upload_date = upload_date.strftime('%Y%m%d')
3421 self.report_date(upload_date)
3425 pattern = r'rel\="author".*?>(.*?)</a>'
3426 mobj = re.search(pattern, webpage)
3428 uploader = mobj.group(1)
3429 self.report_uploader(uploader)
3432 # Get the first line for title
3434 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3435 mobj = re.search(pattern, webpage)
3437 video_title = mobj.group(1)
3438 self.report_title(video_title)
3440 # Step 2, Stimulate clicking the image box to launch video
3441 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3442 mobj = re.search(pattern, webpage)
3444 self._downloader.report_error(u'unable to extract video page URL')
3446 video_page = mobj.group(1)
3447 request = compat_urllib_request.Request(video_page)
3449 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3450 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3451 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3453 self.report_extract_vid_page(video_page)
3456 # Extract video links on video page
3457 """Extract video links of all sizes"""
3458 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3459 mobj = re.findall(pattern, webpage)
3461 self._downloader.report_error(u'unable to extract video links')
3463 # Sort in resolution
3464 links = sorted(mobj)
3466 # Choose the lowest of the sort, i.e. highest resolution
3467 video_url = links[-1]
3468 # Only get the url. The resolution part in the tuple has no use anymore
3469 video_url = video_url[-1]
3470 # Treat escaped \u0026 style hex
# Py2/Py3 split: str.decode exists on Python 2 only; Python 3 raises
# AttributeError and takes the bytes round-trip instead.
3472 video_url = video_url.decode("unicode_escape")
3473 except AttributeError: # Python 3
3474 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3480 'uploader': uploader,
3481 'upload_date': upload_date,
3482 'title': video_title,
3483 'ext': video_extension,
# NBA.com extractor: the mp4 URL is derived directly from the page path;
# the page itself is scraped only for title/date/description meta data.
3486 class NBAIE(InfoExtractor):
3487 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3490 def _real_extract(self, url):
3491 mobj = re.match(self._VALID_URL, url)
3493 self._downloader.report_error(u'invalid URL: %s' % url)
3496 video_id = mobj.group(1)
# Normalize directory-style URLs that end in /index.html.
3497 if video_id.endswith('/index.html'):
3498 video_id = video_id[:-len('/index.html')]
3500 webpage = self._download_webpage(url, video_id)
3502 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small scraping helper: first regex group from the page, or `default`.
3503 def _findProp(rexp, default=None):
3504 m = re.search(rexp, webpage)
3506 return unescapeHTML(m.group(1))
3510 shortened_video_id = video_id.rpartition('/')[2]
3511 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3513 'id': shortened_video_id,
# NOTE(review): key is 'uploader_date' here, not the conventional
# 'upload_date' — looks like a typo; confirm against consumers.
3517 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3518 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3522 class JustinTVIE(InfoExtractor):
3523 """Information extractor for justin.tv and twitch.tv"""
3524 # TODO: One broadcast may be split into multiple videos. The key
3525 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3526 # starts at 1 and increases. Can we treat all parts as one video?
3528 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3529 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# Page size for the paginated Justin.tv API.
3530 _JUSTIN_PAGE_LIMIT = 100
3531 IE_NAME = u'justin.tv'
3533 def report_download_page(self, channel, offset):
3534 """Report attempt to download a single page of videos."""
3535 self.to_screen(u'%s: Downloading video information from %d to %d' %
3536 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3538 # Return count of items, list of *valid* items
# Fetch one API page (JSON list of clips) and convert each clip into an
# info dict; non-list responses are API error payloads.
3539 def _parse_page(self, url):
3541 urlh = compat_urllib_request.urlopen(url)
3542 webpage_bytes = urlh.read()
3543 webpage = webpage_bytes.decode('utf-8', 'ignore')
3544 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3545 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3548 response = json.loads(webpage)
3549 if type(response) != list:
3550 error_text = response.get('error', 'unknown error')
3551 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3554 for clip in response:
3555 video_url = clip['video_file_url']
3557 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from the date part -> YYYYMMDD.
3558 video_date = re.sub('-', '', clip['start_time'][:10])
3559 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3560 video_id = clip['id']
3561 video_title = clip.get('title', video_id)
3565 'title': video_title,
3566 'uploader': clip.get('channel_name', video_uploader_id),
3567 'uploader_id': video_uploader_id,
3568 'upload_date': video_date,
3569 'ext': video_extension,
3571 return (len(response), info)
3573 def _real_extract(self, url):
3574 mobj = re.match(self._VALID_URL, url)
3576 self._downloader.report_error(u'invalid URL: %s' % url)
3579 api = 'http://api.justin.tv'
# Last matched group decides the mode: group 1 = whole channel archive,
# group 2 = single broadcast id.
3580 video_id = mobj.group(mobj.lastindex)
3582 if mobj.lastindex == 1:
3584 api += '/channel/archives/%s.json'
3586 api += '/broadcast/by_archive/%s.json'
3587 api = api % (video_id,)
3589 self.report_extraction(video_id)
3593 limit = self._JUSTIN_PAGE_LIMIT
# Page through the API until a short (last) page is returned.
3596 self.report_download_page(video_id, offset)
3597 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3598 page_count, page_info = self._parse_page(page_url)
3599 info.extend(page_info)
3600 if not paged or page_count != limit:
# FunnyOrDie extractor: video URL comes from the <video>/<source> markup;
# title falls back from the player heading to the page <title>.
3605 class FunnyOrDieIE(InfoExtractor):
3606 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3608 def _real_extract(self, url):
3609 mobj = re.match(self._VALID_URL, url)
3611 self._downloader.report_error(u'invalid URL: %s' % url)
3614 video_id = mobj.group('id')
3615 webpage = self._download_webpage(url, video_id)
3617 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3619 self._downloader.report_error(u'unable to find video information')
3620 video_url = unescapeHTML(m.group('url'))
3622 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback: use the document title when the player heading is absent.
3624 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3626 self._downloader.trouble(u'Cannot find video title')
3627 title = clean_html(m.group('title'))
3629 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3631 desc = unescapeHTML(m.group('desc'))
3640 'description': desc,
3644 class SteamIE(InfoExtractor):
# Verbose-mode regex (note suitable() below compiles with re.VERBOSE).
3645 _VALID_URL = r"""http://store.steampowered.com/
3646 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3648 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3652 def suitable(cls, url):
3653 """Receives a URL and returns True if suitable for this IE."""
3654 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Fetch the age-gated video page for the game and harvest every
# movie entry (URL, title, thumbnail) found in the player config.
3656 def _real_extract(self, url):
3657 m = re.match(self._VALID_URL, url, re.VERBOSE)
3658 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3659 gameID = m.group('gameID')
# Pre-filled agecheck parameters bypass the age gate.
3660 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3661 self.report_age_confirmation()
3662 webpage = self._download_webpage(videourl, gameID)
3663 mweb = re.finditer(urlRE, webpage)
3664 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3665 titles = re.finditer(namesRE, webpage)
3666 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3667 thumbs = re.finditer(thumbsRE, webpage)
# Zip the three iterators: movie entries, titles and thumbnails are
# assumed to appear in the same order on the page.
3669 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3670 video_id = vid.group('videoID')
3671 title = vtitle.group('videoName')
3672 video_url = vid.group('videoURL')
3673 video_thumb = thumb.group('thumbnail')
3675 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3680 'title': unescapeHTML(title),
3681 'thumbnail': video_thumb
# Ustream recorded-video extractor: the flv URL is derived from the video
# id; title and uploader are scraped from data attributes on the page.
3686 class UstreamIE(InfoExtractor):
3687 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3688 IE_NAME = u'ustream'
3690 def _real_extract(self, url):
3691 m = re.match(self._VALID_URL, url)
3692 video_id = m.group('videoID')
3693 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3694 webpage = self._download_webpage(url, video_id)
3695 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3696 title = m.group('title')
3697 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3698 uploader = m.group('uploader')
3704 'uploader': uploader
3708 class WorldStarHipHopIE(InfoExtractor):
3709 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3710 IE_NAME = u'WorldStarHipHop'
3712 def _real_extract(self, url):
# Direct media link pattern: an mp4 or flv hosted on hw-videos*.
3713 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3715 webpage_src = compat_urllib_request.urlopen(url).read()
3716 webpage_src = webpage_src.decode('utf-8')
3718 mobj = re.search(_src_url, webpage_src)
3720 m = re.match(self._VALID_URL, url)
3721 video_id = m.group('id')
3723 if mobj is not None:
3724 video_url = mobj.group()
3725 if 'mp4' in video_url:
3730 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
# Title: page <title> with a timestamp-based fallback.
3733 _title = r"""<title>(.*)</title>"""
3735 mobj = re.search(_title, webpage_src)
3737 if mobj is not None:
3738 title = mobj.group(1)
3740 title = 'World Start Hip Hop - %s' % time.ctime()
3742 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3743 mobj = re.search(_thumbnail, webpage_src)
3745 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3746 if mobj is not None:
3747 thumbnail = mobj.group(1)
# "candy" pages carry the title in a candytitles span instead.
3749 _title = r"""candytitles.*>(.*)</span>"""
3750 mobj = re.search(_title, webpage_src)
3751 if mobj is not None:
3752 title = mobj.group(1)
3759 'thumbnail' : thumbnail,
# RBMA Radio extractor: show metadata is embedded as JSON in an inline
# `gon.show=` script assignment on the page.
3764 class RBMARadioIE(InfoExtractor):
3765 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3767 def _real_extract(self, url):
3768 m = re.match(self._VALID_URL, url)
3769 video_id = m.group('videoID')
3771 webpage = self._download_webpage(url, video_id)
3772 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3774 raise ExtractorError(u'Cannot find metadata')
3775 json_data = m.group(1)
3778 data = json.loads(json_data)
3779 except ValueError as e:
3780 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Append a constant-bitrate parameter to the Akamai stream URL;
# extension is derived from the URL path.
3782 video_url = data['akamai_url'] + '&cbr=256'
3783 url_parts = compat_urllib_parse_urlparse(video_url)
3784 video_ext = url_parts.path.rpartition('.')[2]
3789 'title': data['title'],
3790 'description': data.get('teaser_text'),
3791 'location': data.get('country_of_origin'),
3792 'uploader': data.get('host', {}).get('name'),
3793 'uploader_id': data.get('host', {}).get('slug'),
3794 'thumbnail': data.get('image', {}).get('large_url_2x'),
3795 'duration': data.get('duration'),
3800 class YouPornIE(InfoExtractor):
3801 """Information extractor for youporn.com."""
3802 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3804 def _print_formats(self, formats):
3805 """Print all available formats"""
3806 print(u'Available formats:')
3807 print(u'ext\t\tformat')
3808 print(u'---------------------------------')
3809 for format in formats:
3810 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single format whose 'format' field equals req_format.
3812 def _specific(self, req_format, formats):
3814 if(x["format"]==req_format):
3818 def _real_extract(self, url):
3819 mobj = re.match(self._VALID_URL, url)
3821 self._downloader.report_error(u'invalid URL: %s' % url)
3824 video_id = mobj.group('videoid')
# The age-verification cookie lets us fetch the page without the gate.
3826 req = compat_urllib_request.Request(url)
3827 req.add_header('Cookie', 'age_verified=1')
3828 webpage = self._download_webpage(req, video_id)
3830 # Get the video title
3831 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3833 raise ExtractorError(u'Unable to extract video title')
3834 video_title = result.group('title').strip()
3836 # Get the video date
3837 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3839 self._downloader.report_warning(u'unable to extract video date')
3842 upload_date = result.group('date').strip()
3844 # Get the video uploader
3845 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3847 self._downloader.report_warning(u'unable to extract uploader')
3848 video_uploader = None
3850 video_uploader = result.group('uploader').strip()
3851 video_uploader = clean_html( video_uploader )
3853 # Get all of the formats available
3854 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3855 result = re.search(DOWNLOAD_LIST_RE, webpage)
3857 raise ExtractorError(u'Unable to extract download list')
3858 download_list_html = result.group('download_list').strip()
3860 # Get all of the links from the page
3861 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3862 links = re.findall(LINK_RE, download_list_html)
3863 if(len(links) == 0):
3864 raise ExtractorError(u'ERROR: no known formats available for video')
3866 self.to_screen(u'Links found: %d' % len(links))
3871 # A link looks like this:
3872 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3873 # A path looks like this:
3874 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format label (e.g. "480p-370k") is parsed from the URL path's 5th
# component; size and bitrate come from its first two '_' fields.
3875 video_url = unescapeHTML( link )
3876 path = compat_urllib_parse_urlparse( video_url ).path
3877 extension = os.path.splitext( path )[1][1:]
3878 format = path.split('/')[4].split('_')[:2]
3881 format = "-".join( format )
3882 title = u'%s-%s-%s' % (video_title, size, bitrate)
3887 'uploader': video_uploader,
3888 'upload_date': upload_date,
3893 'description': None,
3897 if self._downloader.params.get('listformats', None):
3898 self._print_formats(formats)
# Format selection: formats are assumed ordered best-first, so 'best'
# and 'worst' pick the ends of the list; 'all'/-1 returns everything.
3901 req_format = self._downloader.params.get('format', None)
3902 self.to_screen(u'Format: %s' % req_format)
3904 if req_format is None or req_format == 'best':
3906 elif req_format == 'worst':
3907 return [formats[-1]]
3908 elif req_format in ('-1', 'all'):
3911 format = self._specific( req_format, formats )
3913 self._downloader.report_error(u'requested format not available')
3919 class PornotubeIE(InfoExtractor):
3920 """Information extractor for pornotube.com."""
3921 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3923 def _real_extract(self, url):
3924 mobj = re.match(self._VALID_URL, url)
3926 self._downloader.report_error(u'invalid URL: %s' % url)
# Both the video id and the title are carried in the URL itself.
3929 video_id = mobj.group('videoid')
3930 video_title = mobj.group('title')
3932 # Get webpage content
3933 webpage = self._download_webpage(url, video_id)
# The flv URL appears percent-encoded inside the player config.
3936 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3937 result = re.search(VIDEO_URL_RE, webpage)
3939 self._downloader.report_error(u'unable to extract video url')
3941 video_url = compat_urllib_parse.unquote(result.group('url'))
3943 #Get the uploaded date
3944 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3945 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error message says "title" but this branch is about the
# upload date — looks like a copy-paste slip; confirm before changing.
3947 self._downloader.report_error(u'unable to extract video title')
3949 upload_date = result.group('date')
3951 info = {'id': video_id,
3954 'upload_date': upload_date,
3955 'title': video_title,
3961 class YouJizzIE(InfoExtractor):
3962 """Information extractor for youjizz.com."""
3963 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3965 def _real_extract(self, url):
3966 mobj = re.match(self._VALID_URL, url)
3968 self._downloader.report_error(u'invalid URL: %s' % url)
3971 video_id = mobj.group('videoid')
3973 # Get webpage content
3974 webpage = self._download_webpage(url, video_id)
3976 # Get the video title
3977 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3979 raise ExtractorError(u'ERROR: unable to extract video title')
3980 video_title = result.group('title').strip()
3982 # Get the embed page
# The main page only references an embed page; the actual stream URL
# lives in the embed page's flash player config.
3983 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3985 raise ExtractorError(u'ERROR: unable to extract embed page')
3987 embed_page_url = result.group(0).strip()
3988 video_id = result.group('videoid')
3990 webpage = self._download_webpage(embed_page_url, video_id)
3993 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3995 raise ExtractorError(u'ERROR: unable to extract video url')
3996 video_url = result.group('source')
3998 info = {'id': video_id,
4000 'title': video_title,
# player_url is needed for rtmpdump-style downloads (see class docstring).
4003 'player_url': embed_page_url}
4007 class EightTracksIE(InfoExtractor):
4009 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4011 def _real_extract(self, url):
4012 mobj = re.match(self._VALID_URL, url)
4014 raise ExtractorError(u'Invalid URL: %s' % url)
4015 playlist_id = mobj.group('id')
4017 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded in the page as a `PAGE.mix = {...};` JS blob.
4019 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4021 raise ExtractorError(u'Cannot find trax information')
4022 json_like = m.group(1)
4023 data = json.loads(json_like)
# A random session token is required by the 8tracks play API.
4025 session = str(random.randint(0, 1000000000))
4027 track_count = data['tracks_count']
4028 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4029 next_url = first_url
# Walk the playlist track by track: each API response describes one track
# and whether it is the last; build one info dict per track.
4031 for i in itertools.count():
4032 api_json = self._download_webpage(next_url, playlist_id,
4033 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4034 errnote=u'Failed to download song information')
4035 api_data = json.loads(api_json)
4036 track_data = api_data[u'set']['track']
4038 'id': track_data['id'],
4039 'url': track_data['track_file_stream_url'],
4040 'title': track_data['performer'] + u' - ' + track_data['name'],
4041 'raw_title': track_data['name'],
4042 'uploader_id': data['user']['login'],
4046 if api_data['set']['at_last_track']:
4048 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Keek extractor: video and thumbnail URLs are derived from the video id;
# title and uploader are scraped from the page's og:title meta tag and
# the user-bio block.
4051 class KeekIE(InfoExtractor):
4052 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4055 def _real_extract(self, url):
4056 m = re.match(self._VALID_URL, url)
4057 video_id = m.group('videoID')
4058 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4059 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4060 webpage = self._download_webpage(url, video_id)
4061 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4062 title = unescapeHTML(m.group('title'))
4063 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4064 uploader = clean_html(m.group('uploader'))
4070 'thumbnail': thumbnail,
4071 'uploader': uploader
4075 class TEDIE(InfoExtractor):
# Verbose-mode regex; matches either a playlist URL or a single-talk URL
# (see suitable(), which compiles with re.VERBOSE).
4076 _VALID_URL=r'''http://www.ted.com/
4078 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4080 ((?P<type_talk>talks)) # We have a simple talk
4082 /(?P<name>\w+) # Here goes the name and then ".html"
4086 def suitable(cls, url):
4087 """Receives a URL and returns True if suitable for this IE."""
4088 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Dispatch: single talk -> _talk_info; otherwise treat as a playlist.
4090 def _real_extract(self, url):
4091 m=re.match(self._VALID_URL, url, re.VERBOSE)
4092 if m.group('type_talk'):
4093 return [self._talk_info(url)]
4095 playlist_id=m.group('playlist_id')
4096 name=m.group('name')
4097 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4098 return [self._playlist_videos_info(url,name,playlist_id)]
4100 def _talk_video_link(self,mediaSlug):
4101 '''Returns the video link for that mediaSlug'''
4102 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4104 def _playlist_videos_info(self,url,name,playlist_id=0):
4105 '''Returns the videos of the playlist'''
# Two verbose regexes: one for the talk entries' data attributes, one for
# each entry's talk URL and full name; iterated in lockstep via zip below.
4107 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4108 ([.\s]*?)data-playlist_item_id="(\d+)"
4109 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4111 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4112 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4113 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4114 m_names=re.finditer(video_name_RE,webpage)
4116 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4117 m_playlist = re.search(playlist_RE, webpage)
4118 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry becomes a 'url' result delegated back to TED(IE).
4120 playlist_entries = []
4121 for m_video, m_name in zip(m_videos,m_names):
4122 video_id=m_video.group('video_id')
4123 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4124 playlist_entries.append(self.url_result(talk_url, 'TED'))
4125 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4127 def _talk_info(self, url, video_id=0):
4128 """Return the video for the talk in the url"""
4129 m=re.match(self._VALID_URL, url,re.VERBOSE)
4130 videoName=m.group('name')
4131 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4132 # If the url includes the language we get the title translated
4133 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4134 title=re.search(title_RE, webpage).group('title')
# The inline talkDetails JS blob carries the numeric id and mediaSlug,
# from which the direct download URL is built (_talk_video_link).
4135 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4136 "id":(?P<videoID>[\d]+).*?
4137 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4138 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4139 thumb_match=re.search(thumb_RE,webpage)
4140 info_match=re.search(info_RE,webpage,re.VERBOSE)
4141 video_id=info_match.group('videoID')
4142 mediaSlug=info_match.group('mediaSlug')
4143 video_url=self._talk_video_link(mediaSlug)
4149 'thumbnail': thumb_match.group('thumbnail')
# NOTE(review): extract is incomplete -- the guard/else lines around the
# trailing-slash handling, the error fallbacks, and the head of the returned
# info dict (orig 4196-4203) are missing. Comments only.
# Extractor for myspass.de; metadata comes from an XML API endpoint.
4153 class MySpassIE(InfoExtractor):
4154 _VALID_URL = r'http://www.myspass.de/.*'
4156 def _real_extract(self, url):
4157 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4159 # video id is the last path element of the URL
4160 # usually there is a trailing slash, so also try the second but last
4161 url_path = compat_urllib_parse_urlparse(url).path
4162 url_parent_path, video_id = os.path.split(url_path)
# Fallback when the last path element was empty (trailing slash); the
# guarding condition line is missing from this extract.
4164 _, video_id = os.path.split(url_parent_path)
# Fetch and parse the XML metadata document for this id.
4167 metadata_url = META_DATA_URL_TEMPLATE % video_id
4168 metadata_text = self._download_webpage(metadata_url, video_id)
4169 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4171 # extract values from metadata
# url_flv and title are mandatory: report_error if either element is absent.
4172 url_flv_el = metadata.find('url_flv')
4173 if url_flv_el is None:
4174 self._downloader.report_error(u'unable to extract download url')
4176 video_url = url_flv_el.text
# File extension is derived from the media URL (splitext keeps the dot,
# [1:] strips it).
4177 extension = os.path.splitext(video_url)[1][1:]
4178 title_el = metadata.find('title')
4179 if title_el is None:
4180 self._downloader.report_error(u'unable to extract title')
4182 title = title_el.text
4183 format_id_el = metadata.find('format_id')
4184 if format_id_el is None:
# The None-branch body (orig 4185-4186) is missing here; this assignment is
# the found-element path.
4187 format = format_id_el.text
# description and imagePreview are optional; defaults presumably set in the
# missing else-branches (orig 4191-4192) -- TODO confirm against full source.
4188 description_el = metadata.find('description')
4189 if description_el is not None:
4190 description = description_el.text
4193 imagePreview_el = metadata.find('imagePreview')
4194 if imagePreview_el is not None:
4195 thumbnail = imagePreview_el.text
# Tail of the returned info dict.
4204 'thumbnail': thumbnail,
4205 'description': description
# NOTE(review): extract is incomplete -- the `if not m:` guard before the
# raise, and the head of the returned info dict (orig 4233-4236), are missing.
# Comments only; code lines kept byte-identical.
# Extractor for spiegel.de videos; stream data comes from a per-video XML file.
4209 class SpiegelIE(InfoExtractor):
4210 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4212 def _real_extract(self, url):
4213 m = re.match(self._VALID_URL, url)
4214 video_id = m.group('videoID')
4216 webpage = self._download_webpage(url, video_id)
# Title is scraped from the page; the failure guard line preceding this
# raise is in a missing line.
4217 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4219 raise ExtractorError(u'Cannot find title')
4220 video_title = unescapeHTML(m.group(1))
# Per-video XML document listing the available stream variants.
4222 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4223 xml_code = self._download_webpage(xml_url, video_id,
4224 note=u'Downloading XML', errnote=u'Failed to download XML')
4226 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last <type> element is taken as the best variant; its filename and
# duration drive the final URL.
4227 last_type = idoc[-1]
4228 filename = last_type.findall('./filename')[0].text
4229 duration = float(last_type.findall('./duration')[0].text)
4231 video_url = 'http://video2.spiegel.de/flash/' + filename
4232 video_ext = filename.rpartition('.')[2]
# Tail of the returned info dict ('id'/'url'/'ext' entries are in the
# missing lines above).
4237 'title': video_title,
4238 'duration': duration,
# NOTE(review): extract is incomplete -- the `if ... is None:` guards that
# precede each error call, the defaults for desc/uploader, and the head of
# the returned info dict (orig 4277-4284) are missing. Comments only.
# Extractor for liveleak.com view pages.
4242 class LiveLeakIE(InfoExtractor):
# Video id is the `i=` query parameter of a /view URL.
4244 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4245 IE_NAME = u'liveleak'
4247 def _real_extract(self, url):
4248 mobj = re.match(self._VALID_URL, url)
# Error path for a non-matching URL (its guard line is missing here).
4250 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4253 video_id = mobj.group('video_id')
4255 webpage = self._download_webpage(url, video_id)
# Direct media URL is scraped from the player config in the page.
4257 m = re.search(r'file: "(.*?)",', webpage)
4259 self._downloader.report_error(u'unable to find video url')
4261 video_url = m.group(1)
# Title comes from og:title with the site prefix stripped.
4263 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4265 self._downloader.trouble(u'Cannot find video title')
4266 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
# Optional description and uploader; the None-handling branches are in
# missing lines.
4268 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4270 desc = unescapeHTML(m.group('desc'))
4274 m = re.search(r'By:.*?(\w+)</a>', webpage)
4276 uploader = clean_html(m.group(1))
# Tail of the returned info dict.
4285 'description': desc,
4286 'uploader': uploader
# NOTE(review): extract is incomplete -- the `if numid:` / `else:` pair, the
# `if not streams:` guard before the fsk assert, the RTMP/HTTP `else:` line,
# and the final `return [info]` (orig 4329-4331) are missing. Comments only.
# Extractor for the ARD Mediathek / daserste.de media library.
4291 class ARDIE(InfoExtractor):
4292 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4293 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
# Each addMediaStream(...) call in the page yields media_type, quality,
# rtmp_url and video_url via named groups.
4294 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4296 def _real_extract(self, url):
4297 # determine video id from url
4298 m = re.match(self._VALID_URL, url)
# Prefer a numeric documentId query parameter over the path slug; the
# branching lines between these assignments are missing from this extract.
4300 numid = re.search(r'documentId=([0-9]+)', url)
4302 video_id = numid.group(1)
4304 video_id = m.group('video_id')
4306 # determine title and media streams from webpage
4307 html = self._download_webpage(url, video_id)
4308 title = re.search(self._TITLE, html).group('title')
4309 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No-streams path: an "fsk" marker in the page means an age-restriction
# (German FSK) time lock; the guard line before this assert is missing.
4311 assert '"fsk"' in html
4312 self._downloader.report_error(u'this video is only available after 8:00 pm')
4315 # choose default media type and highest quality for now
4316 stream = max([s for s in streams if int(s["media_type"]) == 0],
4317 key=lambda s: int(s["quality"]))
4319 # there's two possibilities: RTMP stream or HTTP download
4320 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4321 if stream['rtmp_url']:
4322 self.to_screen(u'RTMP download detected')
4323 assert stream['video_url'].startswith('mp4:')
4324 info["url"] = stream["rtmp_url"]
4325 info["play_path"] = stream['video_url']
# HTTP-download branch (its `else:` line is missing from this extract).
4327 assert stream["video_url"].endswith('.mp4')
4328 info["url"] = stream["video_url"]
# NOTE(review): only three entries of the extractor list survive in this
# extract; the docstring terminator, the rest of the list, and the closing
# `return` statement are in missing lines. Comments only.
# Factory returning one instance of every supported extractor, in match
# priority order.
4332 def gen_extractors():
4333 """ Return a list of an instance of every supported extractor.
4334 The order does matter; the first extractor matched is the one handling the URL.
4337 YoutubePlaylistIE(),
4362 StanfordOpenClassroomIE(),
4372 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention; resolve the
    # class object from this module's namespace.
    class_name = '%sIE' % ie_name
    return globals()[class_name]