11 compat_urllib_request,
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url       Mandatory. The URL of the video file
                    * ext       Will be calculated from url if missing
                    * format    A human-readable description of the format
                                ("mp4 container with h264/opus").
                                Calculated from width and height if missing.
                    * format_id A short description of the format
                                ("mp4_h264_opus" or "19")
                    * width     Width of the video, if known
                    * height    Height of the video, if known

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Delegate to set_downloader() so a downloader attached later goes
        # through the same code path as one supplied at construction time.
        self.set_downloader(downloader)
92 def suitable(cls, url):
93 """Receives a URL and returns True if suitable for this IE."""
95 # This does not use has/getattr intentionally - we want to know whether
96 # we have cached the regexp for *this* class, whereas getattr would also
97 # match the superclass
98 if '_VALID_URL_RE' not in cls.__dict__:
99 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
100 return cls._VALID_URL_RE.match(url) is not None
104 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # _real_initialize() is the subclass hook (e.g. logging in);
        # it is redefined in subclasses per its docstring.
        self._real_initialize()
113 def extract(self, url):
114 """Extracts URL information and returns it in list of dicts."""
116 return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Stored on the instance; helpers such as to_screen() route their
        # output through it.
        self._downloader = downloader
122 def _real_initialize(self):
123 """Real initialization process. Redefine in subclasses."""
126 def _real_extract(self, url):
127 """Real extraction process. Redefine in subclasses."""
132 """A string for getting the InfoExtractor with get_info_extractor"""
133 return cls.__name__[:-2]
137 return type(self).__name__[:-2]
139 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
140 """ Returns the response handle """
142 self.report_download_webpage(video_id)
143 elif note is not False:
144 self.to_screen(u'%s: %s' % (video_id, note))
146 return compat_urllib_request.urlopen(url_or_request)
147 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
149 errnote = u'Unable to download webpage'
150 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
152 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
153 """ Returns a tuple (page content as string, URL handle) """
155 # Strip hashes from the URL (#1038)
156 if isinstance(url_or_request, (compat_str, str)):
157 url_or_request = url_or_request.partition('#')[0]
159 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
160 content_type = urlh.headers.get('Content-Type', '')
161 webpage_bytes = urlh.read()
162 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
164 encoding = m.group(1)
166 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
167 webpage_bytes[:1024])
169 encoding = m.group(1).decode('ascii')
172 if self._downloader.params.get('dump_intermediate_pages', False):
174 url = url_or_request.get_full_url()
175 except AttributeError:
177 self.to_screen(u'Dumping request to ' + url)
178 dump = base64.b64encode(webpage_bytes).decode('ascii')
179 self._downloader.to_screen(dump)
180 content = webpage_bytes.decode(encoding, 'replace')
181 return (content, urlh)
183 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
184 """ Returns the data of the page as a string """
185 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
187 def to_screen(self, msg):
188 """Print msg to screen, prefixing it with '[ie_name]'"""
189 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
191 def report_extraction(self, id_or_name):
192 """Report information extraction."""
193 self.to_screen(u'%s: Extracting information' % id_or_name)
195 def report_download_webpage(self, video_id):
196 """Report webpage download."""
197 self.to_screen(u'%s: Downloading webpage' % video_id)
199 def report_age_confirmation(self):
200 """Report attempt to confirm age."""
201 self.to_screen(u'Confirming age')
203 def report_login(self):
204 """Report attempt to log in."""
205 self.to_screen(u'Logging in')
207 #Methods for following #608
208 def url_result(self, url, ie=None):
209 """Returns a url that points to a page that should be processed"""
210 #TODO: ie should be the class used for getting the info
211 video_info = {'_type': 'url',
215 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
216 """Returns a playlist"""
217 video_info = {'_type': 'playlist',
220 video_info['id'] = playlist_id
222 video_info['title'] = playlist_title
225 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
227 Perform a regex search on the given string, using a single or a list of
228 patterns returning the first matching group.
229 In case of failure return a default value or raise a WARNING or a
230 ExtractorError, depending on fatal, specifying the field name.
232 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
233 mobj = re.search(pattern, string, flags)
236 mobj = re.search(p, string, flags)
239 if sys.stderr.isatty() and os.name != 'nt':
240 _name = u'\033[0;34m%s\033[0m' % name
245 # return the first matching group
246 return next(g for g in mobj.groups() if g is not None)
247 elif default is not None:
250 raise ExtractorError(u'Unable to extract %s' % _name)
252 self._downloader.report_warning(u'unable to extract %s; '
253 u'please report this issue on http://yt-dl.org/bug' % _name)
256 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
258 Like _search_regex, but strips HTML tags and unescapes entities.
260 res = self._search_regex(pattern, string, name, default, fatal, flags)
262 return clean_html(res).strip()
266 def _get_login_info(self):
268 Get the the login info as (username, password)
269 It will look in the netrc file using the _NETRC_MACHINE value
270 If there's no info available, return (None, None)
272 if self._downloader is None:
277 downloader_params = self._downloader.params
279 # Attempt to use provided username and password or .netrc data
280 if downloader_params.get('username', None) is not None:
281 username = downloader_params['username']
282 password = downloader_params['password']
283 elif downloader_params.get('usenetrc', False):
285 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
290 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
291 except (IOError, netrc.NetrcParseError) as err:
292 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
294 return (username, password)
296 # Helper functions for extracting OpenGraph info
299 return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
301 def _og_search_property(self, prop, html, name=None, **kargs):
303 name = 'OpenGraph %s' % prop
304 escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
305 return unescapeHTML(escaped)
307 def _og_search_thumbnail(self, html, **kargs):
308 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
310 def _og_search_description(self, html, **kargs):
311 return self._og_search_property('description', html, fatal=False, **kargs)
313 def _og_search_title(self, html, **kargs):
314 return self._og_search_property('title', html, **kargs)
316 def _og_search_video_url(self, html, name='video url', **kargs):
317 return self._html_search_regex([self._og_regex('video:secure_url'),
318 self._og_regex('video')],
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
329 def _make_valid_url(cls):
330 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
333 def suitable(cls, url):
334 return re.match(cls._make_valid_url(), url) is not None
336 def _real_extract(self, query):
337 mobj = re.match(self._make_valid_url(), query)
339 raise ExtractorError(u'Invalid search query "%s"' % query)
341 prefix = mobj.group('prefix')
342 query = mobj.group('query')
344 return self._get_n_results(query, 1)
345 elif prefix == 'all':
346 return self._get_n_results(query, self._MAX_RESULTS)
350 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
351 elif n > self._MAX_RESULTS:
352 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
353 n = self._MAX_RESULTS
354 return self._get_n_results(query, n)
356 def _get_n_results(self, query, n):
357 """Get a specified number of results for a query"""
358 raise NotImplementedError("This method must be implemented by sublclasses")
361 def SEARCH_KEY(self):
362 return self._SEARCH_KEY