11 compat_urllib_request,
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url       Mandatory. The URL of the video file
                    * ext       Will be calculated from url if missing
                    * format    A human-readable description of the format
                                ("mp4 container with h264/opus").
                                Calculated from width and height if missing.
                    * format_id A short description of the format
                                ("mp4_h264_opus" or "19")
                    * width     Width of the video, if known
                    * height    Height of the video, if known

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready tracks whether _real_initialize() has already run (lazy init).
    _ready = False
    _downloader = None
    # Set to False in subclasses to mark the extractor as broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the (possibly expensive) real initialization once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # By convention extractor class names end in "IE"; strip that suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Human-readable extractor name, derived like ie_key() but per-instance.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses any screen output for this request.
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header ...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ... then a <meta charset=...> tag near the top of the document ...
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                # ... and fall back to UTF-8.
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string, not a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on mis-declared encodings.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list of patterns: stop at the first one that matches.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name in error output on capable terminals.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a broken .netrc only warns, it does not abort.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        # Matches content wrapped in either double or single quotes (two groups).
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        # Prefer the secure URL when both og:video:secure_url and og:video exist.
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Empty prefix means "first result", a number means "first N results",
        # 'all' means every available result (capped at _MAX_RESULTS).
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp the request to the extractor's hard maximum.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        # Public, read-only accessor for the search URL prefix.
        return self._SEARCH_KEY