11 compat_urllib_request,
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False      # becomes True after _real_initialize() has run once
    _downloader = None  # FileDownloader instance, set via set_downloader()
    _WORKING = True     # subclasses set this to False for broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Human-readable IE name: the class name minus the 'IE' suffix
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Determine the encoding: Content-Type charset first, then a <meta>
        # charset declaration in the first KB, finally fall back to UTF-8
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue on capable (non-Windows) terminals
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = [self._og_regex('video')]
        if secure: regexes.insert(0, self._og_regex('video:secure_url'))
        return self._html_search_regex(regexes, html, name, **kargs)

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key>all:<query>" or "<key><N>:<query>"
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare search key: return only the first result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the per-IE maximum and warn the user
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY