11 compat_urllib_request,
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # becomes True once _real_initialize() has run
    _downloader = None      # set via set_downloader()
    _WORKING = True         # subclasses set this to False for broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # User-visible extractor name, derived from the class name
        # (drops the trailing two characters, conventionally "IE").
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header ...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ... otherwise sniff a <meta charset=...> near the top of the
            # document, falling back to UTF-8.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name in error output on capable terminals.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        # Two patterns: property attribute before content, and the reverse order.
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (single result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No count given: return a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum, warning the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY