7 import xml.etree.ElementTree
# Sentinel used by _search_regex()/_html_search_regex() so a caller-supplied
# default of None can be told apart from "no default supplied at all".
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or url and ext:

    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
106 def __init__(self, downloader=None):
107 """Constructor. Receives an optional downloader."""
109 self.set_downloader(downloader)
112 def suitable(cls, url):
113 """Receives a URL and returns True if suitable for this IE."""
115 # This does not use has/getattr intentionally - we want to know whether
116 # we have cached the regexp for *this* class, whereas getattr would also
117 # match the superclass
118 if '_VALID_URL_RE' not in cls.__dict__:
119 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
120 return cls._VALID_URL_RE.match(url) is not None
124 """Getter method for _WORKING."""
127 def initialize(self):
128 """Initializes an instance (authentication, etc)."""
130 self._real_initialize()
133 def extract(self, url):
134 """Extracts URL information and returns it in list of dicts."""
136 return self._real_extract(url)
138 def set_downloader(self, downloader):
139 """Sets the downloader for this IE."""
140 self._downloader = downloader
142 def _real_initialize(self):
143 """Real initialization process. Redefine in subclasses."""
146 def _real_extract(self, url):
147 """Real extraction process. Redefine in subclasses."""
152 """A string for getting the InfoExtractor with get_info_extractor"""
153 return cls.__name__[:-2]
157 return type(self).__name__[:-2]
159 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
160 """ Returns the response handle """
162 self.report_download_webpage(video_id)
163 elif note is not False:
165 self.to_screen(u'%s' % (note,))
167 self.to_screen(u'%s: %s' % (video_id, note))
169 return self._downloader.urlopen(url_or_request)
170 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
174 errnote = u'Unable to download webpage'
175 errmsg = u'%s: %s' % (errnote, compat_str(err))
177 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
179 self._downloader.report_warning(errmsg)
182 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
183 """ Returns a tuple (page content as string, URL handle) """
185 # Strip hashes from the URL (#1038)
186 if isinstance(url_or_request, (compat_str, str)):
187 url_or_request = url_or_request.partition('#')[0]
189 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
193 content_type = urlh.headers.get('Content-Type', '')
194 webpage_bytes = urlh.read()
195 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
197 encoding = m.group(1)
199 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
200 webpage_bytes[:1024])
202 encoding = m.group(1).decode('ascii')
205 if self._downloader.params.get('dump_intermediate_pages', False):
207 url = url_or_request.get_full_url()
208 except AttributeError:
210 self.to_screen(u'Dumping request to ' + url)
211 dump = base64.b64encode(webpage_bytes).decode('ascii')
212 self._downloader.to_screen(dump)
213 if self._downloader.params.get('write_pages', False):
215 url = url_or_request.get_full_url()
216 except AttributeError:
218 raw_filename = ('%s_%s.dump' % (video_id, url))
219 filename = sanitize_filename(raw_filename, restricted=True)
220 self.to_screen(u'Saving request to ' + filename)
221 with open(filename, 'wb') as outf:
222 outf.write(webpage_bytes)
224 content = webpage_bytes.decode(encoding, 'replace')
225 return (content, urlh)
227 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
228 """ Returns the data of the page as a string """
229 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
236 def _download_xml(self, url_or_request, video_id,
237 note=u'Downloading XML', errnote=u'Unable to download XML',
238 transform_source=None):
239 """Return the xml as an xml.etree.ElementTree.Element"""
240 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
242 xml_string = transform_source(xml_string)
243 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
245 def report_warning(self, msg, video_id=None):
246 idstr = u'' if video_id is None else u'%s: ' % video_id
247 self._downloader.report_warning(
248 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
250 def to_screen(self, msg):
251 """Print msg to screen, prefixing it with '[ie_name]'"""
252 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
254 def report_extraction(self, id_or_name):
255 """Report information extraction."""
256 self.to_screen(u'%s: Extracting information' % id_or_name)
258 def report_download_webpage(self, video_id):
259 """Report webpage download."""
260 self.to_screen(u'%s: Downloading webpage' % video_id)
262 def report_age_confirmation(self):
263 """Report attempt to confirm age."""
264 self.to_screen(u'Confirming age')
266 def report_login(self):
267 """Report attempt to log in."""
268 self.to_screen(u'Logging in')
270 #Methods for following #608
272 def url_result(url, ie=None, video_id=None):
273 """Returns a url that points to a page that should be processed"""
274 #TODO: ie should be the class used for getting the info
275 video_info = {'_type': 'url',
278 if video_id is not None:
279 video_info['id'] = video_id
282 def playlist_result(entries, playlist_id=None, playlist_title=None):
283 """Returns a playlist"""
284 video_info = {'_type': 'playlist',
287 video_info['id'] = playlist_id
289 video_info['title'] = playlist_title
292 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
294 Perform a regex search on the given string, using a single or a list of
295 patterns returning the first matching group.
296 In case of failure return a default value or raise a WARNING or a
297 RegexNotFoundError, depending on fatal, specifying the field name.
299 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
300 mobj = re.search(pattern, string, flags)
303 mobj = re.search(p, string, flags)
306 if os.name != 'nt' and sys.stderr.isatty():
307 _name = u'\033[0;34m%s\033[0m' % name
312 # return the first matching group
313 return next(g for g in mobj.groups() if g is not None)
314 elif default is not _NO_DEFAULT:
317 raise RegexNotFoundError(u'Unable to extract %s' % _name)
319 self._downloader.report_warning(u'unable to extract %s; '
320 u'please report this issue on http://yt-dl.org/bug' % _name)
323 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
325 Like _search_regex, but strips HTML tags and unescapes entities.
327 res = self._search_regex(pattern, string, name, default, fatal, flags)
329 return clean_html(res).strip()
333 def _get_login_info(self):
335 Get the the login info as (username, password)
336 It will look in the netrc file using the _NETRC_MACHINE value
337 If there's no info available, return (None, None)
339 if self._downloader is None:
344 downloader_params = self._downloader.params
346 # Attempt to use provided username and password or .netrc data
347 if downloader_params.get('username', None) is not None:
348 username = downloader_params['username']
349 password = downloader_params['password']
350 elif downloader_params.get('usenetrc', False):
352 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
357 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
358 except (IOError, netrc.NetrcParseError) as err:
359 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
361 return (username, password)
363 # Helper functions for extracting OpenGraph info
365 def _og_regexes(prop):
366 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
367 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
368 template = r'<meta[^>]+?%s[^>]+?%s'
370 template % (property_re, content_re),
371 template % (content_re, property_re),
374 def _og_search_property(self, prop, html, name=None, **kargs):
376 name = 'OpenGraph %s' % prop
377 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
380 return unescapeHTML(escaped)
382 def _og_search_thumbnail(self, html, **kargs):
383 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
385 def _og_search_description(self, html, **kargs):
386 return self._og_search_property('description', html, fatal=False, **kargs)
388 def _og_search_title(self, html, **kargs):
389 return self._og_search_property('title', html, **kargs)
391 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
392 regexes = self._og_regexes('video')
393 if secure: regexes = self._og_regexes('video:secure_url') + regexes
394 return self._html_search_regex(regexes, html, name, **kargs)
396 def _html_search_meta(self, name, html, display_name=None):
397 if display_name is None:
399 return self._html_search_regex(
401 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
402 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
403 html, display_name, fatal=False)
405 def _dc_search_uploader(self, html):
406 return self._html_search_meta('dc.creator', html, 'uploader')
408 def _rta_search(self, html):
409 # See http://www.rtalabel.org/index.php?content=howtofaq#single
410 if re.search(r'(?ix)<meta\s+name="rating"\s+'
411 r' content="RTA-5042-1996-1400-1577-RTA"',
416 def _media_rating_search(self, html):
417 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
418 rating = self._html_search_meta('rating', html)
430 return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (default 1 result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum with a warning.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY