7 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or url and ext:

    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
                    * filesize    The number of bytes, if known in advance
                    * player_url  SWF Player URL (used for rtmpdump).
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Wire up the FileDownloader this extractor reports to; may be None
        # and can be (re)assigned later via set_downloader().
        self.set_downloader(downloader)
113 def suitable(cls, url):
114 """Receives a URL and returns True if suitable for this IE."""
116 # This does not use has/getattr intentionally - we want to know whether
117 # we have cached the regexp for *this* class, whereas getattr would also
118 # match the superclass
119 if '_VALID_URL_RE' not in cls.__dict__:
120 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
121 return cls._VALID_URL_RE.match(url) is not None
125 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): this calls _real_initialize() unconditionally; a
        # run-once guard (e.g. a "ready" flag) appears to have been lost in
        # this copy — confirm repeated calls are safe for subclasses.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): no initialize() call precedes extraction here —
        # verify callers run initialize() first (login/age-gate handling).
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader provides output (to_screen/report_warning), params,
        # and network access (urlopen) used throughout this class.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        Default is a no-op: extractors with no login/age-gate needs keep it.
        """
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Must return the information dict(s) described in the class docstring.
        """
153 """A string for getting the InfoExtractor with get_info_extractor"""
154 return cls.__name__[:-2]
158 return type(self).__name__[:-2]
160 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
161 """ Returns the response handle """
163 self.report_download_webpage(video_id)
164 elif note is not False:
166 self.to_screen(u'%s' % (note,))
168 self.to_screen(u'%s: %s' % (video_id, note))
170 return self._downloader.urlopen(url_or_request)
171 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
173 errnote = u'Unable to download webpage'
174 errmsg = u'%s: %s' % (errnote, compat_str(err))
176 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
178 self._downloader.report_warning(errmsg)
181 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
182 """ Returns a tuple (page content as string, URL handle) """
184 # Strip hashes from the URL (#1038)
185 if isinstance(url_or_request, (compat_str, str)):
186 url_or_request = url_or_request.partition('#')[0]
188 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
192 content_type = urlh.headers.get('Content-Type', '')
193 webpage_bytes = urlh.read()
194 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
196 encoding = m.group(1)
198 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
199 webpage_bytes[:1024])
201 encoding = m.group(1).decode('ascii')
204 if self._downloader.params.get('dump_intermediate_pages', False):
206 url = url_or_request.get_full_url()
207 except AttributeError:
209 self.to_screen(u'Dumping request to ' + url)
210 dump = base64.b64encode(webpage_bytes).decode('ascii')
211 self._downloader.to_screen(dump)
212 if self._downloader.params.get('write_pages', False):
214 url = url_or_request.get_full_url()
215 except AttributeError:
217 raw_filename = ('%s_%s.dump' % (video_id, url))
218 filename = sanitize_filename(raw_filename, restricted=True)
219 self.to_screen(u'Saving request to ' + filename)
220 with open(filename, 'wb') as outf:
221 outf.write(webpage_bytes)
223 content = webpage_bytes.decode(encoding, 'replace')
224 return (content, urlh)
226 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
227 """ Returns the data of the page as a string """
228 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
235 def _download_xml(self, url_or_request, video_id,
236 note=u'Downloading XML', errnote=u'Unable to download XML',
237 transform_source=None):
238 """Return the xml as an xml.etree.ElementTree.Element"""
239 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
241 xml_string = transform_source(xml_string)
242 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        # IE_NAME identifies the concrete extractor in mixed output.
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')
    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')
    # Methods for following issue #608 (URL/playlist result helpers)
265 def url_result(self, url, ie=None, video_id=None):
266 """Returns a url that points to a page that should be processed"""
267 #TODO: ie should be the class used for getting the info
268 video_info = {'_type': 'url',
271 if video_id is not None:
272 video_info['id'] = video_id
274 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
275 """Returns a playlist"""
276 video_info = {'_type': 'playlist',
279 video_info['id'] = playlist_id
281 video_info['title'] = playlist_title
284 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
286 Perform a regex search on the given string, using a single or a list of
287 patterns returning the first matching group.
288 In case of failure return a default value or raise a WARNING or a
289 RegexNotFoundError, depending on fatal, specifying the field name.
291 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
292 mobj = re.search(pattern, string, flags)
295 mobj = re.search(p, string, flags)
298 if sys.stderr.isatty() and os.name != 'nt':
299 _name = u'\033[0;34m%s\033[0m' % name
304 # return the first matching group
305 return next(g for g in mobj.groups() if g is not None)
306 elif default is not None:
309 raise RegexNotFoundError(u'Unable to extract %s' % _name)
311 self._downloader.report_warning(u'unable to extract %s; '
312 u'please report this issue on http://yt-dl.org/bug' % _name)
315 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
317 Like _search_regex, but strips HTML tags and unescapes entities.
319 res = self._search_regex(pattern, string, name, default, fatal, flags)
321 return clean_html(res).strip()
325 def _get_login_info(self):
327 Get the the login info as (username, password)
328 It will look in the netrc file using the _NETRC_MACHINE value
329 If there's no info available, return (None, None)
331 if self._downloader is None:
336 downloader_params = self._downloader.params
338 # Attempt to use provided username and password or .netrc data
339 if downloader_params.get('username', None) is not None:
340 username = downloader_params['username']
341 password = downloader_params['password']
342 elif downloader_params.get('usenetrc', False):
344 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
349 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
350 except (IOError, netrc.NetrcParseError) as err:
351 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
353 return (username, password)
355 # Helper functions for extracting OpenGraph info
357 def _og_regexes(prop):
358 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
359 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
360 template = r'<meta[^>]+?%s[^>]+?%s'
362 template % (property_re, content_re),
363 template % (content_re, property_re),
366 def _og_search_property(self, prop, html, name=None, **kargs):
368 name = 'OpenGraph %s' % prop
369 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
372 return unescapeHTML(escaped)
    def _og_search_thumbnail(self, html, **kargs):
        # og:image, non-fatal: returns None when the tag is absent.
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
    def _og_search_description(self, html, **kargs):
        # og:description, non-fatal: returns None when the tag is absent.
        return self._og_search_property('description', html, fatal=False, **kargs)
    def _og_search_title(self, html, **kargs):
        # og:title; fatal by default (a missing title is an extraction error).
        return self._og_search_property('title', html, **kargs)
383 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
384 regexes = self._og_regexes('video')
385 if secure: regexes = self._og_regexes('video:secure_url') + regexes
386 return self._html_search_regex(regexes, html, name, **kargs)
388 def _html_search_meta(self, name, html, display_name=None):
389 if display_name is None:
391 return self._html_search_regex(
393 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
394 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
395 html, display_name, fatal=False)
    def _dc_search_uploader(self, html):
        # Dublin Core creator metadata doubles as the uploader name.
        return self._html_search_meta('dc.creator', html, 'uploader')
400 def _rta_search(self, html):
401 # See http://www.rtalabel.org/index.php?content=howtofaq#single
402 if re.search(r'(?ix)<meta\s+name="rating"\s+'
403 r' content="RTA-5042-1996-1400-1577-RTA"',
408 def _media_rating_search(self, html):
409 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
410 rating = self._html_search_meta('rating', html)
422 return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
434 def _make_valid_url(cls):
435 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
438 def suitable(cls, url):
439 return re.match(cls._make_valid_url(), url) is not None
441 def _real_extract(self, query):
442 mobj = re.match(self._make_valid_url(), query)
444 raise ExtractorError(u'Invalid search query "%s"' % query)
446 prefix = mobj.group('prefix')
447 query = mobj.group('query')
449 return self._get_n_results(query, 1)
450 elif prefix == 'all':
451 return self._get_n_results(query, self._MAX_RESULTS)
455 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
456 elif n > self._MAX_RESULTS:
457 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
458 n = self._MAX_RESULTS
459 return self._get_n_results(query, n)
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Abstract hook: each search IE implements its own paging.
        raise NotImplementedError("This method must be implemented by subclasses")
466 def SEARCH_KEY(self):
467 return self._SEARCH_KEY