7 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height,
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
                    * filesize    The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
104 def __init__(self, downloader=None):
105 """Constructor. Receives an optional downloader."""
107 self.set_downloader(downloader)
110 def suitable(cls, url):
111 """Receives a URL and returns True if suitable for this IE."""
113 # This does not use has/getattr intentionally - we want to know whether
114 # we have cached the regexp for *this* class, whereas getattr would also
115 # match the superclass
116 if '_VALID_URL_RE' not in cls.__dict__:
117 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
118 return cls._VALID_URL_RE.match(url) is not None
122 """Getter method for _WORKING."""
125 def initialize(self):
126 """Initializes an instance (authentication, etc)."""
128 self._real_initialize()
131 def extract(self, url):
132 """Extracts URL information and returns it in list of dicts."""
134 return self._real_extract(url)
136 def set_downloader(self, downloader):
137 """Sets the downloader for this IE."""
138 self._downloader = downloader
140 def _real_initialize(self):
141 """Real initialization process. Redefine in subclasses."""
144 def _real_extract(self, url):
145 """Real extraction process. Redefine in subclasses."""
150 """A string for getting the InfoExtractor with get_info_extractor"""
151 return cls.__name__[:-2]
155 return type(self).__name__[:-2]
157 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
158 """ Returns the response handle """
160 self.report_download_webpage(video_id)
161 elif note is not False:
163 self.to_screen(u'%s' % (note,))
165 self.to_screen(u'%s: %s' % (video_id, note))
167 return self._downloader.urlopen(url_or_request)
168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
170 errnote = u'Unable to download webpage'
171 errmsg = u'%s: %s' % (errnote, compat_str(err))
173 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
175 self._downloader.report_warning(errmsg)
178 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
179 """ Returns a tuple (page content as string, URL handle) """
181 # Strip hashes from the URL (#1038)
182 if isinstance(url_or_request, (compat_str, str)):
183 url_or_request = url_or_request.partition('#')[0]
185 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
189 content_type = urlh.headers.get('Content-Type', '')
190 webpage_bytes = urlh.read()
191 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
193 encoding = m.group(1)
195 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
196 webpage_bytes[:1024])
198 encoding = m.group(1).decode('ascii')
201 if self._downloader.params.get('dump_intermediate_pages', False):
203 url = url_or_request.get_full_url()
204 except AttributeError:
206 self.to_screen(u'Dumping request to ' + url)
207 dump = base64.b64encode(webpage_bytes).decode('ascii')
208 self._downloader.to_screen(dump)
209 if self._downloader.params.get('write_pages', False):
211 url = url_or_request.get_full_url()
212 except AttributeError:
214 raw_filename = ('%s_%s.dump' % (video_id, url))
215 filename = sanitize_filename(raw_filename, restricted=True)
216 self.to_screen(u'Saving request to ' + filename)
217 with open(filename, 'wb') as outf:
218 outf.write(webpage_bytes)
220 content = webpage_bytes.decode(encoding, 'replace')
221 return (content, urlh)
223 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
224 """ Returns the data of the page as a string """
225 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
232 def _download_xml(self, url_or_request, video_id,
233 note=u'Downloading XML', errnote=u'Unable to download XML',
234 transform_source=None):
235 """Return the xml as an xml.etree.ElementTree.Element"""
236 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
238 xml_string = transform_source(xml_string)
239 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
241 def to_screen(self, msg):
242 """Print msg to screen, prefixing it with '[ie_name]'"""
243 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
245 def report_extraction(self, id_or_name):
246 """Report information extraction."""
247 self.to_screen(u'%s: Extracting information' % id_or_name)
249 def report_download_webpage(self, video_id):
250 """Report webpage download."""
251 self.to_screen(u'%s: Downloading webpage' % video_id)
253 def report_age_confirmation(self):
254 """Report attempt to confirm age."""
255 self.to_screen(u'Confirming age')
257 def report_login(self):
258 """Report attempt to log in."""
259 self.to_screen(u'Logging in')
    # Methods for following issue #608 (returning url/playlist result dicts)
262 def url_result(self, url, ie=None, video_id=None):
263 """Returns a url that points to a page that should be processed"""
264 #TODO: ie should be the class used for getting the info
265 video_info = {'_type': 'url',
268 if video_id is not None:
269 video_info['id'] = video_id
271 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
272 """Returns a playlist"""
273 video_info = {'_type': 'playlist',
276 video_info['id'] = playlist_id
278 video_info['title'] = playlist_title
281 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
283 Perform a regex search on the given string, using a single or a list of
284 patterns returning the first matching group.
285 In case of failure return a default value or raise a WARNING or a
286 RegexNotFoundError, depending on fatal, specifying the field name.
288 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
289 mobj = re.search(pattern, string, flags)
292 mobj = re.search(p, string, flags)
295 if sys.stderr.isatty() and os.name != 'nt':
296 _name = u'\033[0;34m%s\033[0m' % name
301 # return the first matching group
302 return next(g for g in mobj.groups() if g is not None)
303 elif default is not None:
306 raise RegexNotFoundError(u'Unable to extract %s' % _name)
308 self._downloader.report_warning(u'unable to extract %s; '
309 u'please report this issue on http://yt-dl.org/bug' % _name)
312 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
314 Like _search_regex, but strips HTML tags and unescapes entities.
316 res = self._search_regex(pattern, string, name, default, fatal, flags)
318 return clean_html(res).strip()
322 def _get_login_info(self):
324 Get the the login info as (username, password)
325 It will look in the netrc file using the _NETRC_MACHINE value
326 If there's no info available, return (None, None)
328 if self._downloader is None:
333 downloader_params = self._downloader.params
335 # Attempt to use provided username and password or .netrc data
336 if downloader_params.get('username', None) is not None:
337 username = downloader_params['username']
338 password = downloader_params['password']
339 elif downloader_params.get('usenetrc', False):
341 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
346 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
347 except (IOError, netrc.NetrcParseError) as err:
348 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
350 return (username, password)
352 # Helper functions for extracting OpenGraph info
354 def _og_regexes(prop):
355 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
356 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
357 template = r'<meta[^>]+?%s[^>]+?%s'
359 template % (property_re, content_re),
360 template % (content_re, property_re),
363 def _og_search_property(self, prop, html, name=None, **kargs):
365 name = 'OpenGraph %s' % prop
366 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
369 return unescapeHTML(escaped)
371 def _og_search_thumbnail(self, html, **kargs):
372 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
374 def _og_search_description(self, html, **kargs):
375 return self._og_search_property('description', html, fatal=False, **kargs)
377 def _og_search_title(self, html, **kargs):
378 return self._og_search_property('title', html, **kargs)
380 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
381 regexes = self._og_regexes('video')
382 if secure: regexes = self._og_regexes('video:secure_url') + regexes
383 return self._html_search_regex(regexes, html, name, **kargs)
385 def _html_search_meta(self, name, html, display_name=None):
386 if display_name is None:
388 return self._html_search_regex(
390 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
391 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
392 html, display_name, fatal=False)
394 def _dc_search_uploader(self, html):
395 return self._html_search_meta('dc.creator', html, 'uploader')
397 def _rta_search(self, html):
398 # See http://www.rtalabel.org/index.php?content=howtofaq#single
399 if re.search(r'(?ix)<meta\s+name="rating"\s+'
400 r' content="RTA-5042-1996-1400-1577-RTA"',
405 def _media_rating_search(self, html):
406 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
407 rating = self._html_search_meta('rating', html)
419 return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
431 def _make_valid_url(cls):
432 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
435 def suitable(cls, url):
436 return re.match(cls._make_valid_url(), url) is not None
438 def _real_extract(self, query):
439 mobj = re.match(self._make_valid_url(), query)
441 raise ExtractorError(u'Invalid search query "%s"' % query)
443 prefix = mobj.group('prefix')
444 query = mobj.group('query')
446 return self._get_n_results(query, 1)
447 elif prefix == 'all':
448 return self._get_n_results(query, self._MAX_RESULTS)
452 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
453 elif n > self._MAX_RESULTS:
454 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
455 n = self._MAX_RESULTS
456 return self._get_n_results(query, n)
458 def _get_n_results(self, query, n):
459 """Get a specified number of results for a query"""
460 raise NotImplementedError("This method must be implemented by subclasses")
463 def SEARCH_KEY(self):
464 return self._SEARCH_KEY