7 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): these class-level defaults were dropped by the paste
    # mangling; reconstructed from the methods below, which read all three.
    _ready = False        # becomes True after _real_initialize() has run
    _downloader = None    # the FileDownloader/YoutubeDL instance in use
    _WORKING = True       # set to False in broken subclasses

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the (possibly expensive) real initialization once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Class names follow the "<Name>IE" convention; drop the suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Human-readable extractor name used as the log-message prefix."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None -> default message; note=False -> silent; else custom note.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset from the Content-Type header, then a <meta>
        # charset in the first KiB of the body, then fall back to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                # NOTE(review): fallback branch reconstructed — confirm utf-8
                # is the intended default encoding.
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML',
                      # Fixed typo in the default error message ("downloand").
                      errnote=u'Unable to download XML'):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list of patterns: stop at the first one that matches.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name on capable terminals (not Windows).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        # Both attribute orders occur in the wild: property-then-content
        # and content-then-property.
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            # Prefer the https variant when available.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
                [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        # NOTE(review): table values reconstructed from upstream youtube-dl —
        # confirm against the project's copy.
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key>all:<query>" or "<key><n>:<query>"
        # where <n> is a positive integer without leading zeros.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No count prefix: download the first result only.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum and warn the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY