7 import xml.etree.ElementTree
# Unique sentinel used as the default value of the ``default`` parameter of
# _search_regex() / _html_search_regex(). Callers are compared against it by
# identity ("default is not _NO_DEFAULT"), so "no default supplied" can be told
# apart from any real value a caller passes.
21 _NO_DEFAULT = object()
24 class InfoExtractor(object):
25 """Information Extractor class.
27 Information extractors are the classes that, given a URL, extract
28 information about the video (or videos) the URL refers to. This
29 information includes the real video URL, the video title, author and
30 others. The information is stored in a dictionary which is then
31 passed to the FileDownloader. The FileDownloader processes this
32 information possibly downloading the video to the file system, among
33 other possible outcomes.
35 The dictionaries must include the following fields:
38 title: Video title, unescaped.
40 Additionally, it must contain either a formats entry or a url one:
42 formats: A list of dictionaries for each format available, ordered
43 from worst to best quality.
46 * url Mandatory. The URL of the video file
47 * ext Will be calculated from url if missing
48 * format A human-readable description of the format
49 ("mp4 container with h264/opus").
50 Calculated from the format_id, width, height,
51 and format_note fields if missing.
52 * format_id A short description of the format
53 ("mp4_h264_opus" or "19")
54 * format_note Additional info about the format
55 ("3D" or "DASH video")
56 * width Width of the video, if known
57 * height Height of the video, if known
58 * resolution Textual description of width and height
59 * abr Average audio bitrate in KBit/s
60 * acodec Name of the audio codec in use
61 * vbr Average video bitrate in KBit/s
62 * vcodec Name of the video codec in use
63 * filesize The number of bytes, if known in advance
64 * player_url SWF Player URL (used for rtmpdump).
65 * preference Order number of this format. If this field is
66 present, the formats get sorted by this field.
67 -1 for default (order by other properties),
68 -2 or smaller for less than default.
70 ext: Video filename extension.
71 format: The video format, defaults to ext (used for --get-format)
72 player_url: SWF Player URL (used for rtmpdump).
74 The following fields are optional:
76 thumbnails: A list of dictionaries (with the entries "resolution" and
77 "url") for the varying thumbnails
78 thumbnail: Full URL to a video thumbnail image.
79 description: One-line video description.
80 uploader: Full name of the video uploader.
81 upload_date: Video upload date (YYYYMMDD).
82 uploader_id: Nickname or id of the video uploader.
83 location: Physical location of the video.
84 subtitles: The subtitle file contents as a dictionary in the format
85 {language: subtitles}.
86 duration: Length of the video in seconds, as an integer.
87 view_count: How many users have watched the video on the platform.
88 like_count: Number of positive ratings of the video
89 dislike_count: Number of negative ratings of the video
90 comment_count: Number of comments on the video
91 age_limit: Age restriction for the video, as an integer (years)
92 webpage_url: The URL of the video webpage; if given to youtube-dl, it
93 should allow getting the same result again. (It will be set
94 by YoutubeDL if it's missing.)
96 Unless mentioned otherwise, the fields should be Unicode strings.
98 Subclasses of this one should re-define the _real_initialize() and
99 _real_extract() methods and define a _VALID_URL regexp.
100 Probably, they should also be added to the list of extractors.
102 _real_extract() must return a *list* of information dictionaries as described above.
105 Finally, the _WORKING attribute should be set to False for broken IEs
106 in order to warn the users and skip the tests.
# Constructor: passes the (optional) downloader on to set_downloader().
# NOTE(review): the embedded numbering jumps from 114 to 116, so a statement
# may be missing from this extract between the docstring and the call below;
# confirm against the full file before editing.
113 def __init__(self, downloader=None):
114 """Constructor. Receives an optional downloader."""
116 self.set_downloader(downloader)
def suitable(cls, url):
    """Tell whether ``url`` is matched by this extractor's ``_VALID_URL``.

    The compiled pattern is cached on the class as ``_VALID_URL_RE`` the
    first time it is needed.
    """
    # Inspect cls.__dict__ directly instead of hasattr/getattr: only a
    # pattern cached on *this* class counts. An attribute lookup would also
    # find the regexp cached by a superclass, which is the wrong pattern.
    own_attributes = cls.__dict__
    if '_VALID_URL_RE' in own_attributes:
        compiled = cls._VALID_URL_RE
    else:
        compiled = re.compile(cls._VALID_URL)
        cls._VALID_URL_RE = compiled
    matched = compiled.match(url)
    return matched is not None
131 """Getter method for _WORKING."""
# Runs the subclass hook _real_initialize().
# NOTE(review): embedded lines 136 and 138 are absent from this extract;
# presumably a guard around the call (so it only runs once) lives there.
# Confirm against the full file before relying on that.
134 def initialize(self):
135 """Initializes an instance (authentication, etc)."""
137 self._real_initialize()
# Public entry point: delegates the actual work to the subclass hook
# _real_extract(url) and returns whatever it returns.
# NOTE(review): embedded line 142 is absent from this extract; a statement
# preceding the return may be missing. Confirm against the full file.
140 def extract(self, url):
141 """Extracts URL information and returns it in list of dicts."""
143 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach ``downloader`` to this extractor as ``self._downloader``."""
    setattr(self, '_downloader', downloader)
149 def _real_initialize(self):
150 """Real initialization process. Redefine in subclasses."""
153 def _real_extract(self, url):
154 """Real extraction process. Redefine in subclasses."""
159 """A string for getting the InfoExtractor with get_info_extractor"""
160 return cls.__name__[:-2]
164 return type(self).__name__[:-2]
# Open `url_or_request` through self._downloader.urlopen() and return the
# response handle, after printing a progress note.
#
# NOTE(review): this extract omits lines of the method (the embedded numbering
# skips 168, 171, 173, 175, 178-180, 183, 185 and 187). The headers of the
# if/else/try branches below are therefore not visible, and the comments on
# those branches are hedged accordingly.
166 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
167 """ Returns the response handle """
# Standard "Downloading webpage" message (the guarding condition is not in
# view; presumably used when no custom note was given).
169 self.report_download_webpage(video_id)
170 elif note is not False:
# Custom note, printed either bare or prefixed with the video id; presumably
# selected on whether video_id is set (confirm).
172 self.to_screen(u'%s' % (note,))
174 self.to_screen(u'%s: %s' % (video_id, note))
176 return self._downloader.urlopen(url_or_request)
177 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Fallback error note, then the final message that includes the error text.
181 errnote = u'Unable to download webpage'
182 errmsg = u'%s: %s' % (errnote, compat_str(err))
# Re-raise as ExtractorError, handing over the current traceback and the
# original error as cause (presumably only when `fatal` is true; confirm).
184 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
# Alternative path: only report a warning through the downloader.
186 self._downloader.report_warning(errmsg)
# Download a page and return the tuple (decoded text, response handle).
#
# Steps visible in this extract:
#   1. for string URLs, drop everything from the first '#' onwards;
#   2. fetch through _request_webpage();
#   3. choose a text encoding from the Content-Type header, or from a
#      <meta ... charset=...> found in the first 1024 bytes of the body;
#   4. optionally dump / save the raw bytes, driven by the downloader params
#      'dump_intermediate_pages' and 'write_pages';
#   5. decode the bytes with errors='replace'.
#
# NOTE(review): lines are missing from this extract (the embedded numbering
# skips 197-199, 203, 205, 208, 210-211, 213, 216, 221 and 224). The branch
# headers, the handling of a failed request and the fallback encoding are not
# visible here.
189 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
190 """ Returns a tuple (page content as string, URL handle) """
192 # Strip hashes from the URL (#1038)
193 if isinstance(url_or_request, (compat_str, str)):
194 url_or_request = url_or_request.partition('#')[0]
196 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
200 content_type = urlh.headers.get('Content-Type', '')
201 webpage_bytes = urlh.read()
# Charset announced in the HTTP header, e.g. "text/html; charset=utf-8".
202 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
204 encoding = m.group(1)
# Charset declared inside the document itself (presumably the fallback when
# the header carries none; the branch header is not in view).
206 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
207 webpage_bytes[:1024])
209 encoding = m.group(1).decode('ascii')
# Debug aid: print the raw response to the screen, base64-encoded.
212 if self._downloader.params.get('dump_intermediate_pages', False):
214 url = url_or_request.get_full_url()
215 except AttributeError:
217 self.to_screen(u'Dumping request to ' + url)
218 dump = base64.b64encode(webpage_bytes).decode('ascii')
219 self._downloader.to_screen(dump)
# Debug aid: write the raw response to a file named from the video id and URL
# (sanitised in restricted mode).
220 if self._downloader.params.get('write_pages', False):
222 url = url_or_request.get_full_url()
223 except AttributeError:
225 raw_filename = ('%s_%s.dump' % (video_id, url))
226 filename = sanitize_filename(raw_filename, restricted=True)
227 self.to_screen(u'Saving request to ' + filename)
228 with open(filename, 'wb') as outf:
229 outf.write(webpage_bytes)
# Undecodable bytes are replaced instead of raising.
231 content = webpage_bytes.decode(encoding, 'replace')
232 return (content, urlh)
# Convenience wrapper around _download_webpage_handle(), called with the same
# arguments.
# NOTE(review): the lines after the call (embedded numbers 237-241) are missing
# from this extract, so how `res` is turned into the returned page string is
# not visible here.
234 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
235 """ Returns the data of the page as a string """
236 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
# Download a document with _download_webpage() and parse it with
# xml.etree.ElementTree.fromstring(); the downloaded text is re-encoded as
# UTF-8 bytes before parsing. `transform_source` is called on the downloaded
# string and its result is what gets parsed.
# NOTE(review): embedded line 248 is missing from this extract; presumably it
# checks that transform_source was supplied before calling it (confirm).
243 def _download_xml(self, url_or_request, video_id,
244 note=u'Downloading XML', errnote=u'Unable to download XML',
245 transform_source=None):
246 """Return the xml as an xml.etree.ElementTree.Element"""
247 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
249 xml_string = transform_source(xml_string)
250 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def report_warning(self, msg, video_id=None):
    """Send a warning to the downloader, tagged with this extractor's name.

    The text is ``[IE_NAME] msg``, or ``[IE_NAME] video_id: msg`` when a
    video id is given.
    """
    warn = self._downloader.report_warning
    if video_id is None:
        idstr = u''
    else:
        idstr = u'%s: ' % video_id
    warn(u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Show ``msg`` through the downloader, prefixed with ``[IE_NAME]``."""
    emit = self._downloader.to_screen
    emit(u'[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
    """Announce on screen that information extraction has started."""
    show = self.to_screen
    show(u'%s: Extracting information' % id_or_name)
def report_download_webpage(self, video_id):
    """Announce on screen that the webpage is being downloaded."""
    show = self.to_screen
    show(u'%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
    """Announce on screen that age confirmation is being attempted."""
    show = self.to_screen
    show(u'Confirming age')
def report_login(self):
    """Announce on screen that a login is being attempted."""
    show = self.to_screen
    show(u'Logging in')
277 #Methods for following #608
# Build a result dictionary of '_type': 'url' that points at another page to
# be processed; 'id' is added only when video_id is given.
# Note that no `self` parameter is declared here.
# NOTE(review): embedded lines 283-284 (the rest of the dict literal) and 287
# (presumably the return statement) are missing from this extract.
279 def url_result(url, ie=None, video_id=None):
280 """Returns a url that points to a page that should be processed"""
281 #TODO: ie should be the class used for getting the info
282 video_info = {'_type': 'url',
285 if video_id is not None:
286 video_info['id'] = video_id
# Build a result dictionary of '_type': 'playlist'; 'id' and 'title' are taken
# from playlist_id and playlist_title.
# Note that no `self` parameter is declared here.
# NOTE(review): embedded lines 292-293, 295 and 297 are missing from this
# extract: the rest of the dict literal, the conditions guarding the two
# assignments below (presumably "is set" checks) and the return statement.
289 def playlist_result(entries, playlist_id=None, playlist_title=None):
290 """Returns a playlist"""
291 video_info = {'_type': 'playlist',
294 video_info['id'] = playlist_id
296 video_info['title'] = playlist_title
# Search `string` with one pattern (plain string or compiled regex) or with
# several patterns, and return the first non-None group of the match.
# `name` is only used in the error / warning text.
#
# NOTE(review): lines are missing from this extract (the embedded numbering
# skips 300, 305, 308-309, 311-312, 315-318, 322-323, 325 and 328), including
# the docstring delimiters, the loop over multiple patterns and the headers of
# most branches. On failure the visible code can raise RegexNotFoundError or
# report a warning; the exact selection (presumably on `fatal`) and the value
# returned for an explicit `default` are not in view.
299 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
301 Perform a regex search on the given string, using a single or a list of
302 patterns returning the first matching group.
303 In case of failure return a default value or raise a WARNING or a
304 RegexNotFoundError, depending on fatal, specifying the field name.
306 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
307 mobj = re.search(pattern, string, flags)
310 mobj = re.search(p, string, flags)
# Colour the field name with ANSI escape codes when stderr is a terminal and
# os.name is not 'nt'.
313 if os.name != 'nt' and sys.stderr.isatty():
314 _name = u'\033[0;34m%s\033[0m' % name
319 # return the first matching group
320 return next(g for g in mobj.groups() if g is not None)
# Identity check against the sentinel: any explicitly passed default counts.
321 elif default is not _NO_DEFAULT:
324 raise RegexNotFoundError(u'Unable to extract %s' % _name)
326 self._downloader.report_warning(u'unable to extract %s; '
327 u'please report this issue on http://yt-dl.org/bug' % _name)
# Same lookup as _search_regex() (all arguments are passed through), but a
# result is run through clean_html() and stripped of surrounding whitespace.
# NOTE(review): embedded lines 331, 333, 335 and 337-338 are missing from this
# extract: the docstring delimiters, the condition guarding the clean-up
# (presumably "a result was found") and the alternative return.
330 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
332 Like _search_regex, but strips HTML tags and unescapes entities.
334 res = self._search_regex(pattern, string, name, default, fatal, flags)
336 return clean_html(res).strip()
# Return the login credentials as a (username, password) tuple.
#
# Sources visible in this extract, in order:
#   * downloader params 'username' / 'password' when 'username' is set
#     (note 'password' is read with [] and so must be present as a key);
#   * otherwise, when param 'usenetrc' is true, the ~/.netrc entry for
#     self._NETRC_MACHINE, read through netrc.netrc().authenticators().
# IOError and netrc.NetrcParseError are caught and reported as a warning
# instead of propagating.
#
# NOTE(review): lines are missing from this extract (the embedded numbering
# skips 341, 345, 347-350, 352, 358, 360-363 and 367): the docstring
# delimiters, what happens when there is no downloader, the initial values of
# username/password and the handling of the netrc result are not visible.
340 def _get_login_info(self):
342 Get the the login info as (username, password)
343 It will look in the netrc file using the _NETRC_MACHINE value
344 If there's no info available, return (None, None)
346 if self._downloader is None:
351 downloader_params = self._downloader.params
353 # Attempt to use provided username and password or .netrc data
354 if downloader_params.get('username', None) is not None:
355 username = downloader_params['username']
356 password = downloader_params['password']
357 elif downloader_params.get('usenetrc', False):
359 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
364 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
365 except (IOError, netrc.NetrcParseError) as err:
366 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
368 return (username, password)
370 # Helper functions for extracting OpenGraph info
# Build the regexes that match an OpenGraph <meta> tag for `prop`, in both
# attribute orders: property=... before content=..., and content=... before
# property=... . The content value may be double- or single-quoted (one
# capture group for each), and `prop` is regex-escaped before insertion.
# Note that no `self` parameter is declared here.
# NOTE(review): embedded lines 376 and 379 are missing from this extract;
# presumably they open and close the returned list (confirm).
372 def _og_regexes(prop):
373 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
374 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
375 template = r'<meta[^>]+?%s[^>]+?%s'
377 template % (property_re, content_re),
378 template % (content_re, property_re),
# Look up the OpenGraph property `prop` in `html` via _search_regex() (with
# re.DOTALL) and return the value with HTML entities unescaped. Extra keyword
# arguments are forwarded to _search_regex(). The display name is
# 'OpenGraph <prop>' (presumably only when `name` was not given).
# NOTE(review): embedded lines 382 and 385-386 are missing from this extract:
# the condition guarding the default name and, presumably, the handling of a
# failed search before unescaping.
381 def _og_search_property(self, prop, html, name=None, **kargs):
383 name = 'OpenGraph %s' % prop
384 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
387 return unescapeHTML(escaped)
389 def _og_search_thumbnail(self, html, **kargs):
390 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
392 def _og_search_description(self, html, **kargs):
393 return self._og_search_property('description', html, fatal=False, **kargs)
395 def _og_search_title(self, html, **kargs):
396 return self._og_search_property('title', html, **kargs)
398 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
399 regexes = self._og_regexes('video')
400 if secure: regexes = self._og_regexes('video:secure_url') + regexes
401 return self._html_search_regex(regexes, html, name, **kargs)
# Return the content="..." value of a <meta> tag selected by `name`, via
# _html_search_regex() with fatal=False. The visible pattern uses a lookahead
# that accepts `name` (regex-escaped) in an itemprop=, name= or property=
# attribute, with either quote style.
# NOTE(review): embedded lines 405 and 407 are missing from this extract: the
# default assigned to display_name and the opening of the raw string that
# holds the pattern (so the pattern's flags and leading part are not in view).
403 def _html_search_meta(self, name, html, display_name=None):
404 if display_name is None:
406 return self._html_search_regex(
408 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
409 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
410 html, display_name, fatal=False)
412 def _dc_search_uploader(self, html):
413 return self._html_search_meta('dc.creator', html, 'uploader')
# Check `html` for the RTA label, i.e. a <meta name="rating"> tag whose
# content is "RTA-5042-1996-1400-1577-RTA" (matched case-insensitively, in
# verbose-regex mode).
# NOTE(review): embedded lines 419-421 are missing from this extract: the end
# of the re.search() call and the values returned for a match / no match.
415 def _rta_search(self, html):
416 # See http://www.rtalabel.org/index.php?content=howtofaq#single
417 if re.search(r'(?ix)<meta\s+name="rating"\s+'
418 r' content="RTA-5042-1996-1400-1577-RTA"',
# Read the 'rating' <meta> value of `html` and translate it through
# RATING_TABLE, looked up by the lower-cased rating; unknown ratings give None.
# NOTE(review): embedded lines 426-436 are missing from this extract: the
# definition of RATING_TABLE and, presumably, the handling of a page without
# a rating (rating.lower() would fail on None). Confirm against the full file.
423 def _media_rating_search(self, html):
424 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
425 rating = self._html_search_meta('rating', html)
437 return RATING_TABLE.get(rating.lower(), None)
441 class SearchInfoExtractor(InfoExtractor):
443 Base class for paged search query extractors.
444 They accept URLs in the format _SEARCH_KEY(|all|N):{query}, where N is a positive integer.
445 Instances should define _SEARCH_KEY and _MAX_RESULTS.
449 def _make_valid_url(cls):
450 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Tell whether ``url`` is a search query this class understands."""
    pattern = cls._make_valid_url()
    found = re.match(pattern, url)
    return found is not None
# Parse a search query of the form built by _make_valid_url() and return the
# results from _get_n_results().
#
# Visible behaviour: a single result in one branch, self._MAX_RESULTS results
# for the 'all' prefix, and otherwise a count `n` that is capped at
# _MAX_RESULTS with a warning. ExtractorError is raised for an invalid query
# and for an invalid download number.
#
# NOTE(review): lines are missing from this extract (the embedded numbering
# skips 458, 460, 463 and 467-469): the check on `mobj`, the condition
# selecting the single-result branch (presumably an empty prefix), and the
# conversion / validation that defines `n`.
456 def _real_extract(self, query):
457 mobj = re.match(self._make_valid_url(), query)
459 raise ExtractorError(u'Invalid search query "%s"' % query)
461 prefix = mobj.group('prefix')
462 query = mobj.group('query')
464 return self._get_n_results(query, 1)
465 elif prefix == 'all':
466 return self._get_n_results(query, self._MAX_RESULTS)
470 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
471 elif n > self._MAX_RESULTS:
# Clamp the request to what the service can return, and tell the user.
472 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
473 n = self._MAX_RESULTS
474 return self._get_n_results(query, n)
476 def _get_n_results(self, query, n):
477 """Get a specified number of results for a query"""
478 raise NotImplementedError("This method must be implemented by subclasses")
def SEARCH_KEY(self):
    """Return the search prefix stored in ``_SEARCH_KEY``."""
    key = self._SEARCH_KEY
    return key