7 import xml.etree.ElementTree
23 class InfoExtractor(object):
    # NOTE(review): this extract is mangled — every line starts with leftover
    # line-number residue ("23 ", "100 ", ...) and the gaps in that numbering
    # show that many original statements (decorators, if/try headers, returns)
    # are missing. The code text is left byte-identical below; comments only
    # flag what is visibly absent.
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    # NOTE(review): residue line 102 is missing from __init__ (presumably some
    # instance-state initialization) — confirm against the full file.
100 def __init__(self, downloader=None):
101 """Constructor. Receives an optional downloader."""
103 self.set_downloader(downloader)
    # NOTE(review): takes `cls` as first parameter, so a @classmethod
    # decorator (residue line 105) is missing from this extract.
106 def suitable(cls, url):
107 """Receives a URL and returns True if suitable for this IE."""
109 # This does not use has/getattr intentionally - we want to know whether
110 # we have cached the regexp for *this* class, whereas getattr would also
111 # match the superclass
112 if '_VALID_URL_RE' not in cls.__dict__:
113 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
114 return cls._VALID_URL_RE.match(url) is not None
    # NOTE(review): orphaned docstring — the enclosing def (residue line 117,
    # presumably a `working` accessor) and its return statement are missing.
118 """Getter method for _WORKING."""
    # NOTE(review): residue lines 123/125 missing around the call below —
    # presumably an already-initialized guard; verify before relying on this.
121 def initialize(self):
122 """Initializes an instance (authentication, etc)."""
124 self._real_initialize()
    # NOTE(review): residue line 129 is missing (presumably a call to
    # self.initialize() before extraction).
127 def extract(self, url):
128 """Extracts URL information and returns it in list of dicts."""
130 return self._real_extract(url)
132 def set_downloader(self, downloader):
133 """Sets the downloader for this IE."""
134 self._downloader = downloader
    # Template methods: subclasses override these two.
136 def _real_initialize(self):
137 """Real initialization process. Redefine in subclasses."""
140 def _real_extract(self, url):
141 """Real extraction process. Redefine in subclasses."""
    # NOTE(review): orphaned body — the enclosing classmethod def (residue
    # lines 144-145) is missing; the [:-2] strips a trailing "IE" suffix
    # from the class name.
146 """A string for getting the InfoExtractor with get_info_extractor"""
147 return cls.__name__[:-2]
    # NOTE(review): orphaned return — the enclosing property def (residue
    # lines 149-150, presumably IE_NAME) is missing.
151 return type(self).__name__[:-2]
    # NOTE(review): residue lines 155 (`if note is None:`), 159 (`try:`) and
    # 162 (`if errnote is None:`) are missing from this method's body.
153 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
154 """ Returns the response handle """
156 self.report_download_webpage(video_id)
157 elif note is not False:
158 self.to_screen(u'%s: %s' % (video_id, note))
160 return self._downloader.urlopen(url_or_request)
161 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
163 errnote = u'Unable to download webpage'
164 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
    # Fetches a page and decodes it. Charset comes from the Content-Type
    # header first, then from a <meta charset> probe of the first 1024 bytes.
    # NOTE(review): several guard lines are missing from this extract
    # (e.g. `if m:` after each regex, `try:` before get_full_url, the
    # except-fallback bodies, and a final-fallback encoding default).
166 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
167 """ Returns a tuple (page content as string, URL handle) """
169 # Strip hashes from the URL (#1038)
170 if isinstance(url_or_request, (compat_str, str)):
171 url_or_request = url_or_request.partition('#')[0]
173 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
174 content_type = urlh.headers.get('Content-Type', '')
175 webpage_bytes = urlh.read()
176 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
178 encoding = m.group(1)
180 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
181 webpage_bytes[:1024])
183 encoding = m.group(1).decode('ascii')
    # Debug aids: dump the page to screen and/or save it to disk on request.
186 if self._downloader.params.get('dump_intermediate_pages', False):
188 url = url_or_request.get_full_url()
189 except AttributeError:
191 self.to_screen(u'Dumping request to ' + url)
192 dump = base64.b64encode(webpage_bytes).decode('ascii')
193 self._downloader.to_screen(dump)
194 if self._downloader.params.get('write_pages', False):
196 url = url_or_request.get_full_url()
197 except AttributeError:
199 raw_filename = ('%s_%s.dump' % (video_id, url))
200 filename = sanitize_filename(raw_filename, restricted=True)
201 self.to_screen(u'Saving request to ' + filename)
202 with open(filename, 'wb') as outf:
203 outf.write(webpage_bytes)
    # 'replace' keeps extraction going even on undecodable bytes.
205 content = webpage_bytes.decode(encoding, 'replace')
206 return (content, urlh)
208 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
209 """ Returns the data of the page as a string """
210 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
    # NOTE(review): "downloand" is a typo in the user-facing errnote default;
    # it is a runtime string, so it is flagged here rather than changed.
212 def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to downloand XML'):
213 """Return the xml as an xml.etree.ElementTree.Element"""
214 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
215 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
217 def to_screen(self, msg):
218 """Print msg to screen, prefixing it with '[ie_name]'"""
219 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
221 def report_extraction(self, id_or_name):
222 """Report information extraction."""
223 self.to_screen(u'%s: Extracting information' % id_or_name)
225 def report_download_webpage(self, video_id):
226 """Report webpage download."""
227 self.to_screen(u'%s: Downloading webpage' % video_id)
229 def report_age_confirmation(self):
230 """Report attempt to confirm age."""
231 self.to_screen(u'Confirming age')
233 def report_login(self):
234 """Report attempt to log in."""
235 self.to_screen(u'Logging in')
237 #Methods for following #608
    # NOTE(review): the dict literal below is truncated (residue lines
    # 242-243 missing — presumably the 'url' and ie-key entries) and the
    # method's return statement (line 246) is missing.
238 def url_result(self, url, ie=None, video_id=None):
239 """Returns a url that points to a page that should be processed"""
240 #TODO: ie should be the class used for getting the info
241 video_info = {'_type': 'url',
244 if video_id is not None:
245 video_info['id'] = video_id
    # NOTE(review): truncated like url_result — dict continuation (residue
    # lines 250-251), the `if` guards before the two assignments, and the
    # return statement are all missing from this extract.
247 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
248 """Returns a playlist"""
249 video_info = {'_type': 'playlist',
252 video_info['id'] = playlist_id
254 video_info['title'] = playlist_title
    # NOTE(review): heavily gutted — the docstring delimiters, the else/for
    # branch over a pattern list, the `if mobj:` checks, the default-return
    # and the fatal/non-fatal branching lines are missing. The \033[0;34m
    # escape colors the field name blue on capable (non-Windows) terminals.
257 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
259 Perform a regex search on the given string, using a single or a list of
260 patterns returning the first matching group.
261 In case of failure return a default value or raise a WARNING or a
262 RegexNotFoundError, depending on fatal, specifying the field name.
264 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
265 mobj = re.search(pattern, string, flags)
268 mobj = re.search(p, string, flags)
271 if sys.stderr.isatty() and os.name != 'nt':
272 _name = u'\033[0;34m%s\033[0m' % name
277 # return the first matching group
278 return next(g for g in mobj.groups() if g is not None)
279 elif default is not None:
282 raise RegexNotFoundError(u'Unable to extract %s' % _name)
284 self._downloader.report_warning(u'unable to extract %s; '
285 u'please report this issue on http://yt-dl.org/bug' % _name)
    # NOTE(review): docstring delimiters and the `if res:`/else fallthrough
    # (for a None result) are missing from this extract.
288 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
290 Like _search_regex, but strips HTML tags and unescapes entities.
292 res = self._search_regex(pattern, string, name, default, fatal, flags)
294 return clean_html(res).strip()
    # NOTE(review): missing here: docstring delimiters, the (None, None)
    # early return when there is no downloader, the username/password
    # presets, the `try:` before the netrc call, and the branch that
    # unpacks `info` into username/password when authenticators are found.
298 def _get_login_info(self):
300 Get the login info as (username, password)
301 It will look in the netrc file using the _NETRC_MACHINE value
302 If there's no info available, return (None, None)
304 if self._downloader is None:
309 downloader_params = self._downloader.params
311 # Attempt to use provided username and password or .netrc data
312 if downloader_params.get('username', None) is not None:
313 username = downloader_params['username']
314 password = downloader_params['password']
315 elif downloader_params.get('usenetrc', False):
317 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
322 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
323 except (IOError, netrc.NetrcParseError) as err:
324 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
326 return (username, password)
328 # Helper functions for extracting OpenGraph info
    # NOTE(review): no `self`/`cls` parameter, so a @staticmethod decorator
    # (residue line 329) is missing; the `return [` / `]` wrapping the two
    # template lines is missing too. Both property=...content=... orderings
    # of the og:<prop> meta tag are covered.
330 def _og_regexes(prop):
331 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
332 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
333 template = r'<meta[^>]+?%s[^>]+?%s'
335 template % (property_re, content_re),
336 template % (content_re, property_re),
    # NOTE(review): the `if name is None:` guard before the default-name
    # assignment and the None-early-return around unescapeHTML are missing.
339 def _og_search_property(self, prop, html, name=None, **kargs):
341 name = 'OpenGraph %s' % prop
342 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
345 return unescapeHTML(escaped)
347 def _og_search_thumbnail(self, html, **kargs):
348 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
350 def _og_search_description(self, html, **kargs):
351 return self._og_search_property('description', html, fatal=False, **kargs)
353 def _og_search_title(self, html, **kargs):
354 return self._og_search_property('title', html, **kargs)
    # Prefers og:video:secure_url over og:video when secure is requested.
356 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
357 regexes = self._og_regexes('video')
358 if secure: regexes = self._og_regexes('video:secure_url') + regexes
359 return self._html_search_regex(regexes, html, name, **kargs)
    # NOTE(review): the body of the `if display_name is None:` branch
    # (residue line 363, presumably `display_name = name`) is missing.
361 def _html_search_meta(self, name, html, display_name=None):
362 if display_name is None:
364 return self._html_search_regex(
365 r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
366 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
367 html, display_name, fatal=False)
369 def _dc_search_uploader(self, html):
370 return self._html_search_meta('dc.creator', html, 'uploader')
    # NOTE(review): the regex's closing argument and both return statements
    # (residue lines 376-378) are missing from this extract.
372 def _rta_search(self, html):
373 # See http://www.rtalabel.org/index.php?content=howtofaq#single
374 if re.search(r'(?ix)<meta\s+name="rating"\s+'
375 r' content="RTA-5042-1996-1400-1577-RTA"',
    # NOTE(review): the RATING_TABLE literal (residue lines 383-393) is
    # missing, so the mapping used by the return below is not visible here.
380 def _media_rating_search(self, html):
381 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
382 rating = self._html_search_meta('rating', html)
394 return RATING_TABLE.get(rating.lower(), None)
398 class SearchInfoExtractor(InfoExtractor):
    # NOTE(review): same mangling as the rest of this extract — leftover
    # line-number residue on each line, and gaps in that residue showing
    # missing decorators, docstring delimiters and `if` headers.
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
    # NOTE(review): takes `cls` — a @classmethod decorator (residue line 405)
    # is missing. Prefix group matches "", "all", or a positive integer.
406 def _make_valid_url(cls):
407 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
    # NOTE(review): @classmethod decorator (residue line 409) missing here too.
410 def suitable(cls, url):
411 return re.match(cls._make_valid_url(), url) is not None
    # Dispatches on the prefix: "" -> 1 result, "all" -> _MAX_RESULTS,
    # otherwise the given count (capped at _MAX_RESULTS with a warning).
    # NOTE(review): missing from this extract: the `if mobj is None:` guard
    # before the raise, the `if prefix == '':` header, the `else:` branch
    # with `n = int(prefix)`, and the `if n <= 0:` check before the
    # invalid-download-number raise.
413 def _real_extract(self, query):
414 mobj = re.match(self._make_valid_url(), query)
416 raise ExtractorError(u'Invalid search query "%s"' % query)
418 prefix = mobj.group('prefix')
419 query = mobj.group('query')
421 return self._get_n_results(query, 1)
422 elif prefix == 'all':
423 return self._get_n_results(query, self._MAX_RESULTS)
427 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
428 elif n > self._MAX_RESULTS:
429 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
430 n = self._MAX_RESULTS
431 return self._get_n_results(query, n)
    # Template method: subclasses implement the actual paged fetching.
433 def _get_n_results(self, query, n):
434 """Get a specified number of results for a query"""
435 raise NotImplementedError("This method must be implemented by subclasses")
    # NOTE(review): a @property decorator (residue line 437) appears to be
    # missing above this accessor.
438 def SEARCH_KEY(self):
439 return self._SEARCH_KEY