import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

# NOTE(review): reconstructed from a partial capture — the visible fragment
# showed only `xml.etree.ElementTree` and a stray `compat_urllib_parse_urlparse,`
# line.  The names below are exactly those the class bodies in this file use;
# confirm against upstream youtube_dl/extractor/common.py.
from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)

# Sentinel distinguishing "no default supplied" from an explicit default of
# None in _search_regex()/_html_search_regex().
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present, the formats get sorted by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # set True once _real_initialize() has run
    _downloader = None      # the owning FileDownloader/YoutubeDL instance
    _WORKING = True         # set False in subclasses for known-broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the (possibly expensive) real initialization only once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # By convention extractor class names end in "IE"; strip that suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Human-readable name used to prefix screen output.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                # Best-effort mode: warn and signal failure with False.
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header ...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ... then a <meta charset=...> tag near the top of the document ...
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                # ... and fall back to UTF-8.
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list of patterns: take the first one that matches.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue on capable terminals.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        # property and content attributes may appear in either order
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            # Prefer the https variant when available.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        # NOTE(review): table reconstructed from a partial capture — confirm
        # the exact entries against upstream.
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _sort_formats(self, formats):
        # Sort in place from worst to best quality, using an explicit
        # preference when present and heuristics otherwise.
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain HTTP(S) downloads are preferred over rtmp/rtsp/etc.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (first result), a positive integer, or "all".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the maximum the service supports, with a warning.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY