9 import xml.etree.ElementTree
14 compat_urllib_parse_urlparse,
24 _NO_DEFAULT = object()
27 class InfoExtractor(object):
28 """Information Extractor class.
30 Information extractors are the classes that, given a URL, extract
31 information about the video (or videos) the URL refers to. This
32 information includes the real video URL, the video title, author and
33 others. The information is stored in a dictionary which is then
34 passed to the FileDownloader. The FileDownloader processes this
35 information possibly downloading the video to the file system, among
36 other possible outcomes.
38 The dictionaries must include the following fields:
41 title: Video title, unescaped.
43 Additionally, it must contain either a formats entry or a url one:
45 formats: A list of dictionaries for each format available, ordered
46 from worst to best quality.
49 * url Mandatory. The URL of the video file
50 * ext Will be calculated from url if missing
51 * format A human-readable description of the format
52 ("mp4 container with h264/opus").
53 Calculated from the format_id, width, height,
54 and format_note fields if missing.
55 * format_id A short description of the format
56 ("mp4_h264_opus" or "19").
57 Technically optional, but strongly recommended.
58 * format_note Additional info about the format
59 ("3D" or "DASH video")
60 * width Width of the video, if known
61 * height Height of the video, if known
62 * resolution Textual description of width and height
63 * tbr Average bitrate of audio and video in KBit/s
64 * abr Average audio bitrate in KBit/s
65 * acodec Name of the audio codec in use
66 * vbr Average video bitrate in KBit/s
67 * vcodec Name of the video codec in use
68 * filesize The number of bytes, if known in advance
69 * player_url SWF Player URL (used for rtmpdump).
70 * protocol The protocol that will be used for the actual
72 "http", "https", "rtsp", "rtmp" or so.
73 * preference Order number of this format. If this field is
74 present and not None, the formats get sorted
76 -1 for default (order by other properties),
77 -2 or smaller for less than default.
78 * quality Order number of the video quality of this
79 format, irrespective of the file format.
80 -1 for default (order by other properties),
81 -2 or smaller for less than default.
83 ext: Video filename extension.
84 format: The video format, defaults to ext (used for --get-format)
85 player_url: SWF Player URL (used for rtmpdump).
87 The following fields are optional:
89 thumbnails: A list of dictionaries (with the entries "resolution" and
90 "url") for the varying thumbnails
91 thumbnail: Full URL to a video thumbnail image.
92 description: One-line video description.
93 uploader: Full name of the video uploader.
94 upload_date: Video upload date (YYYYMMDD).
95 uploader_id: Nickname or id of the video uploader.
96 location: Physical location of the video.
97 subtitles: The subtitle file contents as a dictionary in the format
98 {language: subtitles}.
99 duration: Length of the video in seconds, as an integer.
100 view_count: How many users have watched the video on the platform.
101 like_count: Number of positive ratings of the video
102 dislike_count: Number of negative ratings of the video
103 comment_count: Number of comments on the video
104 age_limit: Age restriction for the video, as an integer (years)
105 webpage_url: The url to the video webpage, if given to youtube-dl it
106 should allow getting the same result again. (It will be set
107 by YoutubeDL if it's missing)
109 Unless mentioned otherwise, the fields should be Unicode strings.
111 Subclasses of this one should re-define the _real_initialize() and
112 _real_extract() methods and define a _VALID_URL regexp.
113 Probably, they should also be added to the list of extractors.
115 _real_extract() must return a *list* of information dictionaries as
118 Finally, the _WORKING attribute should be set to False for broken IEs
119 in order to warn the users and skip the tests.
126 def __init__(self, downloader=None):
127 """Constructor. Receives an optional downloader."""
129 self.set_downloader(downloader)
132 def suitable(cls, url):
133 """Receives a URL and returns True if suitable for this IE."""
135 # This does not use has/getattr intentionally - we want to know whether
136 # we have cached the regexp for *this* class, whereas getattr would also
137 # match the superclass
138 if '_VALID_URL_RE' not in cls.__dict__:
139 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
140 return cls._VALID_URL_RE.match(url) is not None
144 """Getter method for _WORKING."""
147 def initialize(self):
148 """Initializes an instance (authentication, etc)."""
150 self._real_initialize()
153 def extract(self, url):
154 """Extracts URL information and returns it in list of dicts."""
156 return self._real_extract(url)
158 def set_downloader(self, downloader):
159 """Sets the downloader for this IE."""
160 self._downloader = downloader
162 def _real_initialize(self):
163 """Real initialization process. Redefine in subclasses."""
166 def _real_extract(self, url):
167 """Real extraction process. Redefine in subclasses."""
172 """A string for getting the InfoExtractor with get_info_extractor"""
173 return cls.__name__[:-2]
177 return type(self).__name__[:-2]
179 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
180 """ Returns the response handle """
182 self.report_download_webpage(video_id)
183 elif note is not False:
185 self.to_screen(u'%s' % (note,))
187 self.to_screen(u'%s: %s' % (video_id, note))
189 return self._downloader.urlopen(url_or_request)
190 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
194 errnote = u'Unable to download webpage'
195 errmsg = u'%s: %s' % (errnote, compat_str(err))
197 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
199 self._downloader.report_warning(errmsg)
202 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
203 """ Returns a tuple (page content as string, URL handle) """
205 # Strip hashes from the URL (#1038)
206 if isinstance(url_or_request, (compat_str, str)):
207 url_or_request = url_or_request.partition('#')[0]
209 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
213 content_type = urlh.headers.get('Content-Type', '')
214 webpage_bytes = urlh.read()
215 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
217 encoding = m.group(1)
219 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
220 webpage_bytes[:1024])
222 encoding = m.group(1).decode('ascii')
225 if self._downloader.params.get('dump_intermediate_pages', False):
227 url = url_or_request.get_full_url()
228 except AttributeError:
230 self.to_screen(u'Dumping request to ' + url)
231 dump = base64.b64encode(webpage_bytes).decode('ascii')
232 self._downloader.to_screen(dump)
233 if self._downloader.params.get('write_pages', False):
235 url = url_or_request.get_full_url()
236 except AttributeError:
239 h = hashlib.md5(url).hexdigest()
240 url = url[:200 - len(h)] + h
241 raw_filename = ('%s_%s.dump' % (video_id, url))
242 filename = sanitize_filename(raw_filename, restricted=True)
243 self.to_screen(u'Saving request to ' + filename)
244 with open(filename, 'wb') as outf:
245 outf.write(webpage_bytes)
247 content = webpage_bytes.decode(encoding, 'replace')
248 return (content, urlh)
250 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
251 """ Returns the data of the page as a string """
252 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
259 def _download_xml(self, url_or_request, video_id,
260 note=u'Downloading XML', errnote=u'Unable to download XML',
261 transform_source=None):
262 """Return the xml as an xml.etree.ElementTree.Element"""
263 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
265 xml_string = transform_source(xml_string)
266 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
268 def _download_json(self, url_or_request, video_id,
269 note=u'Downloading JSON metadata',
270 errnote=u'Unable to download JSON metadata'):
271 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
273 return json.loads(json_string)
274 except ValueError as ve:
275 raise ExtractorError('Failed to download JSON', cause=ve)
277 def report_warning(self, msg, video_id=None):
278 idstr = u'' if video_id is None else u'%s: ' % video_id
279 self._downloader.report_warning(
280 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
282 def to_screen(self, msg):
283 """Print msg to screen, prefixing it with '[ie_name]'"""
284 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
286 def report_extraction(self, id_or_name):
287 """Report information extraction."""
288 self.to_screen(u'%s: Extracting information' % id_or_name)
290 def report_download_webpage(self, video_id):
291 """Report webpage download."""
292 self.to_screen(u'%s: Downloading webpage' % video_id)
294 def report_age_confirmation(self):
295 """Report attempt to confirm age."""
296 self.to_screen(u'Confirming age')
298 def report_login(self):
299 """Report attempt to log in."""
300 self.to_screen(u'Logging in')
302 #Methods for following #608
304 def url_result(url, ie=None, video_id=None):
305 """Returns a url that points to a page that should be processed"""
306 #TODO: ie should be the class used for getting the info
307 video_info = {'_type': 'url',
310 if video_id is not None:
311 video_info['id'] = video_id
314 def playlist_result(entries, playlist_id=None, playlist_title=None):
315 """Returns a playlist"""
316 video_info = {'_type': 'playlist',
319 video_info['id'] = playlist_id
321 video_info['title'] = playlist_title
324 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
326 Perform a regex search on the given string, using a single or a list of
327 patterns returning the first matching group.
328 In case of failure return a default value or raise a WARNING or a
329 RegexNotFoundError, depending on fatal, specifying the field name.
331 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
332 mobj = re.search(pattern, string, flags)
335 mobj = re.search(p, string, flags)
338 if os.name != 'nt' and sys.stderr.isatty():
339 _name = u'\033[0;34m%s\033[0m' % name
344 # return the first matching group
345 return next(g for g in mobj.groups() if g is not None)
346 elif default is not _NO_DEFAULT:
349 raise RegexNotFoundError(u'Unable to extract %s' % _name)
351 self._downloader.report_warning(u'unable to extract %s; '
352 u'please report this issue on http://yt-dl.org/bug' % _name)
355 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
357 Like _search_regex, but strips HTML tags and unescapes entities.
359 res = self._search_regex(pattern, string, name, default, fatal, flags)
361 return clean_html(res).strip()
365 def _get_login_info(self):
367 Get the login info as (username, password)
368 It will look in the netrc file using the _NETRC_MACHINE value
369 If there's no info available, return (None, None)
371 if self._downloader is None:
376 downloader_params = self._downloader.params
378 # Attempt to use provided username and password or .netrc data
379 if downloader_params.get('username', None) is not None:
380 username = downloader_params['username']
381 password = downloader_params['password']
382 elif downloader_params.get('usenetrc', False):
384 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
389 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
390 except (IOError, netrc.NetrcParseError) as err:
391 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
393 return (username, password)
395 # Helper functions for extracting OpenGraph info
397 def _og_regexes(prop):
398 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
399 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
400 template = r'<meta[^>]+?%s[^>]+?%s'
402 template % (property_re, content_re),
403 template % (content_re, property_re),
406 def _og_search_property(self, prop, html, name=None, **kargs):
408 name = 'OpenGraph %s' % prop
409 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
412 return unescapeHTML(escaped)
414 def _og_search_thumbnail(self, html, **kargs):
415 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
417 def _og_search_description(self, html, **kargs):
418 return self._og_search_property('description', html, fatal=False, **kargs)
420 def _og_search_title(self, html, **kargs):
421 return self._og_search_property('title', html, **kargs)
423 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
424 regexes = self._og_regexes('video')
425 if secure: regexes = self._og_regexes('video:secure_url') + regexes
426 return self._html_search_regex(regexes, html, name, **kargs)
428 def _html_search_meta(self, name, html, display_name=None):
429 if display_name is None:
431 return self._html_search_regex(
433 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
434 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
435 html, display_name, fatal=False)
437 def _dc_search_uploader(self, html):
438 return self._html_search_meta('dc.creator', html, 'uploader')
440 def _rta_search(self, html):
441 # See http://www.rtalabel.org/index.php?content=howtofaq#single
442 if re.search(r'(?ix)<meta\s+name="rating"\s+'
443 r' content="RTA-5042-1996-1400-1577-RTA"',
448 def _media_rating_search(self, html):
449 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
450 rating = self._html_search_meta('rating', html)
462 return RATING_TABLE.get(rating.lower(), None)
464 def _sort_formats(self, formats):
466 # TODO remove the following workaround
467 from ..utils import determine_ext
468 if not f.get('ext') and 'url' in f:
469 f['ext'] = determine_ext(f['url'])
471 preference = f.get('preference')
472 if preference is None:
473 proto = f.get('protocol')
475 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
477 preference = 0 if proto in ['http', 'https'] else -0.1
478 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
481 if f.get('vcodec') == 'none': # audio only
482 if self._downloader.params.get('prefer_free_formats'):
483 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
485 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
488 audio_ext_preference = ORDER.index(f['ext'])
490 audio_ext_preference = -1
492 if self._downloader.params.get('prefer_free_formats'):
493 ORDER = [u'flv', u'mp4', u'webm']
495 ORDER = [u'webm', u'flv', u'mp4']
497 ext_preference = ORDER.index(f['ext'])
500 audio_ext_preference = 0
504 f.get('quality') if f.get('quality') is not None else -1,
505 f.get('height') if f.get('height') is not None else -1,
506 f.get('width') if f.get('width') is not None else -1,
508 f.get('tbr') if f.get('tbr') is not None else -1,
509 f.get('vbr') if f.get('vbr') is not None else -1,
510 f.get('abr') if f.get('abr') is not None else -1,
511 audio_ext_preference,
512 f.get('filesize') if f.get('filesize') is not None else -1,
515 formats.sort(key=_formats_key)
518 class SearchInfoExtractor(InfoExtractor):
520 Base class for paged search queries extractors.
521 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
522 Instances should define _SEARCH_KEY and _MAX_RESULTS.
526 def _make_valid_url(cls):
527 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
530 def suitable(cls, url):
531 return re.match(cls._make_valid_url(), url) is not None
533 def _real_extract(self, query):
534 mobj = re.match(self._make_valid_url(), query)
536 raise ExtractorError(u'Invalid search query "%s"' % query)
538 prefix = mobj.group('prefix')
539 query = mobj.group('query')
541 return self._get_n_results(query, 1)
542 elif prefix == 'all':
543 return self._get_n_results(query, self._MAX_RESULTS)
547 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
548 elif n > self._MAX_RESULTS:
549 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
550 n = self._MAX_RESULTS
551 return self._get_n_results(query, n)
553 def _get_n_results(self, query, n):
554 """Get a specified number of results for a query"""
555 raise NotImplementedError("This method must be implemented by subclasses")
558 def SEARCH_KEY(self):
559 return self._SEARCH_KEY