9 import xml.etree.ElementTree
14 compat_urllib_parse_urlparse,
24 _NO_DEFAULT = object()
27 class InfoExtractor(object):
28 """Information Extractor class.
30 Information extractors are the classes that, given a URL, extract
31 information about the video (or videos) the URL refers to. This
32 information includes the real video URL, the video title, author and
33 others. The information is stored in a dictionary which is then
34 passed to the FileDownloader. The FileDownloader processes this
35 information possibly downloading the video to the file system, among
36 other possible outcomes.
38 The dictionaries must include the following fields:
41 title: Video title, unescaped.
43 Additionally, it must contain either a formats entry or a url one:
45 formats: A list of dictionaries for each format available, ordered
46 from worst to best quality.
49 * url Mandatory. The URL of the video file
50 * ext Will be calculated from url if missing
51 * format A human-readable description of the format
52 ("mp4 container with h264/opus").
53 Calculated from the format_id, width, height,
54 and format_note fields if missing.
55 * format_id A short description of the format
56 ("mp4_h264_opus" or "19").
57 Technically optional, but strongly recommended.
58 * format_note Additional info about the format
59 ("3D" or "DASH video")
60 * width Width of the video, if known
61 * height Height of the video, if known
62 * resolution Textual description of width and height
63 * tbr Average bitrate of audio and video in KBit/s
64 * abr Average audio bitrate in KBit/s
65 * acodec Name of the audio codec in use
66 * asr Audio sampling rate in Hertz
67 * vbr Average video bitrate in KBit/s
68 * vcodec Name of the video codec in use
69 * container Name of the container format
70 * filesize The number of bytes, if known in advance
71 * player_url SWF Player URL (used for rtmpdump).
72 * protocol The protocol that will be used for the actual
74 "http", "https", "rtsp", "rtmp", "m3u8" or so.
75 * preference Order number of this format. If this field is
76 present and not None, the formats get sorted
77 by this field, regardless of all other values.
78 -1 for default (order by other properties),
79 -2 or smaller for less than default.
80 * quality Order number of the video quality of this
81 format, irrespective of the file format.
82 -1 for default (order by other properties),
83 -2 or smaller for less than default.
85 ext: Video filename extension.
86 format: The video format, defaults to ext (used for --get-format)
87 player_url: SWF Player URL (used for rtmpdump).
89 The following fields are optional:
91 display_id An alternative identifier for the video, not necessarily
92 unique, but available before title. Typically, id is
93 something like "4234987", title "Dancing naked mole rats",
94 and display_id "dancing-naked-mole-rats"
95 thumbnails: A list of dictionaries (with the entries "resolution" and
96 "url") for the varying thumbnails
97 thumbnail: Full URL to a video thumbnail image.
98 description: One-line video description.
99 uploader: Full name of the video uploader.
100 timestamp: UNIX timestamp of the moment the video became available.
101 upload_date: Video upload date (YYYYMMDD).
102 If not explicitly set, calculated from timestamp.
103 uploader_id: Nickname or id of the video uploader.
104 location: Physical location of the video.
105 subtitles: The subtitle file contents as a dictionary in the format
106 {language: subtitles}.
107 duration: Length of the video in seconds, as an integer.
108 view_count: How many users have watched the video on the platform.
109 like_count: Number of positive ratings of the video
110 dislike_count: Number of negative ratings of the video
111 comment_count: Number of comments on the video
112 age_limit: Age restriction for the video, as an integer (years)
113 webpage_url: The url to the video webpage, if given to youtube-dl it
114 should allow to get the same result again. (It will be set
115 by YoutubeDL if it's missing)
117 Unless mentioned otherwise, the fields should be Unicode strings.
119 Subclasses of this one should re-define the _real_initialize() and
120 _real_extract() methods and define a _VALID_URL regexp.
121 Probably, they should also be added to the list of extractors.
123 Finally, the _WORKING attribute should be set to False for broken IEs
124 in order to warn the users and skip the tests.
131 def __init__(self, downloader=None):
132 """Constructor. Receives an optional downloader."""
# NOTE(review): one original line is elided from this listing here.
# Storage of the downloader is delegated to set_downloader().
134 self.set_downloader(downloader)
# NOTE(review): takes cls, so presumably decorated @classmethod (decorator line not shown) — confirm.
137 def suitable(cls, url):
138 """Receives a URL and returns True if suitable for this IE."""
140 # This does not use has/getattr intentionally - we want to know whether
141 # we have cached the regexp for *this* class, whereas getattr would also
142 # match the superclass
# Compile _VALID_URL once per concrete class and cache it on that class.
143 if '_VALID_URL_RE' not in cls.__dict__:
144 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
145 return cls._VALID_URL_RE.match(url) is not None
149 """Getter method for _WORKING."""
152 def initialize(self):
153 """Initializes an instance (authentication, etc)."""
# NOTE(review): a guard line is elided in this listing; real work is
# deferred to the subclass hook _real_initialize().
155 self._real_initialize()
158 def extract(self, url):
159 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): one line elided here; extraction itself is implemented
# by subclasses in _real_extract().
161 return self._real_extract(url)
def set_downloader(self, downloader):
    """Remember *downloader*; later helpers delegate output and options to it."""
    self._downloader = downloader
167 def _real_initialize(self):
168 """Real initialization process. Redefine in subclasses."""
# NOTE(review): body (a no-op in the base class, presumably) is elided in this listing.
171 def _real_extract(self, url):
172 """Real extraction process. Redefine in subclasses."""
# NOTE(review): body is elided in this listing; subclasses must override.
177 """A string for getting the InfoExtractor with get_info_extractor"""
178 return cls.__name__[:-2]
182 return type(self).__name__[:-2]
184 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
185 """ Returns the response handle """
# NOTE(review): several control-flow lines are elided in this listing.
# Default progress message when no note is supplied:
187 self.report_download_webpage(video_id)
188 elif note is not False:
# A note is printed bare or prefixed with the video id, depending on context.
190 self.to_screen(u'%s' % (note,))
192 self.to_screen(u'%s: %s' % (video_id, note))
# Delegate the actual HTTP request to the downloader.
194 return self._downloader.urlopen(url_or_request)
195 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
199 errnote = u'Unable to download webpage'
200 errmsg = u'%s: %s' % (errnote, compat_str(err))
# Fatal failures raise (with the original traceback and cause chained);
# non-fatal ones only emit a warning.
202 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
204 self._downloader.report_warning(errmsg)
207 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
208 """ Returns a tuple (page content as string, URL handle) """
# NOTE(review): several lines are elided in this listing.
210 # Strip hashes from the URL (#1038)
211 if isinstance(url_or_request, (compat_str, str)):
212 url_or_request = url_or_request.partition('#')[0]
214 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
# Charset detection order: Content-Type header, then an early <meta
# charset=...> tag, then a UTF-16 BOM.
218 content_type = urlh.headers.get('Content-Type', '')
219 webpage_bytes = urlh.read()
220 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
222 encoding = m.group(1)
# Only the first 1024 bytes are scanned for the meta tag.
224 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
225 webpage_bytes[:1024])
227 encoding = m.group(1).decode('ascii')
228 elif webpage_bytes.startswith(b'\xff\xfe'):
# Debug aid: optionally dump the (base64-encoded) page to the screen.
232 if self._downloader.params.get('dump_intermediate_pages', False):
234 url = url_or_request.get_full_url()
235 except AttributeError:
237 self.to_screen(u'Dumping request to ' + url)
238 dump = base64.b64encode(webpage_bytes).decode('ascii')
239 self._downloader.to_screen(dump)
# Debug aid: optionally save the raw bytes to a sanitized dump file.
240 if self._downloader.params.get('write_pages', False):
242 url = url_or_request.get_full_url()
243 except AttributeError:
# Over-long URLs are truncated and disambiguated with an md5 suffix.
246 h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
247 url = url[:200 - len(h)] + h
248 raw_filename = ('%s_%s.dump' % (video_id, url))
249 filename = sanitize_filename(raw_filename, restricted=True)
250 self.to_screen(u'Saving request to ' + filename)
251 with open(filename, 'wb') as outf:
252 outf.write(webpage_bytes)
# Decode with the detected encoding (utf-8 fallback); undecodable bytes
# are replaced rather than raising.
255 content = webpage_bytes.decode(encoding, 'replace')
257 content = webpage_bytes.decode('utf-8', 'replace')
# Detect a Websense blocking page and fail with an explanatory message.
259 if (u'<title>Access to this site is blocked</title>' in content and
260 u'Websense' in content[:512]):
261 msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
262 blocked_iframe = self._html_search_regex(
263 r'<iframe src="([^"]+)"', content,
264 u'Websense information URL', default=None)
266 msg += u' Visit %s for more details' % blocked_iframe
267 raise ExtractorError(msg, expected=True)
269 return (content, urlh)
271 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
272 """ Returns the data of the page as a string """
273 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
# NOTE(review): the tail of this method (unpacking the (content, handle)
# tuple / failure handling) is elided in this listing.
280 def _download_xml(self, url_or_request, video_id,
281 note=u'Downloading XML', errnote=u'Unable to download XML',
282 transform_source=None, fatal=True):
283 """Return the xml as an xml.etree.ElementTree.Element"""
284 xml_string = self._download_webpage(
285 url_or_request, video_id, note, errnote, fatal=fatal)
# False signals a failed non-fatal download upstream.
286 if xml_string is False:
# transform_source lets callers repair the payload before parsing.
289 xml_string = transform_source(xml_string)
290 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
292 def _download_json(self, url_or_request, video_id,
293 note=u'Downloading JSON metadata',
294 errnote=u'Unable to download JSON metadata',
295 transform_source=None):
# Download the document, optionally repair it, then parse it as JSON.
296 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
298 json_string = transform_source(json_string)
300 return json.loads(json_string)
301 except ValueError as ve:
# Parse failures are re-raised as ExtractorError with the cause chained.
302 raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
    """Relay a warning to the downloader, tagged with the IE name and an optional video id."""
    if video_id is None:
        prefix = u''
    else:
        prefix = u'%s: ' % video_id
    self._downloader.report_warning(u'[%s] %s%s' % (self.IE_NAME, prefix, msg))
def to_screen(self, msg):
    """Write *msg* to the console through the downloader, '[ie_name]'-prefixed."""
    tagged = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged)
def report_extraction(self, id_or_name):
    """Log that information extraction for *id_or_name* has begun."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Log the start of the webpage download for *video_id*."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Log the attempt to confirm the user's age."""
    self.to_screen(u'Confirming age')
def report_login(self):
    """Log the attempt to sign in."""
    self.to_screen(u'Logging in')
329 # Methods for following issue #608
# NOTE(review): takes no self, so presumably decorated @staticmethod (decorator line not shown) — confirm.
331 def url_result(url, ie=None, video_id=None):
332 """Returns a url that points to a page that should be processed"""
333 #TODO: ie should be the class used for getting the info
334 video_info = {'_type': 'url',
# NOTE(review): the remaining dict entries and the return are elided in this listing.
337 if video_id is not None:
338 video_info['id'] = video_id
341 def playlist_result(entries, playlist_id=None, playlist_title=None):
342 """Returns a playlist"""
343 video_info = {'_type': 'playlist',
# id/title are only attached when provided; the surrounding guards and the
# return are elided in this listing.
346 video_info['id'] = playlist_id
348 video_info['title'] = playlist_title
351 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
353 Perform a regex search on the given string, using a single or a list of
354 patterns returning the first matching group.
355 In case of failure return a default value or raise a WARNING or a
356 RegexNotFoundError, depending on fatal, specifying the field name.
# A single pattern (string or compiled) is searched directly; a list of
# patterns is tried in order until one matches.
358 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
359 mobj = re.search(pattern, string, flags)
362 mobj = re.search(p, string, flags)
# Colorize the field name on capable terminals (skipped on Windows).
365 if os.name != 'nt' and sys.stderr.isatty():
366 _name = u'\033[0;34m%s\033[0m' % name
371 # return the first matching group
372 return next(g for g in mobj.groups() if g is not None)
# _NO_DEFAULT is the sentinel meaning "no default was supplied".
373 elif default is not _NO_DEFAULT:
# On a miss: fatal extractions raise, non-fatal ones only warn.
376 raise RegexNotFoundError(u'Unable to extract %s' % _name)
378 self._downloader.report_warning(u'unable to extract %s; '
379 u'please report this issue on http://yt-dl.org/bug' % _name)
382 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
384 Like _search_regex, but strips HTML tags and unescapes entities.
386 res = self._search_regex(pattern, string, name, default, fatal, flags)
# clean_html strips tags and unescapes entities; whitespace is trimmed.
388 return clean_html(res).strip()
392 def _get_login_info(self):
394 Get the login info as (username, password)
395 It will look in the netrc file using the _NETRC_MACHINE value
396 If there's no info available, return (None, None)
# Without a downloader there is nowhere to read options from.
398 if self._downloader is None:
403 downloader_params = self._downloader.params
405 # Attempt to use provided username and password or .netrc data
406 if downloader_params.get('username', None) is not None:
407 username = downloader_params['username']
408 password = downloader_params['password']
409 elif downloader_params.get('usenetrc', False):
411 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
416 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
417 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are non-fatal: warn and fall through to (None, None).
418 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
420 return (username, password)
422 # Helper functions for extracting OpenGraph info
424 def _og_regexes(prop):
# The content value may be single- or double-quoted; each variant has its
# own capture group.
425 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
426 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
427 template = r'<meta[^>]+?%s[^>]+?%s'
# Both attribute orders are matched: property-then-content and the reverse.
429 template % (property_re, content_re),
430 template % (content_re, property_re),
433 def _og_search_property(self, prop, html, name=None, **kargs):
# Default human-readable field name when none is given.
435 name = 'OpenGraph %s' % prop
436 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
# HTML entities in the matched content are unescaped before returning.
439 return unescapeHTML(escaped)
441 def _og_search_thumbnail(self, html, **kargs):
442 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
444 def _og_search_description(self, html, **kargs):
445 return self._og_search_property('description', html, fatal=False, **kargs)
447 def _og_search_title(self, html, **kargs):
448 return self._og_search_property('title', html, **kargs)
450 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
451 regexes = self._og_regexes('video')
452 if secure: regexes = self._og_regexes('video:secure_url') + regexes
453 return self._html_search_regex(regexes, html, name, **kargs)
455 def _html_search_meta(self, name, html, display_name=None, fatal=False):
456 if display_name is None:
# NOTE(review): the display_name fallback assignment and the start of the
# regex literal are elided in this listing.
# The lookahead matches itemprop/name/property; the content value is captured.
458 return self._html_search_regex(
460 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
461 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
462 html, display_name, fatal=fatal)
464 def _dc_search_uploader(self, html):
465 return self._html_search_meta('dc.creator', html, 'uploader')
467 def _rta_search(self, html):
468 # See http://www.rtalabel.org/index.php?content=howtofaq#single
# Look for the fixed RTA adult-content label in the page's meta tags.
469 if re.search(r'(?ix)<meta\s+name="rating"\s+'
470 r' content="RTA-5042-1996-1400-1577-RTA"',
# NOTE(review): the search target argument and the return values (an age
# limit, presumably) are elided in this listing.
475 def _media_rating_search(self, html):
476 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
477 rating = self._html_search_meta('rating', html)
# NOTE(review): the RATING_TABLE mapping (and a rating guard, presumably)
# are elided in this listing; lookups are case-insensitive and unknown
# ratings map to None.
489 return RATING_TABLE.get(rating.lower(), None)
491 def _twitter_search_player(self, html):
492 return self._html_search_meta('twitter:player', html,
493 'twitter card player')
495 def _sort_formats(self, formats):
# NOTE(review): many control-flow lines are elided in this listing.
# An empty formats list is a hard error.
497 raise ExtractorError(u'No video formats found')
500 # TODO remove the following workaround
501 from ..utils import determine_ext
502 if not f.get('ext') and 'url' in f:
503 f['ext'] = determine_ext(f['url'])
# Derive a protocol-based preference when the format doesn't state one.
505 preference = f.get('preference')
506 if preference is None:
507 proto = f.get('protocol')
509 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
# Plain http(s) is slightly preferred over other protocols.
511 preference = 0 if proto in ['http', 'https'] else -0.1
512 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
# Audio-only formats get their own extension ranking (higher index = better).
515 if f.get('vcodec') == 'none': # audio only
516 if self._downloader.params.get('prefer_free_formats'):
517 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
519 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
522 audio_ext_preference = ORDER.index(f['ext'])
524 audio_ext_preference = -1
# Video formats rank by container extension instead.
526 if self._downloader.params.get('prefer_free_formats'):
527 ORDER = [u'flv', u'mp4', u'webm']
529 ORDER = [u'webm', u'flv', u'mp4']
531 ext_preference = ORDER.index(f['ext'])
534 audio_ext_preference = 0
# Sort-key components: missing numeric fields count as -1 so any known
# value outranks an unknown one.
538 f.get('quality') if f.get('quality') is not None else -1,
539 f.get('height') if f.get('height') is not None else -1,
540 f.get('width') if f.get('width') is not None else -1,
542 f.get('tbr') if f.get('tbr') is not None else -1,
543 f.get('vbr') if f.get('vbr') is not None else -1,
544 f.get('abr') if f.get('abr') is not None else -1,
545 audio_ext_preference,
546 f.get('filesize') if f.get('filesize') is not None else -1,
# In-place ascending sort: worst format first, best last (matching the
# "ordered from worst to best quality" contract in the class docstring).
549 formats.sort(key=_formats_key)
551 def http_scheme(self):
552 """ Either "http:" or "https:", depending on the user's preferences """
# The prefer_insecure option selects plain http; the alternative branch is
# elided in this listing.
555 if self._downloader.params.get('prefer_insecure', False)
559 class SearchInfoExtractor(InfoExtractor):
561 Base class for paged search queries extractors.
562 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
563 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# NOTE(review): takes cls, so presumably decorated @classmethod (decorator line not shown) — confirm.
567 def _make_valid_url(cls):
# Builds the search-query pattern: "<SEARCH_KEY><empty|count|all>:<query>".
568 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Search-query URLs match the generated pattern, not a _VALID_URL."""
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
574 def _real_extract(self, query):
# NOTE(review): several control-flow lines are elided in this listing.
575 mobj = re.match(self._make_valid_url(), query)
# Queries that don't match the search pattern are rejected outright.
577 raise ExtractorError(u'Invalid search query "%s"' % query)
579 prefix = mobj.group('prefix')
580 query = mobj.group('query')
# Empty prefix means a single result; 'all' means the extractor's maximum.
582 return self._get_n_results(query, 1)
583 elif prefix == 'all':
584 return self._get_n_results(query, self._MAX_RESULTS)
# Numeric prefixes are validated and clamped to _MAX_RESULTS with a warning.
588 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
589 elif n > self._MAX_RESULTS:
590 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
591 n = self._MAX_RESULTS
592 return self._get_n_results(query, n)
594 def _get_n_results(self, query, n):
595 """Get a specified number of results for a query"""
596 raise NotImplementedError("This method must be implemented by subclasses")
# NOTE(review): presumably decorated @property (decorator line not shown) — confirm.
599 def SEARCH_KEY(self):
# Public accessor for the class-level _SEARCH_KEY string.
600 return self._SEARCH_KEY