9 import xml.etree.ElementTree
14 compat_urllib_parse_urlparse,
24 _NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): class attributes reconstructed from initialize()/working()
    # usage — confirm against upstream youtube-dl common.py.
    _ready = False          # set True once _real_initialize() has run
    _downloader = None      # the FileDownloader/YoutubeDL instance
    _WORKING = True         # set False in subclasses for known-broken IEs
126 def __init__(self, downloader=None):
127 """Constructor. Receives an optional downloader."""
129 self.set_downloader(downloader)
132 def suitable(cls, url):
133 """Receives a URL and returns True if suitable for this IE."""
135 # This does not use has/getattr intentionally - we want to know whether
136 # we have cached the regexp for *this* class, whereas getattr would also
137 # match the superclass
138 if '_VALID_URL_RE' not in cls.__dict__:
139 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
140 return cls._VALID_URL_RE.match(url) is not None
144 """Getter method for _WORKING."""
147 def initialize(self):
148 """Initializes an instance (authentication, etc)."""
150 self._real_initialize()
153 def extract(self, url):
154 """Extracts URL information and returns it in list of dicts."""
156 return self._real_extract(url)
158 def set_downloader(self, downloader):
159 """Sets the downloader for this IE."""
160 self._downloader = downloader
162 def _real_initialize(self):
163 """Real initialization process. Redefine in subclasses."""
166 def _real_extract(self, url):
167 """Real extraction process. Redefine in subclasses."""
172 """A string for getting the InfoExtractor with get_info_extractor"""
173 return cls.__name__[:-2]
177 return type(self).__name__[:-2]
179 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
180 """ Returns the response handle """
182 self.report_download_webpage(video_id)
183 elif note is not False:
185 self.to_screen(u'%s' % (note,))
187 self.to_screen(u'%s: %s' % (video_id, note))
189 return self._downloader.urlopen(url_or_request)
190 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
194 errnote = u'Unable to download webpage'
195 errmsg = u'%s: %s' % (errnote, compat_str(err))
197 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
199 self._downloader.report_warning(errmsg)
202 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
203 """ Returns a tuple (page content as string, URL handle) """
205 # Strip hashes from the URL (#1038)
206 if isinstance(url_or_request, (compat_str, str)):
207 url_or_request = url_or_request.partition('#')[0]
209 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
213 content_type = urlh.headers.get('Content-Type', '')
214 webpage_bytes = urlh.read()
215 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
217 encoding = m.group(1)
219 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
220 webpage_bytes[:1024])
222 encoding = m.group(1).decode('ascii')
223 elif webpage_bytes.startswith(b'\xff\xfe'):
227 if self._downloader.params.get('dump_intermediate_pages', False):
229 url = url_or_request.get_full_url()
230 except AttributeError:
232 self.to_screen(u'Dumping request to ' + url)
233 dump = base64.b64encode(webpage_bytes).decode('ascii')
234 self._downloader.to_screen(dump)
235 if self._downloader.params.get('write_pages', False):
237 url = url_or_request.get_full_url()
238 except AttributeError:
241 h = hashlib.md5(url).hexdigest()
242 url = url[:200 - len(h)] + h
243 raw_filename = ('%s_%s.dump' % (video_id, url))
244 filename = sanitize_filename(raw_filename, restricted=True)
245 self.to_screen(u'Saving request to ' + filename)
246 with open(filename, 'wb') as outf:
247 outf.write(webpage_bytes)
249 content = webpage_bytes.decode(encoding, 'replace')
250 return (content, urlh)
252 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
253 """ Returns the data of the page as a string """
254 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
261 def _download_xml(self, url_or_request, video_id,
262 note=u'Downloading XML', errnote=u'Unable to download XML',
263 transform_source=None):
264 """Return the xml as an xml.etree.ElementTree.Element"""
265 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
267 xml_string = transform_source(xml_string)
268 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
270 def _download_json(self, url_or_request, video_id,
271 note=u'Downloading JSON metadata',
272 errnote=u'Unable to download JSON metadata'):
273 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
275 return json.loads(json_string)
276 except ValueError as ve:
277 raise ExtractorError('Failed to download JSON', cause=ve)
279 def report_warning(self, msg, video_id=None):
280 idstr = u'' if video_id is None else u'%s: ' % video_id
281 self._downloader.report_warning(
282 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
284 def to_screen(self, msg):
285 """Print msg to screen, prefixing it with '[ie_name]'"""
286 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
288 def report_extraction(self, id_or_name):
289 """Report information extraction."""
290 self.to_screen(u'%s: Extracting information' % id_or_name)
292 def report_download_webpage(self, video_id):
293 """Report webpage download."""
294 self.to_screen(u'%s: Downloading webpage' % video_id)
296 def report_age_confirmation(self):
297 """Report attempt to confirm age."""
298 self.to_screen(u'Confirming age')
300 def report_login(self):
301 """Report attempt to log in."""
302 self.to_screen(u'Logging in')
304 #Methods for following #608
306 def url_result(url, ie=None, video_id=None):
307 """Returns a url that points to a page that should be processed"""
308 #TODO: ie should be the class used for getting the info
309 video_info = {'_type': 'url',
312 if video_id is not None:
313 video_info['id'] = video_id
316 def playlist_result(entries, playlist_id=None, playlist_title=None):
317 """Returns a playlist"""
318 video_info = {'_type': 'playlist',
321 video_info['id'] = playlist_id
323 video_info['title'] = playlist_title
326 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
328 Perform a regex search on the given string, using a single or a list of
329 patterns returning the first matching group.
330 In case of failure return a default value or raise a WARNING or a
331 RegexNotFoundError, depending on fatal, specifying the field name.
333 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
334 mobj = re.search(pattern, string, flags)
337 mobj = re.search(p, string, flags)
340 if os.name != 'nt' and sys.stderr.isatty():
341 _name = u'\033[0;34m%s\033[0m' % name
346 # return the first matching group
347 return next(g for g in mobj.groups() if g is not None)
348 elif default is not _NO_DEFAULT:
351 raise RegexNotFoundError(u'Unable to extract %s' % _name)
353 self._downloader.report_warning(u'unable to extract %s; '
354 u'please report this issue on http://yt-dl.org/bug' % _name)
357 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
359 Like _search_regex, but strips HTML tags and unescapes entities.
361 res = self._search_regex(pattern, string, name, default, fatal, flags)
363 return clean_html(res).strip()
367 def _get_login_info(self):
369 Get the the login info as (username, password)
370 It will look in the netrc file using the _NETRC_MACHINE value
371 If there's no info available, return (None, None)
373 if self._downloader is None:
378 downloader_params = self._downloader.params
380 # Attempt to use provided username and password or .netrc data
381 if downloader_params.get('username', None) is not None:
382 username = downloader_params['username']
383 password = downloader_params['password']
384 elif downloader_params.get('usenetrc', False):
386 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
391 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
392 except (IOError, netrc.NetrcParseError) as err:
393 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
395 return (username, password)
397 # Helper functions for extracting OpenGraph info
399 def _og_regexes(prop):
400 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
401 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
402 template = r'<meta[^>]+?%s[^>]+?%s'
404 template % (property_re, content_re),
405 template % (content_re, property_re),
408 def _og_search_property(self, prop, html, name=None, **kargs):
410 name = 'OpenGraph %s' % prop
411 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
414 return unescapeHTML(escaped)
416 def _og_search_thumbnail(self, html, **kargs):
417 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
419 def _og_search_description(self, html, **kargs):
420 return self._og_search_property('description', html, fatal=False, **kargs)
422 def _og_search_title(self, html, **kargs):
423 return self._og_search_property('title', html, **kargs)
425 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
426 regexes = self._og_regexes('video')
427 if secure: regexes = self._og_regexes('video:secure_url') + regexes
428 return self._html_search_regex(regexes, html, name, **kargs)
430 def _html_search_meta(self, name, html, display_name=None):
431 if display_name is None:
433 return self._html_search_regex(
435 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
436 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
437 html, display_name, fatal=False)
439 def _dc_search_uploader(self, html):
440 return self._html_search_meta('dc.creator', html, 'uploader')
442 def _rta_search(self, html):
443 # See http://www.rtalabel.org/index.php?content=howtofaq#single
444 if re.search(r'(?ix)<meta\s+name="rating"\s+'
445 r' content="RTA-5042-1996-1400-1577-RTA"',
450 def _media_rating_search(self, html):
451 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
452 rating = self._html_search_meta('rating', html)
464 return RATING_TABLE.get(rating.lower(), None)
466 def _sort_formats(self, formats):
468 # TODO remove the following workaround
469 from ..utils import determine_ext
470 if not f.get('ext') and 'url' in f:
471 f['ext'] = determine_ext(f['url'])
473 preference = f.get('preference')
474 if preference is None:
475 proto = f.get('protocol')
477 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
479 preference = 0 if proto in ['http', 'https'] else -0.1
480 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
483 if f.get('vcodec') == 'none': # audio only
484 if self._downloader.params.get('prefer_free_formats'):
485 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
487 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
490 audio_ext_preference = ORDER.index(f['ext'])
492 audio_ext_preference = -1
494 if self._downloader.params.get('prefer_free_formats'):
495 ORDER = [u'flv', u'mp4', u'webm']
497 ORDER = [u'webm', u'flv', u'mp4']
499 ext_preference = ORDER.index(f['ext'])
502 audio_ext_preference = 0
506 f.get('quality') if f.get('quality') is not None else -1,
507 f.get('height') if f.get('height') is not None else -1,
508 f.get('width') if f.get('width') is not None else -1,
510 f.get('tbr') if f.get('tbr') is not None else -1,
511 f.get('vbr') if f.get('vbr') is not None else -1,
512 f.get('abr') if f.get('abr') is not None else -1,
513 audio_ext_preference,
514 f.get('filesize') if f.get('filesize') is not None else -1,
517 formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix: empty (1 result), a positive count, or "all".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and delegate to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp over-large requests to the extractor's maximum.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY