10 import xml.etree.ElementTree
15 compat_urllib_parse_urlparse,
25 _NO_DEFAULT = object()
28 class InfoExtractor(object):
29 """Information Extractor class.
31 Information extractors are the classes that, given a URL, extract
32 information about the video (or videos) the URL refers to. This
33 information includes the real video URL, the video title, author and
34 others. The information is stored in a dictionary which is then
35 passed to the FileDownloader. The FileDownloader processes this
36 information possibly downloading the video to the file system, among
37 other possible outcomes.
39 The dictionaries must include the following fields:
42 title: Video title, unescaped.
44 Additionally, it must contain either a formats entry or a url one:
46 formats: A list of dictionaries for each format available, ordered
47 from worst to best quality.
50 * url Mandatory. The URL of the video file
51 * ext Will be calculated from url if missing
52 * format A human-readable description of the format
53 ("mp4 container with h264/opus").
54 Calculated from the format_id, width, height.
55 and format_note fields if missing.
56 * format_id A short description of the format
57 ("mp4_h264_opus" or "19").
58 Technically optional, but strongly recommended.
59 * format_note Additional info about the format
60 ("3D" or "DASH video")
61 * width Width of the video, if known
62 * height Height of the video, if known
63 * resolution Textual description of width and height
64 * tbr Average bitrate of audio and video in KBit/s
65 * abr Average audio bitrate in KBit/s
66 * acodec Name of the audio codec in use
67 * asr Audio sampling rate in Hertz
68 * vbr Average video bitrate in KBit/s
69 * vcodec Name of the video codec in use
70 * container Name of the container format
71 * filesize The number of bytes, if known in advance
72 * filesize_approx An estimate for the number of bytes
73 * player_url SWF Player URL (used for rtmpdump).
74 * protocol The protocol that will be used for the actual
76 "http", "https", "rtsp", "rtmp", "m3u8" or so.
77 * preference Order number of this format. If this field is
78 present and not None, the formats get sorted
79 by this field, regardless of all other values.
80 -1 for default (order by other properties),
81 -2 or smaller for less than default.
82 * quality Order number of the video quality of this
83 format, irrespective of the file format.
84 -1 for default (order by other properties),
85 -2 or smaller for less than default.
87 ext: Video filename extension.
88 format: The video format, defaults to ext (used for --get-format)
89 player_url: SWF Player URL (used for rtmpdump).
91 The following fields are optional:
93 display_id An alternative identifier for the video, not necessarily
94 unique, but available before title. Typically, id is
95 something like "4234987", title "Dancing naked mole rats",
96 and display_id "dancing-naked-mole-rats"
97 thumbnails: A list of dictionaries, with the following entries:
99 * "width" (optional, int)
100 * "height" (optional, int)
101 * "resolution" (optional, string "{width}x{height}"),
103 thumbnail: Full URL to a video thumbnail image.
104 description: One-line video description.
105 uploader: Full name of the video uploader.
106 timestamp: UNIX timestamp of the moment the video became available.
107 upload_date: Video upload date (YYYYMMDD).
108 If not explicitly set, calculated from timestamp.
109 uploader_id: Nickname or id of the video uploader.
110 location: Physical location of the video.
111 subtitles: The subtitle file contents as a dictionary in the format
112 {language: subtitles}.
113 duration: Length of the video in seconds, as an integer.
114 view_count: How many users have watched the video on the platform.
115 like_count: Number of positive ratings of the video
116 dislike_count: Number of negative ratings of the video
117 comment_count: Number of comments on the video
118 age_limit: Age restriction for the video, as an integer (years)
119 webpage_url: The url to the video webpage, if given to youtube-dl it
120 should allow to get the same result again. (It will be set
121 by YoutubeDL if it's missing)
122 categories: A list of categories that the video falls in, for example
125 Unless mentioned otherwise, the fields should be Unicode strings.
127 Subclasses of this one should re-define the _real_initialize() and
128 _real_extract() methods and define a _VALID_URL regexp.
129 Probably, they should also be added to the list of extractors.
131 Finally, the _WORKING attribute should be set to False for broken IEs
132 in order to warn the users and skip the tests.
139 def __init__(self, downloader=None):
140 """Constructor. Receives an optional downloader."""
142 self.set_downloader(downloader)
def suitable(cls, url):
    """Return True if url matches this extractor's _VALID_URL pattern.

    The compiled pattern is stored on the class as _VALID_URL_RE the first
    time it is needed and reused afterwards.
    """
    # Inspect the class's own namespace rather than using hasattr/getattr:
    # those would also find a _VALID_URL_RE cached on a superclass, which
    # was compiled from the superclass's pattern, not from ours.
    if '_VALID_URL_RE' not in vars(cls):
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    matched = cls._VALID_URL_RE.match(url)
    return matched is not None
157 """Getter method for _WORKING."""
160 def initialize(self):
161 """Initializes an instance (authentication, etc)."""
163 self._real_initialize()
166 def extract(self, url):
167 """Extracts URL information and returns it in list of dicts."""
169 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach downloader to this IE.

    The object is stored as self._downloader; it may be None.
    """
    self._downloader = downloader
175 def _real_initialize(self):
176 """Real initialization process. Redefine in subclasses."""
179 def _real_extract(self, url):
180 """Real extraction process. Redefine in subclasses."""
185 """A string for getting the InfoExtractor with get_info_extractor"""
186 return cls.__name__[:-2]
190 return type(self).__name__[:-2]
192 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
193 """ Returns the response handle """
195 self.report_download_webpage(video_id)
196 elif note is not False:
198 self.to_screen(u'%s' % (note,))
200 self.to_screen(u'%s: %s' % (video_id, note))
202 return self._downloader.urlopen(url_or_request)
203 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
207 errnote = u'Unable to download webpage'
208 errmsg = u'%s: %s' % (errnote, compat_str(err))
210 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
212 self._downloader.report_warning(errmsg)
215 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
216 """ Returns a tuple (page content as string, URL handle) """
218 # Strip hashes from the URL (#1038)
219 if isinstance(url_or_request, (compat_str, str)):
220 url_or_request = url_or_request.partition('#')[0]
222 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
226 content_type = urlh.headers.get('Content-Type', '')
227 webpage_bytes = urlh.read()
228 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
230 encoding = m.group(1)
232 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
233 webpage_bytes[:1024])
235 encoding = m.group(1).decode('ascii')
236 elif webpage_bytes.startswith(b'\xff\xfe'):
240 if self._downloader.params.get('dump_intermediate_pages', False):
242 url = url_or_request.get_full_url()
243 except AttributeError:
245 self.to_screen(u'Dumping request to ' + url)
246 dump = base64.b64encode(webpage_bytes).decode('ascii')
247 self._downloader.to_screen(dump)
248 if self._downloader.params.get('write_pages', False):
250 url = url_or_request.get_full_url()
251 except AttributeError:
253 basen = '%s_%s' % (video_id, url)
255 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
256 basen = basen[:240 - len(h)] + h
257 raw_filename = basen + '.dump'
258 filename = sanitize_filename(raw_filename, restricted=True)
259 self.to_screen(u'Saving request to ' + filename)
260 with open(filename, 'wb') as outf:
261 outf.write(webpage_bytes)
264 content = webpage_bytes.decode(encoding, 'replace')
266 content = webpage_bytes.decode('utf-8', 'replace')
268 if (u'<title>Access to this site is blocked</title>' in content and
269 u'Websense' in content[:512]):
270 msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
271 blocked_iframe = self._html_search_regex(
272 r'<iframe src="([^"]+)"', content,
273 u'Websense information URL', default=None)
275 msg += u' Visit %s for more details' % blocked_iframe
276 raise ExtractorError(msg, expected=True)
278 return (content, urlh)
280 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
281 """ Returns the data of the page as a string """
282 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
289 def _download_xml(self, url_or_request, video_id,
290 note=u'Downloading XML', errnote=u'Unable to download XML',
291 transform_source=None, fatal=True):
292 """Return the xml as an xml.etree.ElementTree.Element"""
293 xml_string = self._download_webpage(
294 url_or_request, video_id, note, errnote, fatal=fatal)
295 if xml_string is False:
298 xml_string = transform_source(xml_string)
299 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
301 def _download_json(self, url_or_request, video_id,
302 note=u'Downloading JSON metadata',
303 errnote=u'Unable to download JSON metadata',
304 transform_source=None,
306 json_string = self._download_webpage(
307 url_or_request, video_id, note, errnote, fatal=fatal)
308 if (not fatal) and json_string is False:
311 json_string = transform_source(json_string)
313 return json.loads(json_string)
314 except ValueError as ve:
315 raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
    """Send a warning to the downloader, tagged with '[ie_name]'.

    When video_id is given, it is placed between the tag and the message.
    """
    if video_id is None:
        id_prefix = u''
    else:
        id_prefix = u'%s: ' % video_id
    warning = u'[%s] %s%s' % (self.IE_NAME, id_prefix, msg)
    self._downloader.report_warning(warning)
def to_screen(self, msg):
    """Show msg through the downloader, prefixed with '[ie_name]'."""
    tagged = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started for id_or_name."""
    status = u'%s: Extracting information' % id_or_name
    self.to_screen(status)
def report_download_webpage(self, video_id):
    """Announce that the webpage for video_id is being downloaded."""
    status = u'%s: Downloading webpage' % video_id
    self.to_screen(status)
def report_age_confirmation(self):
    """Announce that an age confirmation is being attempted."""
    status = u'Confirming age'
    self.to_screen(status)
def report_login(self):
    """Announce that a login is being attempted."""
    status = u'Logging in'
    self.to_screen(status)
342 #Methods for following #608
344 def url_result(url, ie=None, video_id=None):
345 """Returns a url that points to a page that should be processed"""
346 #TODO: ie should be the class used for getting the info
347 video_info = {'_type': 'url',
350 if video_id is not None:
351 video_info['id'] = video_id
354 def playlist_result(entries, playlist_id=None, playlist_title=None):
355 """Returns a playlist"""
356 video_info = {'_type': 'playlist',
359 video_info['id'] = playlist_id
361 video_info['title'] = playlist_title
364 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
366 Perform a regex search on the given string, using a single or a list of
367 patterns returning the first matching group.
368 In case of failure return a default value or raise a WARNING or a
369 RegexNotFoundError, depending on fatal, specifying the field name.
371 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
372 mobj = re.search(pattern, string, flags)
375 mobj = re.search(p, string, flags)
379 if os.name != 'nt' and sys.stderr.isatty():
380 _name = u'\033[0;34m%s\033[0m' % name
385 # return the first matching group
386 return next(g for g in mobj.groups() if g is not None)
387 elif default is not _NO_DEFAULT:
390 raise RegexNotFoundError(u'Unable to extract %s' % _name)
392 self._downloader.report_warning(u'unable to extract %s; '
393 u'please report this issue on http://yt-dl.org/bug' % _name)
396 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
398 Like _search_regex, but strips HTML tags and unescapes entities.
400 res = self._search_regex(pattern, string, name, default, fatal, flags)
402 return clean_html(res).strip()
406 def _get_login_info(self):
408 Get the login info as (username, password)
409 It will look in the netrc file using the _NETRC_MACHINE value
410 If there's no info available, return (None, None)
412 if self._downloader is None:
417 downloader_params = self._downloader.params
419 # Attempt to use provided username and password or .netrc data
420 if downloader_params.get('username', None) is not None:
421 username = downloader_params['username']
422 password = downloader_params['password']
423 elif downloader_params.get('usenetrc', False):
425 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
430 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
431 except (IOError, netrc.NetrcParseError) as err:
432 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
434 return (username, password)
436 # Helper functions for extracting OpenGraph info
438 def _og_regexes(prop):
439 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
440 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
441 template = r'<meta[^>]+?%s[^>]+?%s'
443 template % (property_re, content_re),
444 template % (content_re, property_re),
447 def _og_search_property(self, prop, html, name=None, **kargs):
449 name = 'OpenGraph %s' % prop
450 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
453 return unescapeHTML(escaped)
455 def _og_search_thumbnail(self, html, **kargs):
456 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
458 def _og_search_description(self, html, **kargs):
459 return self._og_search_property('description', html, fatal=False, **kargs)
461 def _og_search_title(self, html, **kargs):
462 return self._og_search_property('title', html, **kargs)
464 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
465 regexes = self._og_regexes('video')
466 if secure: regexes = self._og_regexes('video:secure_url') + regexes
467 return self._html_search_regex(regexes, html, name, **kargs)
469 def _og_search_url(self, html, **kargs):
470 return self._og_search_property('url', html, **kargs)
472 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
473 if display_name is None:
475 return self._html_search_regex(
477 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
478 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
479 html, display_name, fatal=fatal, **kwargs)
481 def _dc_search_uploader(self, html):
482 return self._html_search_meta('dc.creator', html, 'uploader')
484 def _rta_search(self, html):
485 # See http://www.rtalabel.org/index.php?content=howtofaq#single
486 if re.search(r'(?ix)<meta\s+name="rating"\s+'
487 r' content="RTA-5042-1996-1400-1577-RTA"',
492 def _media_rating_search(self, html):
493 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
494 rating = self._html_search_meta('rating', html)
506 return RATING_TABLE.get(rating.lower(), None)
508 def _twitter_search_player(self, html):
509 return self._html_search_meta('twitter:player', html,
510 'twitter card player')
512 def _sort_formats(self, formats):
514 raise ExtractorError(u'No video formats found')
517 # TODO remove the following workaround
518 from ..utils import determine_ext
519 if not f.get('ext') and 'url' in f:
520 f['ext'] = determine_ext(f['url'])
522 preference = f.get('preference')
523 if preference is None:
524 proto = f.get('protocol')
526 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
528 preference = 0 if proto in ['http', 'https'] else -0.1
529 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
532 if f.get('vcodec') == 'none': # audio only
533 if self._downloader.params.get('prefer_free_formats'):
534 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
536 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
539 audio_ext_preference = ORDER.index(f['ext'])
541 audio_ext_preference = -1
543 if self._downloader.params.get('prefer_free_formats'):
544 ORDER = [u'flv', u'mp4', u'webm']
546 ORDER = [u'webm', u'flv', u'mp4']
548 ext_preference = ORDER.index(f['ext'])
551 audio_ext_preference = 0
555 f.get('quality') if f.get('quality') is not None else -1,
556 f.get('height') if f.get('height') is not None else -1,
557 f.get('width') if f.get('width') is not None else -1,
559 f.get('tbr') if f.get('tbr') is not None else -1,
560 f.get('vbr') if f.get('vbr') is not None else -1,
561 f.get('abr') if f.get('abr') is not None else -1,
562 audio_ext_preference,
563 f.get('filesize') if f.get('filesize') is not None else -1,
564 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
567 formats.sort(key=_formats_key)
569 def http_scheme(self):
570 """ Either "http:" or "https:", depending on the user's preferences """
573 if self._downloader.params.get('prefer_insecure', False)
576 def _proto_relative_url(self, url, scheme=None):
579 if url.startswith('//'):
581 scheme = self.http_scheme()
586 def _sleep(self, timeout, video_id, msg_template=None):
587 if msg_template is None:
588 msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
589 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
594 class SearchInfoExtractor(InfoExtractor):
596 Base class for paged search queries extractors.
597 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
598 Instances should define _SEARCH_KEY and _MAX_RESULTS.
602 def _make_valid_url(cls):
603 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Return True if url is a search query in this extractor's format."""
    query_pattern = cls._make_valid_url()
    matched = re.match(query_pattern, url)
    return matched is not None
609 def _real_extract(self, query):
610 mobj = re.match(self._make_valid_url(), query)
612 raise ExtractorError(u'Invalid search query "%s"' % query)
614 prefix = mobj.group('prefix')
615 query = mobj.group('query')
617 return self._get_n_results(query, 1)
618 elif prefix == 'all':
619 return self._get_n_results(query, self._MAX_RESULTS)
623 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
624 elif n > self._MAX_RESULTS:
625 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
626 n = self._MAX_RESULTS
627 return self._get_n_results(query, n)
629 def _get_n_results(self, query, n):
630 """Get a specified number of results for a query"""
631 raise NotImplementedError("This method must be implemented by subclasses")
def SEARCH_KEY(self):
    """Return the _SEARCH_KEY attribute of this extractor."""
    search_key = self._SEARCH_KEY
    return search_key