9 import xml.etree.ElementTree
14 compat_urllib_parse_urlparse,
24 _NO_DEFAULT = object()
27 class InfoExtractor(object):
28 """Information Extractor class.
30 Information extractors are the classes that, given a URL, extract
31 information about the video (or videos) the URL refers to. This
32 information includes the real video URL, the video title, author and
33 others. The information is stored in a dictionary which is then
34 passed to the FileDownloader. The FileDownloader processes this
35 information possibly downloading the video to the file system, among
36 other possible outcomes.
38 The dictionaries must include the following fields:
41 title: Video title, unescaped.
43 Additionally, it must contain either a formats entry or a url one:
45 formats: A list of dictionaries for each format available, ordered
46 from worst to best quality.
49 * url Mandatory. The URL of the video file
50 * ext Will be calculated from url if missing
51 * format A human-readable description of the format
52 ("mp4 container with h264/opus").
53                              Calculated from the format_id, width, height,
54                              and format_note fields if missing.
55 * format_id A short description of the format
56 ("mp4_h264_opus" or "19").
57 Technically optional, but strongly recommended.
58 * format_note Additional info about the format
59 ("3D" or "DASH video")
60 * width Width of the video, if known
61 * height Height of the video, if known
62 * resolution Textual description of width and height
63 * tbr Average bitrate of audio and video in KBit/s
64 * abr Average audio bitrate in KBit/s
65 * acodec Name of the audio codec in use
66 * asr Audio sampling rate in Hertz
67 * vbr Average video bitrate in KBit/s
68 * vcodec Name of the video codec in use
69 * container Name of the container format
70 * filesize The number of bytes, if known in advance
71 * player_url SWF Player URL (used for rtmpdump).
72 * protocol The protocol that will be used for the actual
74 "http", "https", "rtsp", "rtmp", "m3u8" or so.
75 * preference Order number of this format. If this field is
76 present and not None, the formats get sorted
78 -1 for default (order by other properties),
79 -2 or smaller for less than default.
80 * quality Order number of the video quality of this
81 format, irrespective of the file format.
82 -1 for default (order by other properties),
83 -2 or smaller for less than default.
85 ext: Video filename extension.
86 format: The video format, defaults to ext (used for --get-format)
87 player_url: SWF Player URL (used for rtmpdump).
89 The following fields are optional:
91 display_id An alternative identifier for the video, not necessarily
92 unique, but available before title. Typically, id is
93 something like "4234987", title "Dancing naked mole rats",
94 and display_id "dancing-naked-mole-rats"
95 thumbnails: A list of dictionaries (with the entries "resolution" and
96 "url") for the varying thumbnails
97 thumbnail: Full URL to a video thumbnail image.
98 description: One-line video description.
99 uploader: Full name of the video uploader.
100 upload_date: Video upload date (YYYYMMDD).
101 uploader_id: Nickname or id of the video uploader.
102 location: Physical location of the video.
103 subtitles: The subtitle file contents as a dictionary in the format
104 {language: subtitles}.
105 duration: Length of the video in seconds, as an integer.
106 view_count: How many users have watched the video on the platform.
107 like_count: Number of positive ratings of the video
108 dislike_count: Number of negative ratings of the video
109 comment_count: Number of comments on the video
110 age_limit: Age restriction for the video, as an integer (years)
111 webpage_url: The url to the video webpage, if given to youtube-dl it
112 should allow to get the same result again. (It will be set
113 by YoutubeDL if it's missing)
115 Unless mentioned otherwise, the fields should be Unicode strings.
117 Subclasses of this one should re-define the _real_initialize() and
118 _real_extract() methods and define a _VALID_URL regexp.
119 Probably, they should also be added to the list of extractors.
121 Finally, the _WORKING attribute should be set to False for broken IEs
122 in order to warn the users and skip the tests.
129 def __init__(self, downloader=None):
130 """Constructor. Receives an optional downloader."""
132 self.set_downloader(downloader)
135 def suitable(cls, url):
136 """Receives a URL and returns True if suitable for this IE."""
138 # This does not use has/getattr intentionally - we want to know whether
139 # we have cached the regexp for *this* class, whereas getattr would also
140 # match the superclass
141 if '_VALID_URL_RE' not in cls.__dict__:
142 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
143 return cls._VALID_URL_RE.match(url) is not None
147 """Getter method for _WORKING."""
150 def initialize(self):
151 """Initializes an instance (authentication, etc)."""
153 self._real_initialize()
156 def extract(self, url):
157 """Extracts URL information and returns it in list of dicts."""
159 return self._real_extract(url)
161 def set_downloader(self, downloader):
162 """Sets the downloader for this IE."""
163 self._downloader = downloader
165 def _real_initialize(self):
166 """Real initialization process. Redefine in subclasses."""
169 def _real_extract(self, url):
170 """Real extraction process. Redefine in subclasses."""
175 """A string for getting the InfoExtractor with get_info_extractor"""
176 return cls.__name__[:-2]
180 return type(self).__name__[:-2]
182 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
183 """ Returns the response handle """
185 self.report_download_webpage(video_id)
186 elif note is not False:
188 self.to_screen(u'%s' % (note,))
190 self.to_screen(u'%s: %s' % (video_id, note))
192 return self._downloader.urlopen(url_or_request)
193 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
197 errnote = u'Unable to download webpage'
198 errmsg = u'%s: %s' % (errnote, compat_str(err))
200 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
202 self._downloader.report_warning(errmsg)
205 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
206 """ Returns a tuple (page content as string, URL handle) """
208 # Strip hashes from the URL (#1038)
209 if isinstance(url_or_request, (compat_str, str)):
210 url_or_request = url_or_request.partition('#')[0]
212 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
216 content_type = urlh.headers.get('Content-Type', '')
217 webpage_bytes = urlh.read()
218 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
220 encoding = m.group(1)
222 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
223 webpage_bytes[:1024])
225 encoding = m.group(1).decode('ascii')
226 elif webpage_bytes.startswith(b'\xff\xfe'):
230 if self._downloader.params.get('dump_intermediate_pages', False):
232 url = url_or_request.get_full_url()
233 except AttributeError:
235 self.to_screen(u'Dumping request to ' + url)
236 dump = base64.b64encode(webpage_bytes).decode('ascii')
237 self._downloader.to_screen(dump)
238 if self._downloader.params.get('write_pages', False):
240 url = url_or_request.get_full_url()
241 except AttributeError:
244 h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
245 url = url[:200 - len(h)] + h
246 raw_filename = ('%s_%s.dump' % (video_id, url))
247 filename = sanitize_filename(raw_filename, restricted=True)
248 self.to_screen(u'Saving request to ' + filename)
249 with open(filename, 'wb') as outf:
250 outf.write(webpage_bytes)
252 content = webpage_bytes.decode(encoding, 'replace')
253 return (content, urlh)
255 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
256 """ Returns the data of the page as a string """
257 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
264 def _download_xml(self, url_or_request, video_id,
265 note=u'Downloading XML', errnote=u'Unable to download XML',
266 transform_source=None):
267 """Return the xml as an xml.etree.ElementTree.Element"""
268 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
270 xml_string = transform_source(xml_string)
271 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
273 def _download_json(self, url_or_request, video_id,
274 note=u'Downloading JSON metadata',
275 errnote=u'Unable to download JSON metadata',
276 transform_source=None):
277 json_string = self._download_webpage(url_or_request, video_id, note, errnote)
279 json_string = transform_source(json_string)
281 return json.loads(json_string)
282 except ValueError as ve:
283 raise ExtractorError('Failed to download JSON', cause=ve)
285 def report_warning(self, msg, video_id=None):
286 idstr = u'' if video_id is None else u'%s: ' % video_id
287 self._downloader.report_warning(
288 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
290 def to_screen(self, msg):
291 """Print msg to screen, prefixing it with '[ie_name]'"""
292 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
294 def report_extraction(self, id_or_name):
295 """Report information extraction."""
296 self.to_screen(u'%s: Extracting information' % id_or_name)
298 def report_download_webpage(self, video_id):
299 """Report webpage download."""
300 self.to_screen(u'%s: Downloading webpage' % video_id)
302 def report_age_confirmation(self):
303 """Report attempt to confirm age."""
304 self.to_screen(u'Confirming age')
306 def report_login(self):
307 """Report attempt to log in."""
308 self.to_screen(u'Logging in')
310     # Methods for following #608
312 def url_result(url, ie=None, video_id=None):
313 """Returns a url that points to a page that should be processed"""
314 #TODO: ie should be the class used for getting the info
315 video_info = {'_type': 'url',
318 if video_id is not None:
319 video_info['id'] = video_id
322 def playlist_result(entries, playlist_id=None, playlist_title=None):
323 """Returns a playlist"""
324 video_info = {'_type': 'playlist',
327 video_info['id'] = playlist_id
329 video_info['title'] = playlist_title
332 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
334 Perform a regex search on the given string, using a single or a list of
335 patterns returning the first matching group.
336 In case of failure return a default value or raise a WARNING or a
337 RegexNotFoundError, depending on fatal, specifying the field name.
339 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
340 mobj = re.search(pattern, string, flags)
343 mobj = re.search(p, string, flags)
346 if os.name != 'nt' and sys.stderr.isatty():
347 _name = u'\033[0;34m%s\033[0m' % name
352 # return the first matching group
353 return next(g for g in mobj.groups() if g is not None)
354 elif default is not _NO_DEFAULT:
357 raise RegexNotFoundError(u'Unable to extract %s' % _name)
359 self._downloader.report_warning(u'unable to extract %s; '
360 u'please report this issue on http://yt-dl.org/bug' % _name)
363 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
365 Like _search_regex, but strips HTML tags and unescapes entities.
367 res = self._search_regex(pattern, string, name, default, fatal, flags)
369 return clean_html(res).strip()
373 def _get_login_info(self):
375         Get the login info as (username, password)
376 It will look in the netrc file using the _NETRC_MACHINE value
377 If there's no info available, return (None, None)
379 if self._downloader is None:
384 downloader_params = self._downloader.params
386 # Attempt to use provided username and password or .netrc data
387 if downloader_params.get('username', None) is not None:
388 username = downloader_params['username']
389 password = downloader_params['password']
390 elif downloader_params.get('usenetrc', False):
392 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
397 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
398 except (IOError, netrc.NetrcParseError) as err:
399 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
401 return (username, password)
403 # Helper functions for extracting OpenGraph info
405 def _og_regexes(prop):
406 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
407 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
408 template = r'<meta[^>]+?%s[^>]+?%s'
410 template % (property_re, content_re),
411 template % (content_re, property_re),
414 def _og_search_property(self, prop, html, name=None, **kargs):
416 name = 'OpenGraph %s' % prop
417 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
420 return unescapeHTML(escaped)
422 def _og_search_thumbnail(self, html, **kargs):
423 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
425 def _og_search_description(self, html, **kargs):
426 return self._og_search_property('description', html, fatal=False, **kargs)
428 def _og_search_title(self, html, **kargs):
429 return self._og_search_property('title', html, **kargs)
431 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
432 regexes = self._og_regexes('video')
433 if secure: regexes = self._og_regexes('video:secure_url') + regexes
434 return self._html_search_regex(regexes, html, name, **kargs)
436 def _html_search_meta(self, name, html, display_name=None, fatal=False):
437 if display_name is None:
439 return self._html_search_regex(
441 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
442 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
443 html, display_name, fatal=fatal)
445 def _dc_search_uploader(self, html):
446 return self._html_search_meta('dc.creator', html, 'uploader')
448 def _rta_search(self, html):
449 # See http://www.rtalabel.org/index.php?content=howtofaq#single
450 if re.search(r'(?ix)<meta\s+name="rating"\s+'
451 r' content="RTA-5042-1996-1400-1577-RTA"',
456 def _media_rating_search(self, html):
457 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
458 rating = self._html_search_meta('rating', html)
470 return RATING_TABLE.get(rating.lower(), None)
472 def _twitter_search_player(self, html):
473 return self._html_search_meta('twitter:player', html,
474 'twitter card player')
476 def _sort_formats(self, formats):
478 raise ExtractorError(u'No video formats found')
481 # TODO remove the following workaround
482 from ..utils import determine_ext
483 if not f.get('ext') and 'url' in f:
484 f['ext'] = determine_ext(f['url'])
486 preference = f.get('preference')
487 if preference is None:
488 proto = f.get('protocol')
490 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
492 preference = 0 if proto in ['http', 'https'] else -0.1
493 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
496 if f.get('vcodec') == 'none': # audio only
497 if self._downloader.params.get('prefer_free_formats'):
498 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
500 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
503 audio_ext_preference = ORDER.index(f['ext'])
505 audio_ext_preference = -1
507 if self._downloader.params.get('prefer_free_formats'):
508 ORDER = [u'flv', u'mp4', u'webm']
510 ORDER = [u'webm', u'flv', u'mp4']
512 ext_preference = ORDER.index(f['ext'])
515 audio_ext_preference = 0
519 f.get('quality') if f.get('quality') is not None else -1,
520 f.get('height') if f.get('height') is not None else -1,
521 f.get('width') if f.get('width') is not None else -1,
523 f.get('tbr') if f.get('tbr') is not None else -1,
524 f.get('vbr') if f.get('vbr') is not None else -1,
525 f.get('abr') if f.get('abr') is not None else -1,
526 audio_ext_preference,
527 f.get('filesize') if f.get('filesize') is not None else -1,
530 formats.sort(key=_formats_key)
533 class SearchInfoExtractor(InfoExtractor):
535 Base class for paged search queries extractors.
536 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
537 Instances should define _SEARCH_KEY and _MAX_RESULTS.
541 def _make_valid_url(cls):
542 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
545 def suitable(cls, url):
546 return re.match(cls._make_valid_url(), url) is not None
548 def _real_extract(self, query):
549 mobj = re.match(self._make_valid_url(), query)
551 raise ExtractorError(u'Invalid search query "%s"' % query)
553 prefix = mobj.group('prefix')
554 query = mobj.group('query')
556 return self._get_n_results(query, 1)
557 elif prefix == 'all':
558 return self._get_n_results(query, self._MAX_RESULTS)
562 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
563 elif n > self._MAX_RESULTS:
564 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
565 n = self._MAX_RESULTS
566 return self._get_n_results(query, n)
568 def _get_n_results(self, query, n):
569 """Get a specified number of results for a query"""
570 raise NotImplementedError("This method must be implemented by subclasses")
573 def SEARCH_KEY(self):
574 return self._SEARCH_KEY