10 import xml.etree.ElementTree
15 compat_urllib_parse_urlparse,
26 _NO_DEFAULT = object()
29 class InfoExtractor(object):
30 """Information Extractor class.
32 Information extractors are the classes that, given a URL, extract
33 information about the video (or videos) the URL refers to. This
34 information includes the real video URL, the video title, author and
35 others. The information is stored in a dictionary which is then
36 passed to the FileDownloader. The FileDownloader processes this
37 information possibly downloading the video to the file system, among
38 other possible outcomes.
40 The dictionaries must include the following fields:
43 title: Video title, unescaped.
45 Additionally, it must contain either a formats entry or a url one:
47 formats: A list of dictionaries for each format available, ordered
48 from worst to best quality.
51 * url Mandatory. The URL of the video file
52 * ext Will be calculated from url if missing
53 * format A human-readable description of the format
54 ("mp4 container with h264/opus").
55 Calculated from the format_id, width, height.
56 and format_note fields if missing.
57 * format_id A short description of the format
58 ("mp4_h264_opus" or "19").
59 Technically optional, but strongly recommended.
60 * format_note Additional info about the format
61 ("3D" or "DASH video")
62 * width Width of the video, if known
63 * height Height of the video, if known
64 * resolution Textual description of width and height
65 * tbr Average bitrate of audio and video in KBit/s
66 * abr Average audio bitrate in KBit/s
67 * acodec Name of the audio codec in use
68 * asr Audio sampling rate in Hertz
69 * vbr Average video bitrate in KBit/s
70 * vcodec Name of the video codec in use
71 * container Name of the container format
72 * filesize The number of bytes, if known in advance
73 * filesize_approx An estimate for the number of bytes
74 * player_url SWF Player URL (used for rtmpdump).
75 * protocol The protocol that will be used for the actual
77 "http", "https", "rtsp", "rtmp", "m3u8" or so.
78 * preference Order number of this format. If this field is
79 present and not None, the formats get sorted
80 by this field, regardless of all other values.
81 -1 for default (order by other properties),
82 -2 or smaller for less than default.
83 * quality Order number of the video quality of this
84 format, irrespective of the file format.
85 -1 for default (order by other properties),
86 -2 or smaller for less than default.
87 * http_referer HTTP Referer header value to set.
88 * http_method HTTP method to use for the download.
89 * http_headers A dictionary of additional HTTP headers
90 to add to the request.
91 * http_post_data Additional data to send with a POST
94 ext: Video filename extension.
95 format: The video format, defaults to ext (used for --get-format)
96 player_url: SWF Player URL (used for rtmpdump).
98 The following fields are optional:
100 display_id An alternative identifier for the video, not necessarily
101 unique, but available before title. Typically, id is
102 something like "4234987", title "Dancing naked mole rats",
103 and display_id "dancing-naked-mole-rats"
104 thumbnails: A list of dictionaries, with the following entries:
106 * "width" (optional, int)
107 * "height" (optional, int)
108                        * "resolution" (optional, string "{width}x{height}",
110 thumbnail: Full URL to a video thumbnail image.
111 description: One-line video description.
112 uploader: Full name of the video uploader.
113 timestamp: UNIX timestamp of the moment the video became available.
114 upload_date: Video upload date (YYYYMMDD).
115 If not explicitly set, calculated from timestamp.
116 uploader_id: Nickname or id of the video uploader.
117 location: Physical location of the video.
118 subtitles: The subtitle file contents as a dictionary in the format
119 {language: subtitles}.
120 duration: Length of the video in seconds, as an integer.
121 view_count: How many users have watched the video on the platform.
122 like_count: Number of positive ratings of the video
123 dislike_count: Number of negative ratings of the video
124 comment_count: Number of comments on the video
125 age_limit: Age restriction for the video, as an integer (years)
126 webpage_url: The url to the video webpage, if given to youtube-dl it
127 should allow to get the same result again. (It will be set
128 by YoutubeDL if it's missing)
129 categories: A list of categories that the video falls in, for example
132 Unless mentioned otherwise, the fields should be Unicode strings.
134 Subclasses of this one should re-define the _real_initialize() and
135 _real_extract() methods and define a _VALID_URL regexp.
136 Probably, they should also be added to the list of extractors.
138 Finally, the _WORKING attribute should be set to False for broken IEs
139 in order to warn the users and skip the tests.
146 def __init__(self, downloader=None):
147 """Constructor. Receives an optional downloader."""
149 self.set_downloader(downloader)
152 def suitable(cls, url):
153 """Receives a URL and returns True if suitable for this IE."""
155 # This does not use has/getattr intentionally - we want to know whether
156 # we have cached the regexp for *this* class, whereas getattr would also
157 # match the superclass
158 if '_VALID_URL_RE' not in cls.__dict__:
159 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
160 return cls._VALID_URL_RE.match(url) is not None
164 """Getter method for _WORKING."""
167 def initialize(self):
168 """Initializes an instance (authentication, etc)."""
170 self._real_initialize()
173 def extract(self, url):
174 """Extracts URL information and returns it in list of dicts."""
176 return self._real_extract(url)
178 def set_downloader(self, downloader):
179 """Sets the downloader for this IE."""
180 self._downloader = downloader
182 def _real_initialize(self):
183 """Real initialization process. Redefine in subclasses."""
186 def _real_extract(self, url):
187 """Real extraction process. Redefine in subclasses."""
192 """A string for getting the InfoExtractor with get_info_extractor"""
193 return cls.__name__[:-2]
197 return type(self).__name__[:-2]
199 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
200 """ Returns the response handle """
202 self.report_download_webpage(video_id)
203 elif note is not False:
205 self.to_screen(u'%s' % (note,))
207 self.to_screen(u'%s: %s' % (video_id, note))
209 return self._downloader.urlopen(url_or_request)
210 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
214 errnote = u'Unable to download webpage'
215 errmsg = u'%s: %s' % (errnote, compat_str(err))
217 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
219 self._downloader.report_warning(errmsg)
222 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
223 """ Returns a tuple (page content as string, URL handle) """
225 # Strip hashes from the URL (#1038)
226 if isinstance(url_or_request, (compat_str, str)):
227 url_or_request = url_or_request.partition('#')[0]
229 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
233 content_type = urlh.headers.get('Content-Type', '')
234 webpage_bytes = urlh.read()
235 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
237 encoding = m.group(1)
239 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
240 webpage_bytes[:1024])
242 encoding = m.group(1).decode('ascii')
243 elif webpage_bytes.startswith(b'\xff\xfe'):
247 if self._downloader.params.get('dump_intermediate_pages', False):
249 url = url_or_request.get_full_url()
250 except AttributeError:
252 self.to_screen(u'Dumping request to ' + url)
253 dump = base64.b64encode(webpage_bytes).decode('ascii')
254 self._downloader.to_screen(dump)
255 if self._downloader.params.get('write_pages', False):
257 url = url_or_request.get_full_url()
258 except AttributeError:
260 basen = '%s_%s' % (video_id, url)
262 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
263 basen = basen[:240 - len(h)] + h
264 raw_filename = basen + '.dump'
265 filename = sanitize_filename(raw_filename, restricted=True)
266 self.to_screen(u'Saving request to ' + filename)
267 with open(filename, 'wb') as outf:
268 outf.write(webpage_bytes)
271 content = webpage_bytes.decode(encoding, 'replace')
273 content = webpage_bytes.decode('utf-8', 'replace')
275 if (u'<title>Access to this site is blocked</title>' in content and
276 u'Websense' in content[:512]):
277 msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
278 blocked_iframe = self._html_search_regex(
279 r'<iframe src="([^"]+)"', content,
280 u'Websense information URL', default=None)
282 msg += u' Visit %s for more details' % blocked_iframe
283 raise ExtractorError(msg, expected=True)
285 return (content, urlh)
287 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
288 """ Returns the data of the page as a string """
289 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
296 def _download_xml(self, url_or_request, video_id,
297 note=u'Downloading XML', errnote=u'Unable to download XML',
298 transform_source=None, fatal=True):
299 """Return the xml as an xml.etree.ElementTree.Element"""
300 xml_string = self._download_webpage(
301 url_or_request, video_id, note, errnote, fatal=fatal)
302 if xml_string is False:
305 xml_string = transform_source(xml_string)
306 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
308 def _download_json(self, url_or_request, video_id,
309 note=u'Downloading JSON metadata',
310 errnote=u'Unable to download JSON metadata',
311 transform_source=None,
313 json_string = self._download_webpage(
314 url_or_request, video_id, note, errnote, fatal=fatal)
315 if (not fatal) and json_string is False:
318 json_string = transform_source(json_string)
320 return json.loads(json_string)
321 except ValueError as ve:
322 raise ExtractorError('Failed to download JSON', cause=ve)
324 def report_warning(self, msg, video_id=None):
325 idstr = u'' if video_id is None else u'%s: ' % video_id
326 self._downloader.report_warning(
327 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
329 def to_screen(self, msg):
330 """Print msg to screen, prefixing it with '[ie_name]'"""
331 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
333 def report_extraction(self, id_or_name):
334 """Report information extraction."""
335 self.to_screen(u'%s: Extracting information' % id_or_name)
337 def report_download_webpage(self, video_id):
338 """Report webpage download."""
339 self.to_screen(u'%s: Downloading webpage' % video_id)
341 def report_age_confirmation(self):
342 """Report attempt to confirm age."""
343 self.to_screen(u'Confirming age')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 #Methods for following #608
351 def url_result(url, ie=None, video_id=None):
352 """Returns a url that points to a page that should be processed"""
353 #TODO: ie should be the class used for getting the info
354 video_info = {'_type': 'url',
357 if video_id is not None:
358 video_info['id'] = video_id
361 def playlist_result(entries, playlist_id=None, playlist_title=None):
362 """Returns a playlist"""
363 video_info = {'_type': 'playlist',
366 video_info['id'] = playlist_id
368 video_info['title'] = playlist_title
371 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
373 Perform a regex search on the given string, using a single or a list of
374 patterns returning the first matching group.
375 In case of failure return a default value or raise a WARNING or a
376 RegexNotFoundError, depending on fatal, specifying the field name.
378 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
379 mobj = re.search(pattern, string, flags)
382 mobj = re.search(p, string, flags)
386 if os.name != 'nt' and sys.stderr.isatty():
387 _name = u'\033[0;34m%s\033[0m' % name
392 # return the first matching group
393 return next(g for g in mobj.groups() if g is not None)
394 elif default is not _NO_DEFAULT:
397 raise RegexNotFoundError(u'Unable to extract %s' % _name)
399 self._downloader.report_warning(u'unable to extract %s; '
400 u'please report this issue on http://yt-dl.org/bug' % _name)
403 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
405 Like _search_regex, but strips HTML tags and unescapes entities.
407 res = self._search_regex(pattern, string, name, default, fatal, flags)
409 return clean_html(res).strip()
413 def _get_login_info(self):
415         Get the login info as (username, password)
416 It will look in the netrc file using the _NETRC_MACHINE value
417 If there's no info available, return (None, None)
419 if self._downloader is None:
424 downloader_params = self._downloader.params
426 # Attempt to use provided username and password or .netrc data
427 if downloader_params.get('username', None) is not None:
428 username = downloader_params['username']
429 password = downloader_params['password']
430 elif downloader_params.get('usenetrc', False):
432 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
437 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
438 except (IOError, netrc.NetrcParseError) as err:
439 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
441 return (username, password)
443 def _get_tfa_info(self):
445 Get the two-factor authentication info
446 TODO - asking the user will be required for sms/phone verify
447 currently just uses the command line option
448 If there's no info available, return None
450 if self._downloader is None:
452 downloader_params = self._downloader.params
454 if downloader_params.get('twofactor', None) is not None:
455 return downloader_params['twofactor']
459 # Helper functions for extracting OpenGraph info
461 def _og_regexes(prop):
462 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
463 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
464 template = r'<meta[^>]+?%s[^>]+?%s'
466 template % (property_re, content_re),
467 template % (content_re, property_re),
470 def _og_search_property(self, prop, html, name=None, **kargs):
472 name = 'OpenGraph %s' % prop
473 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
476 return unescapeHTML(escaped)
478 def _og_search_thumbnail(self, html, **kargs):
479 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
481 def _og_search_description(self, html, **kargs):
482 return self._og_search_property('description', html, fatal=False, **kargs)
484 def _og_search_title(self, html, **kargs):
485 return self._og_search_property('title', html, **kargs)
487 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
488 regexes = self._og_regexes('video') + self._og_regexes('video:url')
490 regexes = self._og_regexes('video:secure_url') + regexes
491 return self._html_search_regex(regexes, html, name, **kargs)
493 def _og_search_url(self, html, **kargs):
494 return self._og_search_property('url', html, **kargs)
496 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
497 if display_name is None:
499 return self._html_search_regex(
501 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
502 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
503 html, display_name, fatal=fatal, **kwargs)
505 def _dc_search_uploader(self, html):
506 return self._html_search_meta('dc.creator', html, 'uploader')
508 def _rta_search(self, html):
509 # See http://www.rtalabel.org/index.php?content=howtofaq#single
510 if re.search(r'(?ix)<meta\s+name="rating"\s+'
511 r' content="RTA-5042-1996-1400-1577-RTA"',
516 def _media_rating_search(self, html):
517 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
518 rating = self._html_search_meta('rating', html)
530 return RATING_TABLE.get(rating.lower(), None)
532 def _twitter_search_player(self, html):
533 return self._html_search_meta('twitter:player', html,
534 'twitter card player')
536 def _sort_formats(self, formats):
538 raise ExtractorError(u'No video formats found')
541 # TODO remove the following workaround
542 from ..utils import determine_ext
543 if not f.get('ext') and 'url' in f:
544 f['ext'] = determine_ext(f['url'])
546 preference = f.get('preference')
547 if preference is None:
548 proto = f.get('protocol')
550 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
552 preference = 0 if proto in ['http', 'https'] else -0.1
553 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
556 if f.get('vcodec') == 'none': # audio only
557 if self._downloader.params.get('prefer_free_formats'):
558 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
560 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
563 audio_ext_preference = ORDER.index(f['ext'])
565 audio_ext_preference = -1
567 if self._downloader.params.get('prefer_free_formats'):
568 ORDER = [u'flv', u'mp4', u'webm']
570 ORDER = [u'webm', u'flv', u'mp4']
572 ext_preference = ORDER.index(f['ext'])
575 audio_ext_preference = 0
579 f.get('quality') if f.get('quality') is not None else -1,
580 f.get('height') if f.get('height') is not None else -1,
581 f.get('width') if f.get('width') is not None else -1,
583 f.get('tbr') if f.get('tbr') is not None else -1,
584 f.get('vbr') if f.get('vbr') is not None else -1,
585 f.get('abr') if f.get('abr') is not None else -1,
586 audio_ext_preference,
587 f.get('filesize') if f.get('filesize') is not None else -1,
588 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
591 formats.sort(key=_formats_key)
593 def http_scheme(self):
594         """ Either "http:" or "https:", depending on the user's preferences """
597 if self._downloader.params.get('prefer_insecure', False)
600 def _proto_relative_url(self, url, scheme=None):
603 if url.startswith('//'):
605 scheme = self.http_scheme()
610 def _sleep(self, timeout, video_id, msg_template=None):
611 if msg_template is None:
612 msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
613 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
617 def _extract_f4m_formats(self, manifest_url, video_id):
618 manifest = self._download_xml(
619 manifest_url, video_id, 'Downloading f4m manifest',
620 'Unable to download f4m manifest')
623 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
624 for i, media_el in enumerate(media_nodes):
625 tbr = int_or_none(media_el.attrib.get('bitrate'))
626 format_id = 'f4m-%d' % (i if tbr is None else tbr)
628 'format_id': format_id,
632 'width': int_or_none(media_el.attrib.get('width')),
633 'height': int_or_none(media_el.attrib.get('height')),
635 self._sort_formats(formats)
640 class SearchInfoExtractor(InfoExtractor):
642 Base class for paged search queries extractors.
643 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
644 Instances should define _SEARCH_KEY and _MAX_RESULTS.
648 def _make_valid_url(cls):
649 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
652 def suitable(cls, url):
653 return re.match(cls._make_valid_url(), url) is not None
655 def _real_extract(self, query):
656 mobj = re.match(self._make_valid_url(), query)
658 raise ExtractorError(u'Invalid search query "%s"' % query)
660 prefix = mobj.group('prefix')
661 query = mobj.group('query')
663 return self._get_n_results(query, 1)
664 elif prefix == 'all':
665 return self._get_n_results(query, self._MAX_RESULTS)
669 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
670 elif n > self._MAX_RESULTS:
671 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
672 n = self._MAX_RESULTS
673 return self._get_n_results(query, n)
675 def _get_n_results(self, query, n):
676 """Get a specified number of results for a query"""
677 raise NotImplementedError("This method must be implemented by subclasses")
680 def SEARCH_KEY(self):
681 return self._SEARCH_KEY