1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
18 compat_urllib_parse_urlparse,
# Sentinel for "no default supplied" in _search_regex()/_html_search_regex(),
# so that an explicit default of None can be distinguished (checked by
# identity: `default is not _NO_DEFAULT`).
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_referer  HTTP Referer header value to set.
                    * http_method   HTTP method to use for the download.
                    * http_headers  A dictionary of additional HTTP headers
                                    to add to the request.
                    * http_post_data  Additional data to send with a POST
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this chunk appears to have been truncated during
    # extraction — decorators (@classmethod/@staticmethod/@property), several
    # if/try/else header lines and return statements are missing, leaving
    # some methods below syntactically incomplete. Comments mark the most
    # conspicuous gaps; confirm against the complete file before editing.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    # Presumably a @classmethod upstream — decorator not visible in this chunk.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    # Presumably a @classmethod upstream — decorator not visible in this chunk.
    def _match_id(cls, url):
        # Same per-class regexp caching as suitable(); whatever is done with
        # the match object `m` (probably extracting an id group) is not
        # visible in this chunk.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)

        # Orphaned fragment: the def line of the method this docstring
        # belongs to (a _WORKING accessor) is missing from this chunk.
        """Getter method for _WORKING."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # Orphaned fragments: the def lines of an ie_key()-style classmethod
        # and an IE_NAME-style property are missing from this chunk; only
        # their bodies remain (note the otherwise-undefined `cls`).
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # Progress reporting: default message when note is None, suppressed
        # when note is False, otherwise "<note>" or "<video_id>: <note>".
        # (Several branch headers are missing from this chunk.)
            self.report_download_webpage(video_id)
        elif note is not False:
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))
        # Delegate the actual request to the downloader's opener.
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            # When fatal, re-raise as ExtractorError keeping traceback and
            # cause; otherwise only warn.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
                self._downloader.report_warning(errmsg)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Encoding detection, in order: 1) charset= in the Content-Type header…
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # …2) a <meta charset=…> tag within the first KiB of the body…
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
            # …3) a UTF-16 little-endian byte-order mark.
            elif webpage_bytes.startswith(b'\xff\xfe'):
        # --dump-intermediate-pages: base64-encode the raw bytes to the screen.
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # --write-pages: save the raw bytes to a sanitized dump file.
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
            # Cap the filename length by replacing the tail with an md5 of
            # the full name (filesystem filename limits).
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # Decode with the detected encoding, falling back to UTF-8; 'replace'
        # avoids raising on undecodable bytes in either path.
            content = webpage_bytes.decode(encoding, 'replace')
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect Websense block pages and fail with an explanatory message.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)

    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        # A non-fatal failed download yields False instead of a string.
        if xml_string is False:
            # transform_source hook to fix up broken server XML before
            # parsing (its guarding `if` is missing from this chunk).
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if (not fatal) and json_string is False:
            # Optional hook to fix up broken server JSON before parsing.
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            # When fatal, parse errors become ExtractorError; otherwise warn.
                raise ExtractorError(errmsg, cause=ve)
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        """Forward a warning to the downloader, prefixed with '[ie_name]' (and id)."""
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    # Methods for following #608

    # Presumably a @staticmethod upstream — decorator not visible in this chunk.
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        # (the remaining dict entries and the return are missing here)
        video_info = {'_type': 'url',
        if video_id is not None:
            video_info['id'] = video_id

    # Presumably a @staticmethod upstream — decorator not visible in this chunk.
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # (the remaining dict entries, guards and the return are missing here)
        video_info = {'_type': 'playlist',
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            # List of patterns: try each in turn (loop header missing here).
                mobj = re.search(p, string, flags)

        # Colorize the field name on capable terminals (ANSI blue).
        if os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name

            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            # An explicitly supplied default wins over raising/warning.
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s; '
                'please report this issue on http://yt-dl.org/bug' % _name)

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
            return clean_html(res).strip()

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and fall through.
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)

    def _get_tfa_info(self):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']

    # Helper functions for extracting OpenGraph info

    # Presumably a @staticmethod upstream — decorator not visible in this chunk.
    def _og_regexes(prop):
        # OpenGraph meta tags may have property/content in either order,
        # hence the two templates (the surrounding `return [...]` is missing).
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),

    def _og_search_property(self, prop, html, name=None, **kargs):
        # Default human-readable name, e.g. "OpenGraph title".
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
            # Prefer og:video:secure_url when secure is requested.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if display_name is None:
        # Matches <meta> tags keyed by itemprop/name/property (the raw-string
        # pattern's opening line is missing from this chunk).
        return self._html_search_regex(
                (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal, **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # (the closing of this call and the returns are missing here)
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # RATING_TABLE itself is defined in lines missing from this chunk.
        return RATING_TABLE.get(rating.lower(), None)

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
            'twitter card player')

    def _sort_formats(self, formats):
            raise ExtractorError('No video formats found')

        # Key function for the final sort; its `def _formats_key(f):` header
        # and parts of its body are missing from this chunk.
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain http(s) downloads are slightly preferred over other
                # protocols.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                    ORDER = ['webm', 'flv', 'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0

            # Sort-key tuple entries, worst-to-best significance; -1 stands
            # for "unknown" so known values always rank above missing ones.
            # (The `return (` header and some entries are missing here.)
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
        formats.sort(key=_formats_key)

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # Part of a conditional expression; surrounding lines missing here.
            if self._downloader.params.get('prefer_insecure', False)

    def _proto_relative_url(self, url, scheme=None):
        # Resolve protocol-relative ("//host/...") URLs against the preferred
        # scheme (several guard/return lines are missing from this chunk).
        if url.startswith('//'):
                scheme = self.http_scheme()

    def _sleep(self, timeout, video_id, msg_template=None):
        # Announce and perform a rate-limiting pause (the to_screen/sleep
        # calls are missing from this chunk).
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}

    def _extract_f4m_formats(self, manifest_url, video_id):
        # Parse an Adobe HDS (f4m) manifest into youtube-dl format dicts.
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest')

        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            # Use the bitrate in the id when available, else the index.
            format_id = 'f4m-%d' % (i if tbr is None else tbr)
                'format_id': format_id,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
        self._sort_formats(formats)

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        # Seed format: the master playlist itself as a meta entry
        # (surrounding list/dict lines are missing from this chunk).
            'format_id': 'm3u8-meta',
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',

        # Resolve relative segment/variant URLs against the playlist URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(m3u8_url, video_id)
        # KEY=VALUE attribute pairs on #EXT-X-STREAM-INF lines; values may be
        # quoted (the `re.compile(` opening line is missing here).
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Remember this variant's attributes for the URL line below.
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # A variant URL line.
                if last_info is None:
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                codecs = last_info.get('CODECS')
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
        self._sort_formats(formats)

    def _live_title(self, name):
        """ Generate the title for a live video """
        now = datetime.datetime.now()
        now_str = now.strftime("%Y-%m-%d %H:%M")
        return name + ' ' + now_str

    def _int(self, v, name, fatal=False, **kwargs):
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): bare print() here looks like leftover debug
            # output — it should not ship; flag for removal.
            print(getattr(v, kwargs['get_attr']))
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            # fatal decides between raising and warning on parse failure.
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)

    def _float(self, v, name, fatal=False, **kwargs):
        res = float_or_none(v, **kwargs)
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            # fatal decides between raising and warning on parse failure.
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    # NOTE(review): decorators (@classmethod/@property) and several branch
    # headers appear to be missing from this chunk; confirm against the
    # complete file.

    # Presumably a @classmethod upstream — decorator not visible here.
    def _make_valid_url(cls):
        # "<key>:<query>", "<key><N>:<query>" or "<key>all:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    # Presumably a @classmethod upstream — decorator not visible here.
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        # Dispatch on the numeric/"all" prefix to decide how many results to
        # fetch (several branch headers are missing from this chunk).
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix: a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            # Numeric prefix: validate and clamp to the extractor's maximum.
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    # Presumably a @property upstream — decorator not visible here.
    def SEARCH_KEY(self):
        return self._SEARCH_KEY