1 from __future__ import unicode_literals
12 import xml.etree.ElementTree
17 compat_urllib_parse_urlparse,
# Unique sentinel meaning "no default was supplied", so that None remains a
# legitimate default value for _search_regex and friends.
28 _NO_DEFAULT = object()
# Base class for all site-specific extractors.  The body below is the class
# docstring describing the info-dict contract; the methods follow.
# NOTE(review): this capture is an elided listing — blank lines and some
# original lines are missing between the numbered lines.
31 class InfoExtractor(object):
32 """Information Extractor class.
34 Information extractors are the classes that, given a URL, extract
35 information about the video (or videos) the URL refers to. This
36 information includes the real video URL, the video title, author and
37 others. The information is stored in a dictionary which is then
38 passed to the FileDownloader. The FileDownloader processes this
39 information possibly downloading the video to the file system, among
40 other possible outcomes.
42 The dictionaries must include the following fields:
45 title: Video title, unescaped.
47 Additionally, it must contain either a formats entry or a url one:
49 formats: A list of dictionaries for each format available, ordered
50 from worst to best quality.
53 * url Mandatory. The URL of the video file
54 * ext Will be calculated from url if missing
55 * format A human-readable description of the format
56 ("mp4 container with h264/opus").
57 Calculated from the format_id, width, height,
58 and format_note fields if missing.
59 * format_id A short description of the format
60 ("mp4_h264_opus" or "19").
61 Technically optional, but strongly recommended.
62 * format_note Additional info about the format
63 ("3D" or "DASH video")
64 * width Width of the video, if known
65 * height Height of the video, if known
66 * resolution Textual description of width and height
67 * tbr Average bitrate of audio and video in KBit/s
68 * abr Average audio bitrate in KBit/s
69 * acodec Name of the audio codec in use
70 * asr Audio sampling rate in Hertz
71 * vbr Average video bitrate in KBit/s
72 * vcodec Name of the video codec in use
73 * container Name of the container format
74 * filesize The number of bytes, if known in advance
75 * filesize_approx An estimate for the number of bytes
76 * player_url SWF Player URL (used for rtmpdump).
77 * protocol The protocol that will be used for the actual
79 "http", "https", "rtsp", "rtmp", "m3u8" or so.
80 * preference Order number of this format. If this field is
81 present and not None, the formats get sorted
82 by this field, regardless of all other values.
83 -1 for default (order by other properties),
84 -2 or smaller for less than default.
85 * quality Order number of the video quality of this
86 format, irrespective of the file format.
87 -1 for default (order by other properties),
88 -2 or smaller for less than default.
89 * http_referer HTTP Referer header value to set.
90 * http_method HTTP method to use for the download.
91 * http_headers A dictionary of additional HTTP headers
92 to add to the request.
93 * http_post_data Additional data to send with a POST
96 ext: Video filename extension.
97 format: The video format, defaults to ext (used for --get-format)
98 player_url: SWF Player URL (used for rtmpdump).
100 The following fields are optional:
102 display_id An alternative identifier for the video, not necessarily
103 unique, but available before title. Typically, id is
104 something like "4234987", title "Dancing naked mole rats",
105 and display_id "dancing-naked-mole-rats"
106 thumbnails: A list of dictionaries, with the following entries:
108 * "width" (optional, int)
109 * "height" (optional, int)
110 * "resolution" (optional, string "{width}x{height}",
112 thumbnail: Full URL to a video thumbnail image.
113 description: One-line video description.
114 uploader: Full name of the video uploader.
115 timestamp: UNIX timestamp of the moment the video became available.
116 upload_date: Video upload date (YYYYMMDD).
117 If not explicitly set, calculated from timestamp.
118 uploader_id: Nickname or id of the video uploader.
119 location: Physical location where the video was filmed.
120 subtitles: The subtitle file contents as a dictionary in the format
121 {language: subtitles}.
122 duration: Length of the video in seconds, as an integer.
123 view_count: How many users have watched the video on the platform.
124 like_count: Number of positive ratings of the video
125 dislike_count: Number of negative ratings of the video
126 comment_count: Number of comments on the video
127 age_limit: Age restriction for the video, as an integer (years)
128 webpage_url: The url to the video webpage, if given to youtube-dl it
129 should allow to get the same result again. (It will be set
130 by YoutubeDL if it's missing)
131 categories: A list of categories that the video falls in, for example
133 is_live: True, False, or None (=unknown). Whether this video is a
134 live stream that goes on instead of a fixed-length video.
136 Unless mentioned otherwise, the fields should be Unicode strings.
138 Subclasses of this one should re-define the _real_initialize() and
139 _real_extract() methods and define a _VALID_URL regexp.
140 Probably, they should also be added to the list of extractors.
142 Finally, the _WORKING attribute should be set to False for broken IEs
143 in order to warn the users and skip the tests.
# Store the (optional) downloader via set_downloader().
# NOTE(review): an original line is elided here (upstream also resets the
# lazy-initialization flag in the constructor) — confirm before editing.
150 def __init__(self, downloader=None):
151 """Constructor. Receives an optional downloader."""
153 self.set_downloader(downloader)
# Class-level URL check: compiles _VALID_URL once per concrete class and
# caches it in that class's own __dict__ (not inherited from a superclass).
# NOTE(review): takes `cls` — presumably decorated @classmethod on the elided
# preceding line; confirm against the full file.
156 def suitable(cls, url):
157 """Receives a URL and returns True if suitable for this IE."""
159 # This does not use has/getattr intentionally - we want to know whether
160 # we have cached the regexp for *this* class, whereas getattr would also
161 # match the superclass
162 if '_VALID_URL_RE' not in cls.__dict__:
163 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
164 return cls._VALID_URL_RE.match(url) is not None
# Fragments of the public lifecycle API: the _WORKING getter (its `def` line
# is elided), initialize() which delegates to _real_initialize(), and
# extract() which delegates to _real_extract().
# NOTE(review): guard lines (e.g. the one-time-initialization check) are
# elided in this capture — do not edit these bodies from here.
168 """Getter method for _WORKING."""
171 def initialize(self):
172 """Initializes an instance (authentication, etc)."""
174 self._real_initialize()
177 def extract(self, url):
178 """Extracts URL information and returns it in list of dicts."""
180 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach the downloader instance this IE reports and fetches through."""
    self._downloader = downloader
# Subclass hooks (default no-op bodies are elided) plus fragments of the
# ie_key()/IE_NAME identifier helpers, both of which strip the trailing
# "IE" from the class name.
186 def _real_initialize(self):
187 """Real initialization process. Redefine in subclasses."""
190 def _real_extract(self, url):
191 """Real extraction process. Redefine in subclasses."""
196 """A string for getting the InfoExtractor with get_info_extractor"""
197 return cls.__name__[:-2]
201 return type(self).__name__[:-2]
# Open url_or_request through the downloader and return the raw response
# handle; on network errors either raise ExtractorError (fatal) or emit a
# warning.  NOTE(review): the try/if framing lines are elided in this
# capture — the visible lines are not a complete body.
203 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
204 """ Returns the response handle """
206 self.report_download_webpage(video_id)
207 elif note is not False:
209 self.to_screen('%s' % (note,))
211 self.to_screen('%s: %s' % (video_id, note))
213 return self._downloader.urlopen(url_or_request)
214 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
218 errnote = 'Unable to download webpage'
219 errmsg = '%s: %s' % (errnote, compat_str(err))
221 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
223 self._downloader.report_warning(errmsg)
# Download a page and decode it to text: strips URL fragments (#1038),
# sniffs the charset from the Content-Type header, a <meta charset> tag in
# the first 1024 bytes, or a UTF-16 BOM; optionally dumps/saves the raw
# bytes for debugging; detects Websense block pages and raises.
# NOTE(review): several framing lines (try/except/else, some branches) are
# elided in this capture — the visible lines are not a complete body.
226 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
227 """ Returns a tuple (page content as string, URL handle) """
229 # Strip hashes from the URL (#1038)
230 if isinstance(url_or_request, (compat_str, str)):
231 url_or_request = url_or_request.partition('#')[0]
233 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
237 content_type = urlh.headers.get('Content-Type', '')
238 webpage_bytes = urlh.read()
239 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
241 encoding = m.group(1)
243 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
244 webpage_bytes[:1024])
246 encoding = m.group(1).decode('ascii')
247 elif webpage_bytes.startswith(b'\xff\xfe'):
251 if self._downloader.params.get('dump_intermediate_pages', False):
253 url = url_or_request.get_full_url()
254 except AttributeError:
256 self.to_screen('Dumping request to ' + url)
257 dump = base64.b64encode(webpage_bytes).decode('ascii')
258 self._downloader.to_screen(dump)
259 if self._downloader.params.get('write_pages', False):
261 url = url_or_request.get_full_url()
262 except AttributeError:
264 basen = '%s_%s' % (video_id, url)
266 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
267 basen = basen[:240 - len(h)] + h
268 raw_filename = basen + '.dump'
269 filename = sanitize_filename(raw_filename, restricted=True)
270 self.to_screen('Saving request to ' + filename)
271 with open(filename, 'wb') as outf:
272 outf.write(webpage_bytes)
275 content = webpage_bytes.decode(encoding, 'replace')
277 content = webpage_bytes.decode('utf-8', 'replace')
279 if ('<title>Access to this site is blocked</title>' in content and
280 'Websense' in content[:512]):
281 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
282 blocked_iframe = self._html_search_regex(
283 r'<iframe src="([^"]+)"', content,
284 'Websense information URL', default=None)
286 msg += ' Visit %s for more details' % blocked_iframe
287 raise ExtractorError(msg, expected=True)
289 return (content, urlh)
# Convenience wrapper: returns only the decoded page text from
# _download_webpage_handle.  NOTE(review): the unpacking/False-propagation
# lines after the call are elided in this capture.
291 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
292 """ Returns the data of the page as a string """
293 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
# Download a URL and parse it as XML; an optional transform_source hook
# rewrites the text before parsing.  NOTE(review): the early-return and the
# transform_source guard lines are elided in this capture.
300 def _download_xml(self, url_or_request, video_id,
301 note='Downloading XML', errnote='Unable to download XML',
302 transform_source=None, fatal=True):
303 """Return the xml as an xml.etree.ElementTree.Element"""
304 xml_string = self._download_webpage(
305 url_or_request, video_id, note, errnote, fatal=fatal)
306 if xml_string is False:
309 xml_string = transform_source(xml_string)
310 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
# Download a URL and parse it as JSON; wraps ValueError from json.loads in
# ExtractorError.  NOTE(review): the fatal parameter line, the
# transform_source guard, and the try: line are elided in this capture.
312 def _download_json(self, url_or_request, video_id,
313 note='Downloading JSON metadata',
314 errnote='Unable to download JSON metadata',
315 transform_source=None,
317 json_string = self._download_webpage(
318 url_or_request, video_id, note, errnote, fatal=fatal)
319 if (not fatal) and json_string is False:
322 json_string = transform_source(json_string)
324 return json.loads(json_string)
325 except ValueError as ve:
326 raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
    """Forward *msg* to the downloader's warning channel, tagged with the
    IE name and, when given, the video id."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Write *msg* to the screen through the downloader, prefixed with
    '[ie_name]'."""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction is starting for *id_or_name*."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the webpage for *video_id* is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm the user's age."""
    # Fixed message; no video id involved.
    self.to_screen('Confirming age')
def report_login(self):
    """Announce that a login attempt is about to be made."""
    msg = 'Logging in'
    self.to_screen(msg)
# Build a '_type': 'url' info dict pointing at another page to process.
# NOTE(review): the @staticmethod decorator, the remaining dict entries
# ('url', 'ie_key') and the final return are elided in this capture.
353 #Methods for following #608
355 def url_result(url, ie=None, video_id=None):
356 """Returns a url that points to a page that should be processed"""
357 #TODO: ie should be the class used for getting the info
358 video_info = {'_type': 'url',
361 if video_id is not None:
362 video_info['id'] = video_id
# Build a '_type': 'playlist' info dict from entries plus optional id/title.
# NOTE(review): decorator, 'entries' dict entry, the None-guards and the
# final return are elided in this capture.
365 def playlist_result(entries, playlist_id=None, playlist_title=None):
366 """Returns a playlist"""
367 video_info = {'_type': 'playlist',
370 video_info['id'] = playlist_id
372 video_info['title'] = playlist_title
# Core regex helper: accepts one pattern or a list, returns the first
# non-None group; on miss returns `default` (the _NO_DEFAULT sentinel means
# "no default given"), raises RegexNotFoundError when fatal, or warns.
# The \033[0;34m...\033[0m wrapper colors the field name blue on ANSI ttys.
# NOTE(review): the loop over multiple patterns and several branch lines
# are elided in this capture.
375 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
377 Perform a regex search on the given string, using a single or a list of
378 patterns returning the first matching group.
379 In case of failure return a default value or raise a WARNING or a
380 RegexNotFoundError, depending on fatal, specifying the field name.
382 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
383 mobj = re.search(pattern, string, flags)
386 mobj = re.search(p, string, flags)
390 if os.name != 'nt' and sys.stderr.isatty():
391 _name = '\033[0;34m%s\033[0m' % name
396 # return the first matching group
397 return next(g for g in mobj.groups() if g is not None)
398 elif default is not _NO_DEFAULT:
401 raise RegexNotFoundError('Unable to extract %s' % _name)
403 self._downloader.report_warning('unable to extract %s; '
404 'please report this issue on http://yt-dl.org/bug' % _name)
# _search_regex variant that cleans the matched HTML (strip tags, unescape
# entities) before returning.  NOTE(review): the None-guard around the
# clean_html call is elided in this capture.
407 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
409 Like _search_regex, but strips HTML tags and unescapes entities.
411 res = self._search_regex(pattern, string, name, default, fatal, flags)
413 return clean_html(res).strip()
# Resolve credentials from downloader params, falling back to the user's
# .netrc entry for _NETRC_MACHINE; .netrc parse failures are downgraded to
# a warning.  NOTE(review): early returns, the try: line and the
# netrc-success branch are elided in this capture.
417 def _get_login_info(self):
419 Get the login info as (username, password)
420 It will look in the netrc file using the _NETRC_MACHINE value
421 If there's no info available, return (None, None)
423 if self._downloader is None:
428 downloader_params = self._downloader.params
430 # Attempt to use provided username and password or .netrc data
431 if downloader_params.get('username', None) is not None:
432 username = downloader_params['username']
433 password = downloader_params['password']
434 elif downloader_params.get('usenetrc', False):
436 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
441 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
442 except (IOError, netrc.NetrcParseError) as err:
443 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
445 return (username, password)
# Return the two-factor code supplied via the 'twofactor' downloader param,
# if any.  NOTE(review): the early `return None` lines are elided in this
# capture.
447 def _get_tfa_info(self):
449 Get the two-factor authentication info
450 TODO - asking the user will be required for sms/phone verify
451 currently just uses the command line option
452 If there's no info available, return None
454 if self._downloader is None:
456 downloader_params = self._downloader.params
458 if downloader_params.get('twofactor', None) is not None:
459 return downloader_params['twofactor']
# Build the two regexes that match an OpenGraph <meta> tag for `prop`,
# accepting property/content attributes in either order and either quote
# style.  NOTE(review): the @staticmethod decorator and the surrounding
# `return [` / `]` lines are elided in this capture.
463 # Helper functions for extracting OpenGraph info
465 def _og_regexes(prop):
466 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
467 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
468 template = r'<meta[^>]+?%s[^>]+?%s'
470 template % (property_re, content_re),
471 template % (content_re, property_re),
# Generic OpenGraph lookup: search the page for og:<prop> and return the
# HTML-unescaped content value.  NOTE(review): the name-defaulting guard and
# the None early-return are elided in this capture.
474 def _og_search_property(self, prop, html, name=None, **kargs):
476 name = 'OpenGraph %s' % prop
477 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
480 return unescapeHTML(escaped)
482 def _og_search_thumbnail(self, html, **kargs):
483 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
485 def _og_search_description(self, html, **kargs):
486 return self._og_search_property('description', html, fatal=False, **kargs)
488 def _og_search_title(self, html, **kargs):
489 return self._og_search_property('title', html, **kargs)
# Find the og:video / og:video:url meta values; when `secure` is requested
# the og:video:secure_url patterns are tried first.  NOTE(review): the
# `if secure:` guard line is elided in this capture.
491 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
492 regexes = self._og_regexes('video') + self._og_regexes('video:url')
494 regexes = self._og_regexes('video:secure_url') + regexes
495 return self._html_search_regex(regexes, html, name, **kargs)
497 def _og_search_url(self, html, **kargs):
498 return self._og_search_property('url', html, **kargs)
# Look up a <meta> tag by itemprop/name/property and return its content
# attribute via _html_search_regex.  NOTE(review): the display_name
# defaulting line and the opening of the verbose regex literal are elided
# in this capture.
500 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
501 if display_name is None:
503 return self._html_search_regex(
505 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
506 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
507 html, display_name, fatal=fatal, **kwargs)
509 def _dc_search_uploader(self, html):
510 return self._html_search_meta('dc.creator', html, 'uploader')
# Detect the RTA (Restricted To Adults) label meta tag.
# NOTE(review): the closing arguments of re.search and the return lines
# (age limit on match / 0 otherwise) are elided in this capture.
512 def _rta_search(self, html):
513 # See http://www.rtalabel.org/index.php?content=howtofaq#single
514 if re.search(r'(?ix)<meta\s+name="rating"\s+'
515 r' content="RTA-5042-1996-1400-1577-RTA"',
# Map an ICRA-style "rating" meta value to an age limit via RATING_TABLE.
# NOTE(review): the RATING_TABLE definition and the rating None-guard are
# elided in this capture.
520 def _media_rating_search(self, html):
521 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
522 rating = self._html_search_meta('rating', html)
534 return RATING_TABLE.get(rating.lower(), None)
536 def _twitter_search_player(self, html):
537 return self._html_search_meta('twitter:player', html,
538 'twitter card player')
# Sort `formats` in place from worst to best using a composite key:
# explicit preference, protocol (http/https preferred over others),
# vcodec=='none' audio-only handling, free-format ext ordering (honoring
# the 'prefer_free_formats' downloader param), then quality/height/width,
# bitrates, and (approximate) filesize; missing values sort as -1.
# NOTE(review): large parts of the inner _formats_key function (its def
# line, try/except framing, several branches and tuple entries) are elided
# in this capture — the visible lines are not a complete body.
540 def _sort_formats(self, formats):
542 raise ExtractorError('No video formats found')
545 # TODO remove the following workaround
546 from ..utils import determine_ext
547 if not f.get('ext') and 'url' in f:
548 f['ext'] = determine_ext(f['url'])
550 preference = f.get('preference')
551 if preference is None:
552 proto = f.get('protocol')
554 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
556 preference = 0 if proto in ['http', 'https'] else -0.1
557 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
560 if f.get('vcodec') == 'none': # audio only
561 if self._downloader.params.get('prefer_free_formats'):
562 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
564 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
567 audio_ext_preference = ORDER.index(f['ext'])
569 audio_ext_preference = -1
571 if self._downloader.params.get('prefer_free_formats'):
572 ORDER = ['flv', 'mp4', 'webm']
574 ORDER = ['webm', 'flv', 'mp4']
576 ext_preference = ORDER.index(f['ext'])
579 audio_ext_preference = 0
583 f.get('quality') if f.get('quality') is not None else -1,
584 f.get('height') if f.get('height') is not None else -1,
585 f.get('width') if f.get('width') is not None else -1,
587 f.get('tbr') if f.get('tbr') is not None else -1,
588 f.get('vbr') if f.get('vbr') is not None else -1,
589 f.get('abr') if f.get('abr') is not None else -1,
590 audio_ext_preference,
591 f.get('filesize') if f.get('filesize') is not None else -1,
592 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
595 formats.sort(key=_formats_key)
# Pick the URL scheme based on the 'prefer_insecure' downloader param.
# NOTE(review): the surrounding return expression lines are elided in this
# capture.  (Docstring corrected: it previously read '"https:" or "https:"'.)
597 def http_scheme(self):
598 """ Either "http:" or "https:", depending on the user's preferences """
601 if self._downloader.params.get('prefer_insecure', False)
# Resolve a protocol-relative URL ('//host/path') against `scheme`,
# defaulting to http_scheme().  NOTE(review): the None-guards and return
# lines are elided in this capture.
604 def _proto_relative_url(self, url, scheme=None):
607 if url.startswith('//'):
609 scheme = self.http_scheme()
# Announce and then perform a wait of `timeout` seconds; the message
# template may reference %(video_id)s and %(timeout)s.  NOTE(review): the
# to_screen and time.sleep lines are elided in this capture.
614 def _sleep(self, timeout, video_id, msg_template=None):
615 if msg_template is None:
616 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
617 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
# Parse an Adobe HDS (f4m) manifest into format dicts: one per <media>
# node, with format id 'f4m-<bitrate>' (or index when no bitrate).
# NOTE(review): the formats list initialization, the format dict's
# url/ext/tbr entries, and the final return are elided in this capture.
621 def _extract_f4m_formats(self, manifest_url, video_id):
622 manifest = self._download_xml(
623 manifest_url, video_id, 'Downloading f4m manifest',
624 'Unable to download f4m manifest')
627 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
628 for i, media_el in enumerate(media_nodes):
629 tbr = int_or_none(media_el.attrib.get('bitrate'))
630 format_id = 'f4m-%d' % (i if tbr is None else tbr)
632 'format_id': format_id,
636 'width': int_or_none(media_el.attrib.get('width')),
637 'height': int_or_none(media_el.attrib.get('height')),
639 self._sort_formats(formats)
# Parse an HLS (m3u8) master playlist: scan #EXT-X-STREAM-INF attribute
# lists (BANDWIDTH, CODECS, RESOLUTION) with kv_rex and pair each with the
# following variant URL line, producing one format dict per variant plus a
# meta entry for the playlist itself.
# NOTE(review): the meta-format dict framing, kv_rex compilation line,
# several branch/assignment lines and the final return are elided in this
# capture — the visible lines are not a complete body.
643 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
645 'format_id': 'm3u8-meta',
650 'resolution': 'multiple',
651 'format_note': 'Quality selection URL',
654 m3u8_doc = self._download_webpage(m3u8_url, video_id)
657 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
658 for line in m3u8_doc.splitlines():
659 if line.startswith('#EXT-X-STREAM-INF:'):
661 for m in kv_rex.finditer(line):
663 if v.startswith('"'):
665 last_info[m.group('key')] = v
666 elif line.startswith('#') or not line.strip():
669 if last_info is None:
670 formats.append({'url': line})
672 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
675 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
680 codecs = last_info.get('CODECS')
682 # TODO: looks like video codec is not always necessarily goes first
683 va_codecs = codecs.split(',')
685 f['vcodec'] = va_codecs[0].partition('.')[0]
686 if len(va_codecs) > 1 and va_codecs[1]:
687 f['acodec'] = va_codecs[1].partition('.')[0]
688 resolution = last_info.get('RESOLUTION')
690 width_str, height_str = resolution.split('x')
691 f['width'] = int(width_str)
692 f['height'] = int(height_str)
695 self._sort_formats(formats)
# Base class for "ytsearch5:query"-style paged search extractors.
# NOTE(review): the docstring delimiters and the @classmethod decorators on
# _make_valid_url/suitable are elided in this capture.
699 class SearchInfoExtractor(InfoExtractor):
701 Base class for paged search queries extractors.
702 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
703 Instances should define _SEARCH_KEY and _MAX_RESULTS.
707 def _make_valid_url(cls):
708 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
711 def suitable(cls, url):
712 return re.match(cls._make_valid_url(), url) is not None
# Dispatch a search query: empty prefix means 1 result, 'all' means
# _MAX_RESULTS, a number means that many (validated and capped).
# NOTE(review): the mobj None-guard, the empty-prefix branch, and the
# numeric-prefix else/int() lines are elided in this capture.
714 def _real_extract(self, query):
715 mobj = re.match(self._make_valid_url(), query)
717 raise ExtractorError('Invalid search query "%s"' % query)
719 prefix = mobj.group('prefix')
720 query = mobj.group('query')
722 return self._get_n_results(query, 1)
723 elif prefix == 'all':
724 return self._get_n_results(query, self._MAX_RESULTS)
728 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
729 elif n > self._MAX_RESULTS:
730 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
731 n = self._MAX_RESULTS
732 return self._get_n_results(query, n)
734 def _get_n_results(self, query, n):
735 """Get a specified number of results for a query"""
736 raise NotImplementedError("This method must be implemented by subclasses")
# Expose the class's _SEARCH_KEY.  NOTE(review): the @property decorator
# line is elided in this capture.
739 def SEARCH_KEY(self):
740 return self._SEARCH_KEY