1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
18 compat_urllib_parse_urlparse,
31 _NO_DEFAULT = object()
34 class InfoExtractor(object):
35 """Information Extractor class.
37 Information extractors are the classes that, given a URL, extract
38 information about the video (or videos) the URL refers to. This
39 information includes the real video URL, the video title, author and
40 others. The information is stored in a dictionary which is then
41 passed to the FileDownloader. The FileDownloader processes this
42 information possibly downloading the video to the file system, among
43 other possible outcomes.
45 The dictionaries must include the following fields:
48 title: Video title, unescaped.
50 Additionally, it must contain either a formats entry or a url one:
52 formats: A list of dictionaries for each format available, ordered
53 from worst to best quality.
56 * url Mandatory. The URL of the video file
57 * ext Will be calculated from url if missing
58 * format A human-readable description of the format
59 ("mp4 container with h264/opus").
60 Calculated from the format_id, width, height.
61 and format_note fields if missing.
62 * format_id A short description of the format
63 ("mp4_h264_opus" or "19").
64 Technically optional, but strongly recommended.
65 * format_note Additional info about the format
66 ("3D" or "DASH video")
67 * width Width of the video, if known
68 * height Height of the video, if known
69 * resolution Textual description of width and height
70 * tbr Average bitrate of audio and video in KBit/s
71 * abr Average audio bitrate in KBit/s
72 * acodec Name of the audio codec in use
73 * asr Audio sampling rate in Hertz
74 * vbr Average video bitrate in KBit/s
75 * vcodec Name of the video codec in use
76 * container Name of the container format
77 * filesize The number of bytes, if known in advance
78 * filesize_approx An estimate for the number of bytes
79 * player_url SWF Player URL (used for rtmpdump).
80 * protocol The protocol that will be used for the actual
82 "http", "https", "rtsp", "rtmp", "m3u8" or so.
83 * preference Order number of this format. If this field is
84 present and not None, the formats get sorted
85 by this field, regardless of all other values.
86 -1 for default (order by other properties),
87 -2 or smaller for less than default.
88 * quality Order number of the video quality of this
89 format, irrespective of the file format.
90 -1 for default (order by other properties),
91 -2 or smaller for less than default.
92 * http_referer HTTP Referer header value to set.
93 * http_method HTTP method to use for the download.
94 * http_headers A dictionary of additional HTTP headers
95 to add to the request.
96 * http_post_data Additional data to send with a POST
99 ext: Video filename extension.
100 format: The video format, defaults to ext (used for --get-format)
101 player_url: SWF Player URL (used for rtmpdump).
103 The following fields are optional:
105 display_id An alternative identifier for the video, not necessarily
106 unique, but available before title. Typically, id is
107 something like "4234987", title "Dancing naked mole rats",
108 and display_id "dancing-naked-mole-rats"
109 thumbnails: A list of dictionaries, with the following entries:
111 * "width" (optional, int)
112 * "height" (optional, int)
113 * "resolution" (optional, string "{width}x{height"},
115 thumbnail: Full URL to a video thumbnail image.
116 description: One-line video description.
117 uploader: Full name of the video uploader.
118 timestamp: UNIX timestamp of the moment the video became available.
119 upload_date: Video upload date (YYYYMMDD).
120 If not explicitly set, calculated from timestamp.
121 uploader_id: Nickname or id of the video uploader.
122 location: Physical location where the video was filmed.
123 subtitles: The subtitle file contents as a dictionary in the format
124 {language: subtitles}.
125 duration: Length of the video in seconds, as an integer.
126 view_count: How many users have watched the video on the platform.
127 like_count: Number of positive ratings of the video
128 dislike_count: Number of negative ratings of the video
129 comment_count: Number of comments on the video
130 age_limit: Age restriction for the video, as an integer (years)
131 webpage_url: The url to the video webpage, if given to youtube-dl it
132 should allow to get the same result again. (It will be set
133 by YoutubeDL if it's missing)
134 categories: A list of categories that the video falls in, for example
136 is_live: True, False, or None (=unknown). Whether this video is a
137 live stream that goes on instead of a fixed-length video.
139 Unless mentioned otherwise, the fields should be Unicode strings.
141 Unless mentioned otherwise, None is equivalent to absence of information.
143 Subclasses of this one should re-define the _real_initialize() and
144 _real_extract() methods and define a _VALID_URL regexp.
145 Probably, they should also be added to the list of extractors.
147 Finally, the _WORKING attribute should be set to False for broken IEs
148 in order to warn the users and skip the tests.
155 def __init__(self, downloader=None):
156 """Constructor. Receives an optional downloader."""
158 self.set_downloader(downloader)
161 def suitable(cls, url):
162 """Receives a URL and returns True if suitable for this IE."""
164 # This does not use has/getattr intentionally - we want to know whether
165 # we have cached the regexp for *this* class, whereas getattr would also
166 # match the superclass
167 if '_VALID_URL_RE' not in cls.__dict__:
168 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
169 return cls._VALID_URL_RE.match(url) is not None
172 def _match_id(cls, url):
173 if '_VALID_URL_RE' not in cls.__dict__:
174 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
175 m = cls._VALID_URL_RE.match(url)
181 """Getter method for _WORKING."""
184 def initialize(self):
185 """Initializes an instance (authentication, etc)."""
187 self._real_initialize()
190 def extract(self, url):
191 """Extracts URL information and returns it in list of dicts."""
193 return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        `downloader` is the YoutubeDL/FileDownloader instance that this
        extractor reports to (to_screen, report_warning, urlopen, ...);
        may be None when the IE is constructed without one.
        """
        self._downloader = downloader
199 def _real_initialize(self):
200 """Real initialization process. Redefine in subclasses."""
203 def _real_extract(self, url):
204 """Real extraction process. Redefine in subclasses."""
209 """A string for getting the InfoExtractor with get_info_extractor"""
210 return cls.__name__[:-2]
214 return type(self).__name__[:-2]
216 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
217 """ Returns the response handle """
219 self.report_download_webpage(video_id)
220 elif note is not False:
222 self.to_screen('%s' % (note,))
224 self.to_screen('%s: %s' % (video_id, note))
226 return self._downloader.urlopen(url_or_request)
227 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
231 errnote = 'Unable to download webpage'
232 errmsg = '%s: %s' % (errnote, compat_str(err))
234 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
236 self._downloader.report_warning(errmsg)
239 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
240 """ Returns a tuple (page content as string, URL handle) """
242 # Strip hashes from the URL (#1038)
243 if isinstance(url_or_request, (compat_str, str)):
244 url_or_request = url_or_request.partition('#')[0]
246 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
250 content_type = urlh.headers.get('Content-Type', '')
251 webpage_bytes = urlh.read()
252 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
254 encoding = m.group(1)
256 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
257 webpage_bytes[:1024])
259 encoding = m.group(1).decode('ascii')
260 elif webpage_bytes.startswith(b'\xff\xfe'):
264 if self._downloader.params.get('dump_intermediate_pages', False):
266 url = url_or_request.get_full_url()
267 except AttributeError:
269 self.to_screen('Dumping request to ' + url)
270 dump = base64.b64encode(webpage_bytes).decode('ascii')
271 self._downloader.to_screen(dump)
272 if self._downloader.params.get('write_pages', False):
274 url = url_or_request.get_full_url()
275 except AttributeError:
277 basen = '%s_%s' % (video_id, url)
279 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
280 basen = basen[:240 - len(h)] + h
281 raw_filename = basen + '.dump'
282 filename = sanitize_filename(raw_filename, restricted=True)
283 self.to_screen('Saving request to ' + filename)
284 # Working around MAX_PATH limitation on Windows (see
285 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
287 absfilepath = os.path.abspath(filename)
288 if len(absfilepath) > 259:
289 filename = '\\\\?\\' + absfilepath
290 with open(filename, 'wb') as outf:
291 outf.write(webpage_bytes)
294 content = webpage_bytes.decode(encoding, 'replace')
296 content = webpage_bytes.decode('utf-8', 'replace')
298 if ('<title>Access to this site is blocked</title>' in content and
299 'Websense' in content[:512]):
300 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
301 blocked_iframe = self._html_search_regex(
302 r'<iframe src="([^"]+)"', content,
303 'Websense information URL', default=None)
305 msg += ' Visit %s for more details' % blocked_iframe
306 raise ExtractorError(msg, expected=True)
308 return (content, urlh)
310 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
311 """ Returns the data of the page as a string """
312 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
319 def _download_xml(self, url_or_request, video_id,
320 note='Downloading XML', errnote='Unable to download XML',
321 transform_source=None, fatal=True):
322 """Return the xml as an xml.etree.ElementTree.Element"""
323 xml_string = self._download_webpage(
324 url_or_request, video_id, note, errnote, fatal=fatal)
325 if xml_string is False:
328 xml_string = transform_source(xml_string)
329 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
331 def _download_json(self, url_or_request, video_id,
332 note='Downloading JSON metadata',
333 errnote='Unable to download JSON metadata',
334 transform_source=None,
336 json_string = self._download_webpage(
337 url_or_request, video_id, note, errnote, fatal=fatal)
338 if (not fatal) and json_string is False:
341 json_string = transform_source(json_string)
343 return json.loads(json_string)
344 except ValueError as ve:
345 errmsg = '%s: Failed to parse JSON ' % video_id
347 raise ExtractorError(errmsg, cause=ve)
349 self.report_warning(errmsg + str(ve))
351 def report_warning(self, msg, video_id=None):
352 idstr = '' if video_id is None else '%s: ' % video_id
353 self._downloader.report_warning(
354 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
356 def to_screen(self, msg):
357 """Print msg to screen, prefixing it with '[ie_name]'"""
358 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
360 def report_extraction(self, id_or_name):
361 """Report information extraction."""
362 self.to_screen('%s: Extracting information' % id_or_name)
364 def report_download_webpage(self, video_id):
365 """Report webpage download."""
366 self.to_screen('%s: Downloading webpage' % video_id)
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        # Fixed message; sites requiring an age gate call this before login.
        self.to_screen('Confirming age')
    def report_login(self):
        """Report attempt to log in."""
        # Fixed message; emitted before credential submission.
        self.to_screen('Logging in')
376 #Methods for following #608
378 def url_result(url, ie=None, video_id=None):
379 """Returns a url that points to a page that should be processed"""
380 #TODO: ie should be the class used for getting the info
381 video_info = {'_type': 'url',
384 if video_id is not None:
385 video_info['id'] = video_id
388 def playlist_result(entries, playlist_id=None, playlist_title=None):
389 """Returns a playlist"""
390 video_info = {'_type': 'playlist',
393 video_info['id'] = playlist_id
395 video_info['title'] = playlist_title
398 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
400 Perform a regex search on the given string, using a single or a list of
401 patterns returning the first matching group.
402 In case of failure return a default value or raise a WARNING or a
403 RegexNotFoundError, depending on fatal, specifying the field name.
405 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
406 mobj = re.search(pattern, string, flags)
409 mobj = re.search(p, string, flags)
413 if os.name != 'nt' and sys.stderr.isatty():
414 _name = '\033[0;34m%s\033[0m' % name
419 # return the first matching group
420 return next(g for g in mobj.groups() if g is not None)
421 elif default is not _NO_DEFAULT:
424 raise RegexNotFoundError('Unable to extract %s' % _name)
426 self._downloader.report_warning('unable to extract %s; '
427 'please report this issue on http://yt-dl.org/bug' % _name)
430 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
432 Like _search_regex, but strips HTML tags and unescapes entities.
434 res = self._search_regex(pattern, string, name, default, fatal, flags)
436 return clean_html(res).strip()
440 def _get_login_info(self):
442 Get the the login info as (username, password)
443 It will look in the netrc file using the _NETRC_MACHINE value
444 If there's no info available, return (None, None)
446 if self._downloader is None:
451 downloader_params = self._downloader.params
453 # Attempt to use provided username and password or .netrc data
454 if downloader_params.get('username', None) is not None:
455 username = downloader_params['username']
456 password = downloader_params['password']
457 elif downloader_params.get('usenetrc', False):
459 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
464 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
465 except (IOError, netrc.NetrcParseError) as err:
466 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
468 return (username, password)
470 def _get_tfa_info(self):
472 Get the two-factor authentication info
473 TODO - asking the user will be required for sms/phone verify
474 currently just uses the command line option
475 If there's no info available, return None
477 if self._downloader is None:
479 downloader_params = self._downloader.params
481 if downloader_params.get('twofactor', None) is not None:
482 return downloader_params['twofactor']
486 # Helper functions for extracting OpenGraph info
488 def _og_regexes(prop):
489 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
490 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
491 template = r'<meta[^>]+?%s[^>]+?%s'
493 template % (property_re, content_re),
494 template % (content_re, property_re),
497 def _og_search_property(self, prop, html, name=None, **kargs):
499 name = 'OpenGraph %s' % prop
500 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
503 return unescapeHTML(escaped)
    def _og_search_thumbnail(self, html, **kargs):
        # og:image holds the page's representative image; non-fatal because
        # a missing thumbnail should not abort extraction.
        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
    def _og_search_description(self, html, **kargs):
        # og:description; non-fatal — descriptions are optional metadata.
        return self._og_search_property('description', html, fatal=False, **kargs)
    def _og_search_title(self, html, **kargs):
        # og:title; fatal by default since a title is a mandatory field.
        return self._og_search_property('title', html, **kargs)
514 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
515 regexes = self._og_regexes('video') + self._og_regexes('video:url')
517 regexes = self._og_regexes('video:secure_url') + regexes
518 return self._html_search_regex(regexes, html, name, **kargs)
    def _og_search_url(self, html, **kargs):
        # og:url — the page's canonical URL per the OpenGraph markup.
        return self._og_search_property('url', html, **kargs)
523 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
524 if display_name is None:
526 return self._html_search_regex(
528 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
529 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
530 html, display_name, fatal=fatal, **kwargs)
    def _dc_search_uploader(self, html):
        # Dublin Core metadata: <meta name="dc.creator"> names the author,
        # which youtube-dl maps to the 'uploader' field.
        return self._html_search_meta('dc.creator', html, 'uploader')
535 def _rta_search(self, html):
536 # See http://www.rtalabel.org/index.php?content=howtofaq#single
537 if re.search(r'(?ix)<meta\s+name="rating"\s+'
538 r' content="RTA-5042-1996-1400-1577-RTA"',
543 def _media_rating_search(self, html):
544 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
545 rating = self._html_search_meta('rating', html)
557 return RATING_TABLE.get(rating.lower(), None)
    def _twitter_search_player(self, html):
        # Twitter Card markup: <meta name="twitter:player"> points at an
        # embeddable player URL for the page's video.
        return self._html_search_meta('twitter:player', html,
            'twitter card player')
563 def _sort_formats(self, formats):
565 raise ExtractorError('No video formats found')
568 # TODO remove the following workaround
569 from ..utils import determine_ext
570 if not f.get('ext') and 'url' in f:
571 f['ext'] = determine_ext(f['url'])
573 preference = f.get('preference')
574 if preference is None:
575 proto = f.get('protocol')
577 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
579 preference = 0 if proto in ['http', 'https'] else -0.1
580 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
583 if f.get('vcodec') == 'none': # audio only
584 if self._downloader.params.get('prefer_free_formats'):
585 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
587 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
590 audio_ext_preference = ORDER.index(f['ext'])
592 audio_ext_preference = -1
594 if self._downloader.params.get('prefer_free_formats'):
595 ORDER = ['flv', 'mp4', 'webm']
597 ORDER = ['webm', 'flv', 'mp4']
599 ext_preference = ORDER.index(f['ext'])
602 audio_ext_preference = 0
606 f.get('quality') if f.get('quality') is not None else -1,
607 f.get('height') if f.get('height') is not None else -1,
608 f.get('width') if f.get('width') is not None else -1,
610 f.get('tbr') if f.get('tbr') is not None else -1,
611 f.get('vbr') if f.get('vbr') is not None else -1,
612 f.get('abr') if f.get('abr') is not None else -1,
613 audio_ext_preference,
614 f.get('filesize') if f.get('filesize') is not None else -1,
615 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
618 formats.sort(key=_formats_key)
620 def http_scheme(self):
621 """ Either "https:" or "https:", depending on the user's preferences """
624 if self._downloader.params.get('prefer_insecure', False)
627 def _proto_relative_url(self, url, scheme=None):
630 if url.startswith('//'):
632 scheme = self.http_scheme()
637 def _sleep(self, timeout, video_id, msg_template=None):
638 if msg_template is None:
639 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
640 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
644 def _extract_f4m_formats(self, manifest_url, video_id):
645 manifest = self._download_xml(
646 manifest_url, video_id, 'Downloading f4m manifest',
647 'Unable to download f4m manifest')
650 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
652 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
653 for i, media_el in enumerate(media_nodes):
654 tbr = int_or_none(media_el.attrib.get('bitrate'))
655 format_id = 'f4m-%d' % (i if tbr is None else tbr)
657 'format_id': format_id,
661 'width': int_or_none(media_el.attrib.get('width')),
662 'height': int_or_none(media_el.attrib.get('height')),
664 self._sort_formats(formats)
668 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
669 entry_protocol='m3u8', preference=None):
672 'format_id': 'm3u8-meta',
677 'resolution': 'multiple',
678 'format_note': 'Quality selection URL',
681 format_url = lambda u: (
683 if re.match(r'^https?://', u)
684 else compat_urlparse.urljoin(m3u8_url, u))
686 m3u8_doc = self._download_webpage(m3u8_url, video_id)
689 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
690 for line in m3u8_doc.splitlines():
691 if line.startswith('#EXT-X-STREAM-INF:'):
693 for m in kv_rex.finditer(line):
695 if v.startswith('"'):
697 last_info[m.group('key')] = v
698 elif line.startswith('#') or not line.strip():
701 if last_info is None:
702 formats.append({'url': format_url(line)})
704 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
707 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
708 'url': format_url(line.strip()),
711 'protocol': entry_protocol,
712 'preference': preference,
714 codecs = last_info.get('CODECS')
716 # TODO: looks like video codec is not always necessarily goes first
717 va_codecs = codecs.split(',')
719 f['vcodec'] = va_codecs[0].partition('.')[0]
720 if len(va_codecs) > 1 and va_codecs[1]:
721 f['acodec'] = va_codecs[1].partition('.')[0]
722 resolution = last_info.get('RESOLUTION')
724 width_str, height_str = resolution.split('x')
725 f['width'] = int(width_str)
726 f['height'] = int(height_str)
729 self._sort_formats(formats)
732 def _live_title(self, name):
733 """ Generate the title for a live video """
734 now = datetime.datetime.now()
735 now_str = now.strftime("%Y-%m-%d %H:%M")
736 return name + ' ' + now_str
738 def _int(self, v, name, fatal=False, **kwargs):
739 res = int_or_none(v, **kwargs)
740 if 'get_attr' in kwargs:
741 print(getattr(v, kwargs['get_attr']))
743 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
745 raise ExtractorError(msg)
747 self._downloader.report_warning(msg)
750 def _float(self, v, name, fatal=False, **kwargs):
751 res = float_or_none(v, **kwargs)
753 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
755 raise ExtractorError(msg)
757 self._downloader.report_warning(msg)
761 class SearchInfoExtractor(InfoExtractor):
763 Base class for paged search queries extractors.
764 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
765 Instances should define _SEARCH_KEY and _MAX_RESULTS.
769 def _make_valid_url(cls):
770 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
773 def suitable(cls, url):
774 return re.match(cls._make_valid_url(), url) is not None
776 def _real_extract(self, query):
777 mobj = re.match(self._make_valid_url(), query)
779 raise ExtractorError('Invalid search query "%s"' % query)
781 prefix = mobj.group('prefix')
782 query = mobj.group('query')
784 return self._get_n_results(query, 1)
785 elif prefix == 'all':
786 return self._get_n_results(query, self._MAX_RESULTS)
790 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
791 elif n > self._MAX_RESULTS:
792 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
793 n = self._MAX_RESULTS
794 return self._get_n_results(query, n)
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Abstract: concrete search extractors must override this to return
        up to *n* results for *query*.
        """
        raise NotImplementedError("This method must be implemented by subclasses")
    def SEARCH_KEY(self):
        """Public accessor for the class-level _SEARCH_KEY."""
        return self._SEARCH_KEY