1 from __future__ import unicode_literals
12 import xml.etree.ElementTree
17 compat_urllib_parse_urlparse,
# Unique sentinel object: lets _search_regex-style helpers distinguish
# "caller supplied no default" from an explicit default of None.
28 _NO_DEFAULT = object()
# NOTE(review): this chunk is an elided excerpt — interior lines (decorators,
# try:/if lines, blank lines) are missing and each line carries a stray
# embedded original line number. Comments below describe only what the
# visible code shows; elisions are flagged where they matter.
31 class InfoExtractor(object):
32 """Information Extractor class.
34 Information extractors are the classes that, given a URL, extract
35 information about the video (or videos) the URL refers to. This
36 information includes the real video URL, the video title, author and
37 others. The information is stored in a dictionary which is then
38 passed to the FileDownloader. The FileDownloader processes this
39 information possibly downloading the video to the file system, among
40 other possible outcomes.
42 The dictionaries must include the following fields:
45 title: Video title, unescaped.
47 Additionally, it must contain either a formats entry or a url one:
49 formats: A list of dictionaries for each format available, ordered
50 from worst to best quality.
53 * url Mandatory. The URL of the video file
54 * ext Will be calculated from url if missing
55 * format A human-readable description of the format
56 ("mp4 container with h264/opus").
57 Calculated from the format_id, width, height,
58 and format_note fields if missing.
59 * format_id A short description of the format
60 ("mp4_h264_opus" or "19").
61 Technically optional, but strongly recommended.
62 * format_note Additional info about the format
63 ("3D" or "DASH video")
64 * width Width of the video, if known
65 * height Height of the video, if known
66 * resolution Textual description of width and height
67 * tbr Average bitrate of audio and video in KBit/s
68 * abr Average audio bitrate in KBit/s
69 * acodec Name of the audio codec in use
70 * asr Audio sampling rate in Hertz
71 * vbr Average video bitrate in KBit/s
72 * vcodec Name of the video codec in use
73 * container Name of the container format
74 * filesize The number of bytes, if known in advance
75 * filesize_approx An estimate for the number of bytes
76 * player_url SWF Player URL (used for rtmpdump).
77 * protocol The protocol that will be used for the actual
79 "http", "https", "rtsp", "rtmp", "m3u8" or so.
80 * preference Order number of this format. If this field is
81 present and not None, the formats get sorted
82 by this field, regardless of all other values.
83 -1 for default (order by other properties),
84 -2 or smaller for less than default.
85 * quality Order number of the video quality of this
86 format, irrespective of the file format.
87 -1 for default (order by other properties),
88 -2 or smaller for less than default.
89 * http_referer HTTP Referer header value to set.
90 * http_method HTTP method to use for the download.
91 * http_headers A dictionary of additional HTTP headers
92 to add to the request.
93 * http_post_data Additional data to send with a POST
96 ext: Video filename extension.
97 format: The video format, defaults to ext (used for --get-format)
98 player_url: SWF Player URL (used for rtmpdump).
100 The following fields are optional:
102 display_id An alternative identifier for the video, not necessarily
103 unique, but available before title. Typically, id is
104 something like "4234987", title "Dancing naked mole rats",
105 and display_id "dancing-naked-mole-rats"
106 thumbnails: A list of dictionaries, with the following entries:
108 * "width" (optional, int)
109 * "height" (optional, int)
110 * "resolution" (optional, string "{width}x{height}"),
112 thumbnail: Full URL to a video thumbnail image.
113 description: One-line video description.
114 uploader: Full name of the video uploader.
115 timestamp: UNIX timestamp of the moment the video became available.
116 upload_date: Video upload date (YYYYMMDD).
117 If not explicitly set, calculated from timestamp.
118 uploader_id: Nickname or id of the video uploader.
119 location: Physical location where the video was filmed.
120 subtitles: The subtitle file contents as a dictionary in the format
121 {language: subtitles}.
122 duration: Length of the video in seconds, as an integer.
123 view_count: How many users have watched the video on the platform.
124 like_count: Number of positive ratings of the video
125 dislike_count: Number of negative ratings of the video
126 comment_count: Number of comments on the video
127 age_limit: Age restriction for the video, as an integer (years)
128 webpage_url: The url to the video webpage, if given to youtube-dl it
129 should allow to get the same result again. (It will be set
130 by YoutubeDL if it's missing)
131 categories: A list of categories that the video falls in, for example
134 Unless mentioned otherwise, the fields should be Unicode strings.
136 Subclasses of this one should re-define the _real_initialize() and
137 _real_extract() methods and define a _VALID_URL regexp.
138 Probably, they should also be added to the list of extractors.
140 Finally, the _WORKING attribute should be set to False for broken IEs
141 in order to warn the users and skip the tests.
148 def __init__(self, downloader=None):
149 """Constructor. Receives an optional downloader."""
# NOTE(review): one statement elided here (presumably lazy-init state).
151 self.set_downloader(downloader)
# suitable(): a classmethod in the full file (decorator elided).
154 def suitable(cls, url):
155 """Receives a URL and returns True if suitable for this IE."""
157 # This does not use has/getattr intentionally - we want to know whether
158 # we have cached the regexp for *this* class, whereas getattr would also
159 # match the superclass
# Compile _VALID_URL once per concrete class and cache it on the class.
160 if '_VALID_URL_RE' not in cls.__dict__:
161 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
162 return cls._VALID_URL_RE.match(url) is not None
# working() accessor — its def line is elided from this view.
166 """Getter method for _WORKING."""
169 def initialize(self):
170 """Initializes an instance (authentication, etc)."""
# NOTE(review): the guard line is elided; presumably runs _real_initialize
# only once — confirm against the full file.
172 self._real_initialize()
175 def extract(self, url):
176 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): one statement elided (presumably self.initialize()).
178 return self._real_extract(url)
180 def set_downloader(self, downloader):
181 """Sets the downloader for this IE."""
182 self._downloader = downloader
# Template methods: subclasses override these two (see class docstring).
184 def _real_initialize(self):
185 """Real initialization process. Redefine in subclasses."""
188 def _real_extract(self, url):
189 """Real extraction process. Redefine in subclasses."""
# ie_key() — def line and decorator elided; derives the key by stripping
# the trailing "IE" (2 chars) from the class name.
194 """A string for getting the InfoExtractor with get_info_extractor"""
195 return cls.__name__[:-2]
# IE_NAME body (def/decorator elided): same strip-trailing-"IE" rule,
# applied to the instance's concrete class.
199 return type(self).__name__[:-2]
201 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
202 """ Returns the response handle """
# note=None -> default progress message; note=False suppresses output.
# (The if/try lines between these statements are elided from this view.)
204 self.report_download_webpage(video_id)
205 elif note is not False:
# video_id may be None here (plain message) or present (prefixed message).
207 self.to_screen('%s' % (note,))
209 self.to_screen('%s: %s' % (video_id, note))
211 return self._downloader.urlopen(url_or_request)
212 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
216 errnote = 'Unable to download webpage'
217 errmsg = '%s: %s' % (errnote, compat_str(err))
# fatal errors re-raise with the original traceback; otherwise only warn.
219 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
221 self._downloader.report_warning(errmsg)
224 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
225 """ Returns a tuple (page content as string, URL handle) """
227 # Strip hashes from the URL (#1038)
228 if isinstance(url_or_request, (compat_str, str)):
229 url_or_request = url_or_request.partition('#')[0]
231 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
# Charset detection cascade: Content-Type header, then a <meta charset>
# in the first 1024 bytes, then a UTF-16 LE BOM; UTF-8 is the fallback.
235 content_type = urlh.headers.get('Content-Type', '')
236 webpage_bytes = urlh.read()
237 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
239 encoding = m.group(1)
241 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
242 webpage_bytes[:1024])
244 encoding = m.group(1).decode('ascii')
245 elif webpage_bytes.startswith(b'\xff\xfe'):
# --dump-intermediate-pages: emit the raw page base64-encoded to screen.
249 if self._downloader.params.get('dump_intermediate_pages', False):
251 url = url_or_request.get_full_url()
252 except AttributeError:
254 self.to_screen('Dumping request to ' + url)
255 dump = base64.b64encode(webpage_bytes).decode('ascii')
256 self._downloader.to_screen(dump)
# --write-pages: save the raw bytes to "<video_id>_<url>.dump"; overlong
# names are truncated and suffixed with an md5 of the full name.
257 if self._downloader.params.get('write_pages', False):
259 url = url_or_request.get_full_url()
260 except AttributeError:
262 basen = '%s_%s' % (video_id, url)
264 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
265 basen = basen[:240 - len(h)] + h
266 raw_filename = basen + '.dump'
267 filename = sanitize_filename(raw_filename, restricted=True)
268 self.to_screen('Saving request to ' + filename)
269 with open(filename, 'wb') as outf:
270 outf.write(webpage_bytes)
# Decode with 'replace' so undecodable bytes never abort the download.
273 content = webpage_bytes.decode(encoding, 'replace')
275 content = webpage_bytes.decode('utf-8', 'replace')
# Detect Websense filtering interstitials and fail with a helpful,
# expected (non-bug-report) error.
277 if ('<title>Access to this site is blocked</title>' in content and
278 'Websense' in content[:512]):
279 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
280 blocked_iframe = self._html_search_regex(
281 r'<iframe src="([^"]+)"', content,
282 'Websense information URL', default=None)
284 msg += ' Visit %s for more details' % blocked_iframe
285 raise ExtractorError(msg, expected=True)
287 return (content, urlh)
289 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
290 """ Returns the data of the page as a string """
# Convenience wrapper: drops the URL handle from _download_webpage_handle.
291 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
298 def _download_xml(self, url_or_request, video_id,
299 note='Downloading XML', errnote='Unable to download XML',
300 transform_source=None, fatal=True):
301 """Return the xml as an xml.etree.ElementTree.Element"""
302 xml_string = self._download_webpage(
303 url_or_request, video_id, note, errnote, fatal=fatal)
# False signals a non-fatal download failure upstream.
304 if xml_string is False:
# transform_source lets callers fix up broken XML before parsing.
307 xml_string = transform_source(xml_string)
# Re-encode to UTF-8 bytes for the ElementTree parser.
308 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
310 def _download_json(self, url_or_request, video_id,
311 note='Downloading JSON metadata',
312 errnote='Unable to download JSON metadata',
313 transform_source=None,
315 json_string = self._download_webpage(
316 url_or_request, video_id, note, errnote, fatal=fatal)
317 if (not fatal) and json_string is False:
320 json_string = transform_source(json_string)
322 return json.loads(json_string)
# Wrap parse errors so the failure is attributed to this extractor.
323 except ValueError as ve:
324 raise ExtractorError('Failed to download JSON', cause=ve)
326 def report_warning(self, msg, video_id=None):
# Prefix warnings with "[IE_NAME] " and, when known, the video id.
327 idstr = '' if video_id is None else '%s: ' % video_id
328 self._downloader.report_warning(
329 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
331 def to_screen(self, msg):
332 """Print msg to screen, prefixing it with '[ie_name]'"""
333 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
335 def report_extraction(self, id_or_name):
336 """Report information extraction."""
337 self.to_screen('%s: Extracting information' % id_or_name)
339 def report_download_webpage(self, video_id):
340 """Report webpage download."""
341 self.to_screen('%s: Downloading webpage' % video_id)
343 def report_age_confirmation(self):
344 """Report attempt to confirm age."""
345 self.to_screen('Confirming age')
347 def report_login(self):
348 """Report attempt to log in."""
349 self.to_screen('Logging in')
351 # Methods for following #608
# url_result(): a staticmethod in the full file (decorator elided).
353 def url_result(url, ie=None, video_id=None):
354 """Returns a url that points to a page that should be processed"""
355 #TODO: ie should be the class used for getting the info
356 video_info = {'_type': 'url',
359 if video_id is not None:
360 video_info['id'] = video_id
# playlist_result(): builds the standard '_type': 'playlist' info dict.
363 def playlist_result(entries, playlist_id=None, playlist_title=None):
364 """Returns a playlist"""
365 video_info = {'_type': 'playlist',
368 video_info['id'] = playlist_id
370 video_info['title'] = playlist_title
373 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
375 Perform a regex search on the given string, using a single or a list of
376 patterns returning the first matching group.
377 In case of failure return a default value or raise a WARNING or a
378 RegexNotFoundError, depending on fatal, specifying the field name.
# pattern may be one regex (str or precompiled) or a list tried in order.
380 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
381 mobj = re.search(pattern, string, flags)
384 mobj = re.search(p, string, flags)
# Colorize the field name (ANSI blue) only on a real terminal, non-Windows.
388 if os.name != 'nt' and sys.stderr.isatty():
389 _name = '\033[0;34m%s\033[0m' % name
394 # return the first matching group
395 return next(g for g in mobj.groups() if g is not None)
# _NO_DEFAULT sentinel: an explicit default of None is a valid fallback.
396 elif default is not _NO_DEFAULT:
399 raise RegexNotFoundError('Unable to extract %s' % _name)
401 self._downloader.report_warning('unable to extract %s; '
402 'please report this issue on http://yt-dl.org/bug' % _name)
405 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
407 Like _search_regex, but strips HTML tags and unescapes entities.
409 res = self._search_regex(pattern, string, name, default, fatal, flags)
411 return clean_html(res).strip()
415 def _get_login_info(self):
417 Get the login info as (username, password)
418 It will look in the netrc file using the _NETRC_MACHINE value
419 If there's no info available, return (None, None)
421 if self._downloader is None:
426 downloader_params = self._downloader.params
428 # Attempt to use provided username and password or .netrc data
429 if downloader_params.get('username', None) is not None:
430 username = downloader_params['username']
431 password = downloader_params['password']
# Fall back to ~/.netrc, keyed by the extractor's _NETRC_MACHINE.
432 elif downloader_params.get('usenetrc', False):
434 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
439 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# .netrc problems are non-fatal: warn and proceed without credentials.
440 except (IOError, netrc.NetrcParseError) as err:
441 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
443 return (username, password)
445 def _get_tfa_info(self):
447 Get the two-factor authentication info
448 TODO - asking the user will be required for sms/phone verify
449 currently just uses the command line option
450 If there's no info available, return None
452 if self._downloader is None:
454 downloader_params = self._downloader.params
456 if downloader_params.get('twofactor', None) is not None:
457 return downloader_params['twofactor']
461 # Helper functions for extracting OpenGraph info
# _og_regexes(): a staticmethod in the full file (decorator elided).
# Builds two patterns because property= and content= can appear in
# either order inside the <meta> tag.
463 def _og_regexes(prop):
464 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
465 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
466 template = r'<meta[^>]+?%s[^>]+?%s'
468 template % (property_re, content_re),
469 template % (content_re, property_re),
472 def _og_search_property(self, prop, html, name=None, **kargs):
474 name = 'OpenGraph %s' % prop
475 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
478 return unescapeHTML(escaped)
# Thin OpenGraph accessors; thumbnail/description are non-fatal.
480 def _og_search_thumbnail(self, html, **kargs):
481 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
483 def _og_search_description(self, html, **kargs):
484 return self._og_search_property('description', html, fatal=False, **kargs)
486 def _og_search_title(self, html, **kargs):
487 return self._og_search_property('title', html, **kargs)
489 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
490 regexes = self._og_regexes('video') + self._og_regexes('video:url')
# secure=True prefers og:video:secure_url over the plain variants.
492 regexes = self._og_regexes('video:secure_url') + regexes
493 return self._html_search_regex(regexes, html, name, **kargs)
495 def _og_search_url(self, html, **kargs):
496 return self._og_search_property('url', html, **kargs)
498 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
499 if display_name is None:
# Lookahead matches itemprop/name/property in any attribute position.
501 return self._html_search_regex(
503 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
504 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
505 html, display_name, fatal=fatal, **kwargs)
507 def _dc_search_uploader(self, html):
# Dublin Core "dc.creator" meta tag -> uploader.
508 return self._html_search_meta('dc.creator', html, 'uploader')
510 def _rta_search(self, html):
511 # See http://www.rtalabel.org/index.php?content=howtofaq#single
512 if re.search(r'(?ix)<meta\s+name="rating"\s+'
513 r' content="RTA-5042-1996-1400-1577-RTA"',
518 def _media_rating_search(self, html):
519 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
520 rating = self._html_search_meta('rating', html)
# RATING_TABLE itself is elided from this view; unknown ratings -> None.
532 return RATING_TABLE.get(rating.lower(), None)
534 def _twitter_search_player(self, html):
535 return self._html_search_meta('twitter:player', html,
536 'twitter card player')
538 def _sort_formats(self, formats):
# Empty/missing formats list is a hard error for the extractor.
540 raise ExtractorError('No video formats found')
543 # TODO remove the following workaround
544 from ..utils import determine_ext
545 if not f.get('ext') and 'url' in f:
546 f['ext'] = determine_ext(f['url'])
# Derive a default preference from the protocol when none is set:
# plain http(s) (0) beats everything else (-0.1); f4f/f4m unsupported.
548 preference = f.get('preference')
549 if preference is None:
550 proto = f.get('protocol')
552 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
554 preference = 0 if proto in ['http', 'https'] else -0.1
555 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
# Audio-only formats: rank extensions, order flipped by
# --prefer-free-formats (free codecs first).
558 if f.get('vcodec') == 'none': # audio only
559 if self._downloader.params.get('prefer_free_formats'):
560 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
562 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
565 audio_ext_preference = ORDER.index(f['ext'])
567 audio_ext_preference = -1
# Video formats: same idea with container extensions.
569 if self._downloader.params.get('prefer_free_formats'):
570 ORDER = ['flv', 'mp4', 'webm']
572 ORDER = ['webm', 'flv', 'mp4']
574 ext_preference = ORDER.index(f['ext'])
577 audio_ext_preference = 0
# Sort key tuple (worst-to-best): unknown values become -1 so known
# values always win; quality/height/width dominate bitrates and sizes.
581 f.get('quality') if f.get('quality') is not None else -1,
582 f.get('height') if f.get('height') is not None else -1,
583 f.get('width') if f.get('width') is not None else -1,
585 f.get('tbr') if f.get('tbr') is not None else -1,
586 f.get('vbr') if f.get('vbr') is not None else -1,
587 f.get('abr') if f.get('abr') is not None else -1,
588 audio_ext_preference,
589 f.get('filesize') if f.get('filesize') is not None else -1,
590 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
# In-place sort; ascending, so the best format ends up last.
593 formats.sort(key=_formats_key)
595 def http_scheme(self):
596 """ Either "http:" or "https:", depending on the user's preferences """
599 if self._downloader.params.get('prefer_insecure', False)
602 def _proto_relative_url(self, url, scheme=None):
# Resolve protocol-relative URLs ("//host/...") with the user's scheme.
605 if url.startswith('//'):
607 scheme = self.http_scheme()
612 def _sleep(self, timeout, video_id, msg_template=None):
613 if msg_template is None:
614 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
615 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
619 def _extract_f4m_formats(self, manifest_url, video_id):
620 manifest = self._download_xml(
621 manifest_url, video_id, 'Downloading f4m manifest',
622 'Unable to download f4m manifest')
# One format per <media> element in the Adobe f4m namespace; format_id
# uses the bitrate when present, else the element index.
625 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
626 for i, media_el in enumerate(media_nodes):
627 tbr = int_or_none(media_el.attrib.get('bitrate'))
628 format_id = 'f4m-%d' % (i if tbr is None else tbr)
630 'format_id': format_id,
634 'width': int_or_none(media_el.attrib.get('width')),
635 'height': int_or_none(media_el.attrib.get('height')),
637 self._sort_formats(formats)
641 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
# First entry: the master playlist itself, as a meta "quality selection"
# format (resolution 'multiple').
643 'format_id': 'm3u8-meta',
648 'resolution': 'multiple',
649 'format_note': 'Quality selection URL',
652 m3u8_doc = self._download_webpage(m3u8_url, video_id)
# KEY=VALUE attribute pairs on #EXT-X-STREAM-INF lines (values may be
# quoted); the URI follows on the next non-comment line.
655 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
656 for line in m3u8_doc.splitlines():
657 if line.startswith('#EXT-X-STREAM-INF:'):
659 for m in kv_rex.finditer(line):
661 if v.startswith('"'):
663 last_info[m.group('key')] = v
664 elif line.startswith('#') or not line.strip():
667 if last_info is None:
# Bare URL line with no preceding STREAM-INF: minimal format dict.
668 formats.append({'url': line})
# BANDWIDTH is in bits/s; scale to KBit/s for tbr.
670 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
673 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
678 codecs = last_info.get('CODECS')
680 # TODO: looks like video codec is not always necessarily goes first
681 va_codecs = codecs.split(',')
683 f['vcodec'] = va_codecs[0].partition('.')[0]
684 if len(va_codecs) > 1 and va_codecs[1]:
685 f['acodec'] = va_codecs[1].partition('.')[0]
686 resolution = last_info.get('RESOLUTION')
688 width_str, height_str = resolution.split('x')
689 f['width'] = int(width_str)
690 f['height'] = int(height_str)
693 self._sort_formats(formats)
# NOTE(review): elided excerpt — decorators, docstring delimiters, and some
# branch lines are missing from this view; code left byte-identical.
697 class SearchInfoExtractor(InfoExtractor):
699 Base class for paged search queries extractors.
700 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
701 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Classmethod in the full file (decorator elided): prefix is empty,
# "all", or a positive decimal count; query is everything after ':'.
705 def _make_valid_url(cls):
706 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
# Classmethod (decorator elided): overrides InfoExtractor.suitable,
# matching against the generated search URL pattern instead of _VALID_URL.
709 def suitable(cls, url):
710 return re.match(cls._make_valid_url(), url) is not None
712 def _real_extract(self, query):
713 mobj = re.match(self._make_valid_url(), query)
715 raise ExtractorError('Invalid search query "%s"' % query)
717 prefix = mobj.group('prefix')
718 query = mobj.group('query')
# Empty prefix -> single result; "all" -> up to _MAX_RESULTS;
# a number n -> n results, warned about and capped at _MAX_RESULTS.
720 return self._get_n_results(query, 1)
721 elif prefix == 'all':
722 return self._get_n_results(query, self._MAX_RESULTS)
726 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
727 elif n > self._MAX_RESULTS:
728 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
729 n = self._MAX_RESULTS
730 return self._get_n_results(query, n)
732 def _get_n_results(self, query, n):
733 """Get a specified number of results for a query"""
# Abstract hook: concrete search extractors must implement this.
734 raise NotImplementedError("This method must be implemented by subclasses")
# SEARCH_KEY: a property in the full file (decorator elided).
737 def SEARCH_KEY(self):
738 return self._SEARCH_KEY