1 from __future__ import unicode_literals
12 import xml.etree.ElementTree
17 compat_urllib_parse_urlparse,
29 _NO_DEFAULT = object()
32 class InfoExtractor(object):
33 """Information Extractor class.
35 Information extractors are the classes that, given a URL, extract
36 information about the video (or videos) the URL refers to. This
37 information includes the real video URL, the video title, author and
38 others. The information is stored in a dictionary which is then
39 passed to the FileDownloader. The FileDownloader processes this
40 information possibly downloading the video to the file system, among
41 other possible outcomes.
43 The dictionaries must include the following fields:
46 title: Video title, unescaped.
48 Additionally, it must contain either a formats entry or a url one:
50 formats: A list of dictionaries for each format available, ordered
51 from worst to best quality.
54 * url Mandatory. The URL of the video file
55 * ext Will be calculated from url if missing
56 * format A human-readable description of the format
57 ("mp4 container with h264/opus").
58 Calculated from the format_id, width, height.
59 and format_note fields if missing.
60 * format_id A short description of the format
61 ("mp4_h264_opus" or "19").
62 Technically optional, but strongly recommended.
63 * format_note Additional info about the format
64 ("3D" or "DASH video")
65 * width Width of the video, if known
66 * height Height of the video, if known
67 * resolution Textual description of width and height
68 * tbr Average bitrate of audio and video in KBit/s
69 * abr Average audio bitrate in KBit/s
70 * acodec Name of the audio codec in use
71 * asr Audio sampling rate in Hertz
72 * vbr Average video bitrate in KBit/s
73 * vcodec Name of the video codec in use
74 * container Name of the container format
75 * filesize The number of bytes, if known in advance
76 * filesize_approx An estimate for the number of bytes
77 * player_url SWF Player URL (used for rtmpdump).
78 * protocol The protocol that will be used for the actual
80 "http", "https", "rtsp", "rtmp", "m3u8" or so.
81 * preference Order number of this format. If this field is
82 present and not None, the formats get sorted
83 by this field, regardless of all other values.
84 -1 for default (order by other properties),
85 -2 or smaller for less than default.
86 * quality Order number of the video quality of this
87 format, irrespective of the file format.
88 -1 for default (order by other properties),
89 -2 or smaller for less than default.
90 * http_referer HTTP Referer header value to set.
91 * http_method HTTP method to use for the download.
92 * http_headers A dictionary of additional HTTP headers
93 to add to the request.
94 * http_post_data Additional data to send with a POST
97 ext: Video filename extension.
98 format: The video format, defaults to ext (used for --get-format)
99 player_url: SWF Player URL (used for rtmpdump).
101 The following fields are optional:
103 display_id An alternative identifier for the video, not necessarily
104 unique, but available before title. Typically, id is
105 something like "4234987", title "Dancing naked mole rats",
106 and display_id "dancing-naked-mole-rats"
107 thumbnails: A list of dictionaries, with the following entries:
109 * "width" (optional, int)
110 * "height" (optional, int)
111                     * "resolution" (optional, string "{width}x{height}",
113 thumbnail: Full URL to a video thumbnail image.
114 description: One-line video description.
115 uploader: Full name of the video uploader.
116 timestamp: UNIX timestamp of the moment the video became available.
117 upload_date: Video upload date (YYYYMMDD).
118 If not explicitly set, calculated from timestamp.
119 uploader_id: Nickname or id of the video uploader.
120 location: Physical location where the video was filmed.
121 subtitles: The subtitle file contents as a dictionary in the format
122 {language: subtitles}.
123 duration: Length of the video in seconds, as an integer.
124 view_count: How many users have watched the video on the platform.
125 like_count: Number of positive ratings of the video
126 dislike_count: Number of negative ratings of the video
127 comment_count: Number of comments on the video
128 age_limit: Age restriction for the video, as an integer (years)
129 webpage_url: The url to the video webpage, if given to youtube-dl it
130 should allow to get the same result again. (It will be set
131 by YoutubeDL if it's missing)
132 categories: A list of categories that the video falls in, for example
134 is_live: True, False, or None (=unknown). Whether this video is a
135 live stream that goes on instead of a fixed-length video.
137 Unless mentioned otherwise, the fields should be Unicode strings.
139 Subclasses of this one should re-define the _real_initialize() and
140 _real_extract() methods and define a _VALID_URL regexp.
141 Probably, they should also be added to the list of extractors.
143 Finally, the _WORKING attribute should be set to False for broken IEs
144 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # _ready gates initialize(): _real_initialize() (login etc.) must run
    # at most once per instance.
    self._ready = False
    self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Deliberately look only in cls.__dict__ (not via has/getattr): we want
    # to know whether the regexp was cached for *this* class, whereas
    # getattr would also find a superclass's cached value.
    compiled = cls.__dict__.get('_VALID_URL_RE')
    if compiled is None:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
def working(cls):
    """Getter method for _WORKING."""
    # NOTE(review): the def/return lines were lost in this paste; this is
    # the conventional accessor for the _WORKING class flag — confirm
    # against the original file.
    return cls._WORKING
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Run the (possibly expensive) real initialization only once.
    if not self._ready:
        self._real_initialize()
        self._ready = True
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # Make sure authentication/initialization happened before extracting.
    self.initialize()
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Keep a reference so helpers can report progress/warnings through it.
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    pass
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    pass
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    # By convention extractor class names end in "IE"; strip that suffix.
    return cls.__name__[:-2]
@property
def IE_NAME(self):
    """Extractor name: the class name minus the trailing "IE" suffix."""
    return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle """
    # note semantics: None -> default "Downloading webpage" message,
    # False -> silent, anything else -> custom message.
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen('%s' % (note,))
        else:
            self.to_screen('%s: %s' % (video_id, note))
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'
        errmsg = '%s: %s' % (errnote, compat_str(err))
        if fatal:
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            # Non-fatal: warn and signal failure to the caller with False.
            self._downloader.report_warning(errmsg)
            return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage only returns False in non-fatal mode.
        assert not fatal
        return False
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Charset detection order: HTTP Content-Type header, then an HTML
    # <meta charset> in the first 1 KiB, then a UTF-16 BOM, else UTF-8.
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'
    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen('Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        basen = '%s_%s' % (video_id, url)
        if len(basen) > 240:
            # Keep filenames within filesystem limits; disambiguate with a hash.
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)
    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Unknown codec name from the page/header: fall back to UTF-8.
        content = webpage_bytes.decode('utf-8', 'replace')

    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in content[:512]):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)

    return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string """
    res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    if res is False:
        # Propagate the non-fatal failure marker.
        return res
    else:
        content, _ = res
        return content
def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True):
    """Return the xml as an xml.etree.ElementTree.Element"""
    xml_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if xml_string is False:
        # Download failed in non-fatal mode.
        return xml_string
    if transform_source:
        # Optional hook to repair/clean the raw document before parsing.
        xml_string = transform_source(xml_string)
    return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
                   fatal=True):
    """Download url_or_request and return the parsed JSON object."""
    json_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if (not fatal) and json_string is False:
        return None
    if transform_source:
        # Optional hook to repair/clean the raw document before parsing.
        json_string = transform_source(json_string)
    try:
        return json.loads(json_string)
    except ValueError as ve:
        raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with the IE name and, when given, the video id."""
    idstr = '%s: ' % video_id if video_id is not None else ''
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % (id_or_name,)
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % (video_id,)
    self.to_screen(message)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)
def report_login(self):
    """Report attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)
354     # Methods for following #608
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    if video_id is not None:
        video_info['id'] = video_id
    return video_info
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    video_info = {'_type': 'playlist',
                  'entries': entries}
    # id/title are only set when truthy, matching callers that pass ''.
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    return video_info
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # A list of patterns: first match wins.
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name in blue on capable terminals.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
            'please report this issue on http://yt-dl.org/bug' % _name)
        return None
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    if res:
        return clean_html(res).strip()
    else:
        return None
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    """
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # Best-effort: a broken/missing .netrc only warns.
            self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

    return (username, password)
def _get_tfa_info(self):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    """
    if self._downloader is None:
        return None
    downloader_params = self._downloader.params

    if downloader_params.get('twofactor', None) is not None:
        return downloader_params['twofactor']

    return None
464 # Helper functions for extracting OpenGraph info
def _og_regexes(prop):
    """Build regexes matching an OpenGraph <meta> tag for prop, with property/content in either order."""
    content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
    property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
    template = r'<meta[^>]+?%s[^>]+?%s'
    return [
        template % (property_re, content_re),
        template % (content_re, property_re),
    ]
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search html for the OpenGraph property prop and return its unescaped content."""
    if name is None:
        name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    if escaped is None:
        return None
    return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Return the og:image URL from html (non-fatal lookup)."""
    return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    """Return the og:description text from html (non-fatal lookup)."""
    return self._og_search_property('description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    """Return the og:title text from html."""
    return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Return the OpenGraph video URL, preferring og:video:secure_url when secure is True."""
    regexes = self._og_regexes('video') + self._og_regexes('video:url')
    if secure:
        regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    """Return the og:url value from html."""
    return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Return the content attribute of the <meta> tag whose itemprop/name/property equals name."""
    if display_name is None:
        display_name = name
    return self._html_search_regex(
        r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
        html, display_name, fatal=fatal, **kwargs)
def _dc_search_uploader(self, html):
    """Look up the Dublin Core creator meta tag as the uploader name."""
    return self._html_search_meta('dc.creator', html, 'uploader')
def _rta_search(self, html):
    """Return 18 if the page carries the RTA adult-content label, else 0."""
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    if re.search(r'(?ix)<meta\s+name="rating"\s+'
                 r'     content="RTA-5042-1996-1400-1577-RTA"',
                 html):
        return 18
    return 0
def _media_rating_search(self, html):
    """Map a <meta name="rating"> value to an age limit, or None if absent/unknown."""
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)

    if not rating:
        return None

    # NOTE(review): table reconstructed from upstream youtube-dl — confirm values.
    RATING_TABLE = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    return RATING_TABLE.get(rating.lower(), None)
def _twitter_search_player(self, html):
    """Return the Twitter card player URL declared in html, if any."""
    return self._html_search_meta(
        'twitter:player', html, 'twitter card player')
def _sort_formats(self, formats):
    """Sort formats in place from worst to best quality.

    NOTE(review): parts of the key function were lost in this paste and
    were reconstructed around the visible fields — confirm against upstream.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        # Missing numeric fields sort as -1 (worst).
        return (
            preference,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            ext_preference,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('format_id'),
        )
    formats.sort(key=_formats_key)
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    return (
        'http:'
        if self._downloader.params.get('prefer_insecure', False)
        else 'https:')
def _proto_relative_url(self, url, scheme=None):
    """Prepend scheme (default: the user's preferred scheme) to a protocol-relative //url."""
    if url is None:
        return url
    if url.startswith('//'):
        if scheme is None:
            scheme = self.http_scheme()
        return scheme + url
    else:
        return url
def _sleep(self, timeout, video_id, msg_template=None):
    """Announce a wait on screen and sleep for timeout seconds."""
    import time  # local import: the visible file header does not show `time`
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    self.to_screen(msg)
    time.sleep(timeout)
def _extract_f4m_formats(self, manifest_url, video_id):
    """Download an Adobe f4m manifest and build a sorted formats list from its <media> nodes."""
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        # Prefer the bitrate as a stable id; fall back to the node index.
        format_id = 'f4m-%d' % (i if tbr is None else tbr)
        formats.append({
            'format_id': format_id,
            'url': manifest_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
        })
    self._sort_formats(formats)

    return formats
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    """Parse an HLS master playlist into a sorted formats list.

    NOTE(review): several lines were lost in this paste and were
    reconstructed from upstream youtube-dl — confirm before relying on it.
    """
    # Always include the master playlist itself as a meta format so the
    # downloader's own quality selection remains possible.
    formats = [{
        'format_id': 'm3u8-meta',
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    format_url = lambda u: (
        u
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))

    m3u8_doc = self._download_webpage(m3u8_url, video_id)
    last_info = None
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            # Other tags and blank lines are ignored.
            continue
        else:
            # A URI line; it belongs to the preceding #EXT-X-STREAM-INF, if any.
            if last_info is None:
                formats.append({'url': format_url(line)})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

            f = {
                'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                'url': format_url(line.strip()),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: looks like video codec is not always necessarily goes first
                va_codecs = codecs.split(',')
                f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            formats.append(f)
    self._sort_formats(formats)

    return formats
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix -> single result, 'all' -> everything, digits -> that many.
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY