1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
18 compat_urllib_parse_urlparse,
30 _NO_DEFAULT = object()
33 class InfoExtractor(object):
34 """Information Extractor class.
36 Information extractors are the classes that, given a URL, extract
37 information about the video (or videos) the URL refers to. This
38 information includes the real video URL, the video title, author and
39 others. The information is stored in a dictionary which is then
40 passed to the FileDownloader. The FileDownloader processes this
41 information possibly downloading the video to the file system, among
42 other possible outcomes.
44 The dictionaries must include the following fields:
47 title: Video title, unescaped.
49 Additionally, it must contain either a formats entry or a url one:
51 formats: A list of dictionaries for each format available, ordered
52 from worst to best quality.
55 * url Mandatory. The URL of the video file
56 * ext Will be calculated from url if missing
57 * format A human-readable description of the format
58 ("mp4 container with h264/opus").
59 Calculated from the format_id, width, height.
60 and format_note fields if missing.
61 * format_id A short description of the format
62 ("mp4_h264_opus" or "19").
63 Technically optional, but strongly recommended.
64 * format_note Additional info about the format
65 ("3D" or "DASH video")
66 * width Width of the video, if known
67 * height Height of the video, if known
68 * resolution Textual description of width and height
69 * tbr Average bitrate of audio and video in KBit/s
70 * abr Average audio bitrate in KBit/s
71 * acodec Name of the audio codec in use
72 * asr Audio sampling rate in Hertz
73 * vbr Average video bitrate in KBit/s
74 * vcodec Name of the video codec in use
75 * container Name of the container format
76 * filesize The number of bytes, if known in advance
77 * filesize_approx An estimate for the number of bytes
78 * player_url SWF Player URL (used for rtmpdump).
79 * protocol The protocol that will be used for the actual
81 "http", "https", "rtsp", "rtmp", "m3u8" or so.
82 * preference Order number of this format. If this field is
83 present and not None, the formats get sorted
84 by this field, regardless of all other values.
85 -1 for default (order by other properties),
86 -2 or smaller for less than default.
87 * quality Order number of the video quality of this
88 format, irrespective of the file format.
89 -1 for default (order by other properties),
90 -2 or smaller for less than default.
91 * http_referer HTTP Referer header value to set.
92 * http_method HTTP method to use for the download.
93 * http_headers A dictionary of additional HTTP headers
94 to add to the request.
95 * http_post_data Additional data to send with a POST
98 ext: Video filename extension.
99 format: The video format, defaults to ext (used for --get-format)
100 player_url: SWF Player URL (used for rtmpdump).
102 The following fields are optional:
104 display_id An alternative identifier for the video, not necessarily
105 unique, but available before title. Typically, id is
106 something like "4234987", title "Dancing naked mole rats",
107 and display_id "dancing-naked-mole-rats"
108 thumbnails: A list of dictionaries, with the following entries:
110 * "width" (optional, int)
111 * "height" (optional, int)
112 * "resolution" (optional, string "{width}x{height"},
114 thumbnail: Full URL to a video thumbnail image.
115 description: One-line video description.
116 uploader: Full name of the video uploader.
117 timestamp: UNIX timestamp of the moment the video became available.
118 upload_date: Video upload date (YYYYMMDD).
119 If not explicitly set, calculated from timestamp.
120 uploader_id: Nickname or id of the video uploader.
121 location: Physical location where the video was filmed.
122 subtitles: The subtitle file contents as a dictionary in the format
123 {language: subtitles}.
124 duration: Length of the video in seconds, as an integer.
125 view_count: How many users have watched the video on the platform.
126 like_count: Number of positive ratings of the video
127 dislike_count: Number of negative ratings of the video
128 comment_count: Number of comments on the video
129 age_limit: Age restriction for the video, as an integer (years)
130 webpage_url: The url to the video webpage, if given to youtube-dl it
131 should allow to get the same result again. (It will be set
132 by YoutubeDL if it's missing)
133 categories: A list of categories that the video falls in, for example
135 is_live: True, False, or None (=unknown). Whether this video is a
136 live stream that goes on instead of a fixed-length video.
138 Unless mentioned otherwise, the fields should be Unicode strings.
140 Subclasses of this one should re-define the _real_initialize() and
141 _real_extract() methods and define a _VALID_URL regexp.
142 Probably, they should also be added to the list of extractors.
144 Finally, the _WORKING attribute should be set to False for broken IEs
145 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one initialization line is elided in this excerpt.
        self.set_downloader(downloader)
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): first parameter is `cls` — presumably decorated with
        # @classmethod (decorator not visible in this excerpt).

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None
170 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard line is elided here in this excerpt.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): an initialization call is elided here in this excerpt.
        return self._real_extract(url)
184 def set_downloader(self, downloader):
185 """Sets the downloader for this IE."""
186 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Default implementation does nothing (remainder elided in this excerpt).
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Default implementation does nothing (remainder elided in this excerpt).
198 """A string for getting the InfoExtractor with get_info_extractor"""
199 return cls.__name__[:-2]
203 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # NOTE(review): several guard lines (the note-is-None check, the try:
        # opener, the errnote/fatal guards) are elided in this excerpt;
        # indentation below is reconstructed.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen('%s' % (note,))
            self.to_screen('%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            # Re-raise with the original traceback when fatal; warn otherwise
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """
        # NOTE(review): several lines of this method are elided in this
        # excerpt (failure guards, BOM handling, try: openers); indentation
        # below is reconstructed.

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = m.group(1)
        # ...otherwise look for a <meta charset=...> near the top of the page
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
        if self._downloader.params.get('dump_intermediate_pages', False):
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            # base64 keeps binary responses printable on the console
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
            # Keep the dump filename within filesystem limits; append an md5
            # of the full name so truncated names stay unique
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)
        content = webpage_bytes.decode(encoding, 'replace')
        content = webpage_bytes.decode('utf-8', 'replace')

        # Detect Websense-style network filtering and fail with a clear message
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        # NOTE(review): the unpacking of (content, handle) and the return
        # statement are elided in this excerpt.
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        # NOTE(review): the failure-return and the transform_source guard
        # are elided in this excerpt.
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
        # NOTE(review): the tail of the signature, the failure-return, the
        # transform_source guard and the try: opener are elided in this
        # excerpt.
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if (not fatal) and json_string is False:
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            # Chain the original JSON parse error for debuggability
            raise ExtractorError('Failed to download JSON', cause=ve)
330 def report_warning(self, msg, video_id=None):
331 idstr = '' if video_id is None else '%s: ' % video_id
332 self._downloader.report_warning(
333 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
335 def to_screen(self, msg):
336 """Print msg to screen, prefixing it with '[ie_name]'"""
337 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
339 def report_extraction(self, id_or_name):
340 """Report information extraction."""
341 self.to_screen('%s: Extracting information' % id_or_name)
343 def report_download_webpage(self, video_id):
344 """Report webpage download."""
345 self.to_screen('%s: Downloading webpage' % video_id)
347 def report_age_confirmation(self):
348 """Report attempt to confirm age."""
349 self.to_screen('Confirming age')
351 def report_login(self):
352 """Report attempt to log in."""
353 self.to_screen('Logging in')
    # Methods for following issue #608
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # NOTE(review): takes no `self` — presumably decorated @staticmethod
        # (decorator not visible in this excerpt); the dict continuation and
        # the return statement are also elided.
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        if video_id is not None:
            video_info['id'] = video_id
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # NOTE(review): takes no `self` — presumably decorated @staticmethod;
        # the dict continuation, the None-guards around the assignments and
        # the return statement are elided in this excerpt.
        video_info = {'_type': 'playlist',
        video_info['id'] = playlist_id
        video_info['title'] = playlist_title
    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # NOTE(review): the list-of-patterns loop, several else-branches and
        # the default-return are elided in this excerpt.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            mobj = re.search(p, string, flags)
        # Colorize the field name on capable terminals (not on Windows)
        if os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
        raise RegexNotFoundError('Unable to extract %s' % _name)
        self._downloader.report_warning('unable to extract %s; '
            'please report this issue on http://yt-dl.org/bug' % _name)
    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        # NOTE(review): the None-result guard is elided in this excerpt.
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        # NOTE(review): the username/password initialization, the early
        # return and the try: opener around the netrc lookup are elided in
        # this excerpt.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and fall through
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)
    def _get_tfa_info(self):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        # NOTE(review): the early-return bodies are elided in this excerpt.
        if self._downloader is None:
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']
465 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        # Build regexes matching an OpenGraph <meta> tag for *prop*.
        # NOTE(review): takes no `self` — presumably @staticmethod; the
        # surrounding return-list brackets are elided in this excerpt.
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        # Attribute order inside the <meta> tag is not guaranteed, so both
        # orderings are generated
        template % (property_re, content_re),
        template % (content_re, property_re),
    def _og_search_property(self, prop, html, name=None, **kargs):
        # Search *html* for the OpenGraph property *prop* and return its
        # unescaped value.
        # NOTE(review): the default-name guard and a None-result guard are
        # elided in this excerpt.
        name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)
484 def _og_search_thumbnail(self, html, **kargs):
485 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
487 def _og_search_description(self, html, **kargs):
488 return self._og_search_property('description', html, fatal=False, **kargs)
490 def _og_search_title(self, html, **kargs):
491 return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        # Extract the og:video URL from *html*.
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        # NOTE(review): an `if secure:` guard line is elided here in this
        # excerpt, so the secure_url prepend is conditional in the original.
        regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
499 def _og_search_url(self, html, **kargs):
500 return self._og_search_property('url', html, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        # Return the content attribute of a <meta> tag whose
        # itemprop/name/property equals *name*.
        # NOTE(review): the display_name default assignment and the raw-string
        # regex opener are elided in this excerpt.
        if display_name is None:
        return self._html_search_regex(
            (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
            [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal, **kwargs)
511 def _dc_search_uploader(self, html):
512 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        # Check for the RTA (Restricted To Adults) label in the page's
        # rating <meta> tag.
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # NOTE(review): the regex's subject argument and the return
        # statements are elided in this excerpt.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        # Map the page's "rating" <meta> value through a rating table.
        # NOTE(review): the RATING_TABLE definition and a None-guard are
        # elided in this excerpt.
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        return RATING_TABLE.get(rating.lower(), None)
538 def _twitter_search_player(self, html):
539 return self._html_search_meta('twitter:player', html,
540 'twitter card player')
    def _sort_formats(self, formats):
        # Sort *formats* in place from worst to best quality.
        # NOTE(review): many lines are elided in this excerpt (the empty-list
        # guard, else-branches, and parts of the sort-key tuple); indentation
        # below is reconstructed.
        raise ExtractorError('No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain http(s) downloads are preferred over other protocols
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                    ORDER = ['webm', 'flv', 'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0

            # Sort-key tuple entries (missing values sort as -1, i.e. worst):
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
        formats.sort(key=_formats_key)
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # NOTE(review): the surrounding conditional-expression lines of the
        # return statement are elided in this excerpt.
        if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        # Resolve a protocol-relative URL ('//host/path') against *scheme*.
        # NOTE(review): the None-guard, the scheme-resolution guard and the
        # return statements are elided in this excerpt.
        if url.startswith('//'):
            scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        # Announce a wait to the user before sleeping.
        # NOTE(review): the to_screen/sleep calls are elided in this excerpt.
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    def _extract_f4m_formats(self, manifest_url, video_id):
        # Download an f4m (Adobe HDS) manifest and build a formats list from
        # its <media> nodes.
        # NOTE(review): the formats-list initialization, parts of each format
        # dict and the return statement are elided in this excerpt.
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest')

        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            # Fall back to the node index when no bitrate is advertised
            format_id = 'f4m-%d' % (i if tbr is None else tbr)
                'format_id': format_id,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
        self._sort_formats(formats)
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        # Download an m3u8 (HLS) playlist and build a formats list from its
        # #EXT-X-STREAM-INF entries.
        # NOTE(review): many lines are elided in this excerpt (the meta-format
        # dict, the kv_rex assignment, several guards and the return);
        # indentation below is reconstructed.
            'format_id': 'm3u8-meta',
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',

        # Resolve playlist-relative entry URLs against the playlist URL
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(m3u8_url, video_id)
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Collect the attribute list of the upcoming variant stream
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                if last_info is None:
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                codecs = last_info.get('CODECS')
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
        self._sort_formats(formats)
709 def _live_title(self, name):
710 """ Generate the title for a live video """
711 now = datetime.datetime.now()
712 now_str = now.strftime("%Y-%m-%d %H:%M")
713 return name + ' ' + now_str
716 class SearchInfoExtractor(InfoExtractor):
718 Base class for paged search queries extractors.
719 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
720 Instances should define _SEARCH_KEY and _MAX_RESULTS.
    def _make_valid_url(cls):
        # Pattern: '<_SEARCH_KEY><prefix>:<query>' where prefix is empty, a
        # positive integer, or 'all'. Takes `cls` — presumably decorated
        # @classmethod (decorator not visible in this excerpt).
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
    def suitable(cls, url):
        # Accept any URL matching the generated search pattern. Takes `cls` —
        # presumably decorated @classmethod (decorator not visible here).
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse the search query and dispatch on its prefix:
        # '' -> first result, 'all' -> _MAX_RESULTS, digits -> that many.
        # NOTE(review): the None-guard, the empty-prefix branch and the
        # numeric-prefix conversion lines are elided in this excerpt.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
751 def _get_n_results(self, query, n):
752 """Get a specified number of results for a query"""
753 raise NotImplementedError("This method must be implemented by subclasses")
    def SEARCH_KEY(self):
        # Public accessor for the subclass-defined search key; presumably
        # decorated @property (decorator not visible in this excerpt).
        return self._SEARCH_KEY