1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
18 compat_urllib_parse_urlparse,
# Module-level sentinel: lets helpers such as _search_regex() distinguish
# "caller supplied no default" from an explicit default of None.
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_referer  HTTP Referer header value to set.
                    * http_method   HTTP method to use for the download.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * http_post_data  Additional data to send with a POST
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}"),
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # downloader may be None; set_downloader() stores it either way.
        self.set_downloader(downloader)
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): first parameter is cls — presumably decorated with
        # @classmethod on a line not shown here; confirm.

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None
    def _match_id(cls, url):
        # Compile (and cache per-class) _VALID_URL, then match the URL.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        # NOTE(review): the return of the matched id group is not visible in
        # this excerpt — confirm against the full implementation.
185 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Delegates to the subclass hook _real_initialize().
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates to the subclass hook _real_extract().
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Kept for later use by to_screen(), _request_webpage(), etc.
        self._downloader = downloader
203 def _real_initialize(self):
204 """Real initialization process. Redefine in subclasses."""
207 def _real_extract(self, url):
208 """Real extraction process. Redefine in subclasses."""
213 """A string for getting the InfoExtractor with get_info_extractor"""
214 return cls.__name__[:-2]
218 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # note=None -> default "Downloading webpage" line; note=False -> silent.
        # NOTE(review): several guard/branch lines are not visible in this
        # excerpt; indentation below is reconstructed.
            self.report_download_webpage(video_id)
        elif note is not False:
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # On network failure: raise when fatal, otherwise warn.
            errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
                self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): the failure guard between the two calls is not visible
        # in this excerpt.
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
        return (content, urlh)
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
        # Decode the response body, preferring (in order) the Content-Type
        # charset, an HTML <meta charset> declaration, and a UTF-16 BOM;
        # also implements the --dump-pages / --write-pages debug options.
        # NOTE(review): several guard lines are missing from this excerpt;
        # indentation is reconstructed.
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Fall back to a charset declared in the first KiB of the document.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
            # Keep the dump filename short, disambiguated by an md5 suffix.
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

            content = webpage_bytes.decode(encoding, 'replace')
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect Websense-filtered pages and fail with a helpful message.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): the unwrapping of res -> (content, handle) and the
        # failure path are not visible in this excerpt.
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        # A False result signals a non-fatal download failure upstream.
        if xml_string is False:
        # NOTE(review): the guard lines around the transform are not visible
        # in this excerpt; transform_source rewrites the raw document.
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        # Propagate the non-fatal failure marker unchanged.
        if (not fatal) and json_string is False:
        # NOTE(review): signature tail (fatal=True) and some guard lines are
        # not visible in this excerpt.
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            # Raise when fatal, otherwise just warn about the parse failure.
            errmsg = '%s: Failed to parse JSON ' % video_id
            raise ExtractorError(errmsg, cause=ve)
            self.report_warning(errmsg + str(ve))
358 def report_warning(self, msg, video_id=None):
359 idstr = '' if video_id is None else '%s: ' % video_id
360 self._downloader.report_warning(
361 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        # All user-visible status lines from this IE are routed through here.
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
367 def report_extraction(self, id_or_name):
368 """Report information extraction."""
369 self.to_screen('%s: Extracting information' % id_or_name)
371 def report_download_webpage(self, video_id):
372 """Report webpage download."""
373 self.to_screen('%s: Downloading webpage' % video_id)
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')
    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')
    # Methods for following #608
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): the dict continuation ('url', 'ie_key') and the return
        # statement are not visible in this excerpt.
        if video_id is not None:
            video_info['id'] = video_id
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): the dict continuation, the guards for the optional id
        # and title, and the return statement are not visible in this excerpt.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # NOTE(review): loop/else lines are missing from this excerpt;
        # indentation is reconstructed.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            # Otherwise pattern is a list: try each until one matches.
            mobj = re.search(p, string, flags)
        # Colorize the field name on capable (non-Windows, tty) terminals.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s; '
                'please report this issue on http://yt-dl.org/bug' % _name)
    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        # NOTE(review): the None-guard around the cleanup is not visible in
        # this excerpt.
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        # NOTE(review): the (None, None) initialisation/early-return lines and
        # the netrc success branch are not visible in this excerpt.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a broken .netrc only produces a warning.
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)
    def _get_tfa_info(self):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        # NOTE(review): the early-return lines are not visible in this excerpt.
        if self._downloader is None:

        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']
    # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        # Build regexes matching <meta property="og:PROP" content="..."> with
        # the two attributes in either order.
        # NOTE(review): the enclosing return-list lines are not visible here.
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),
    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for the og:<prop> meta value, unescaping HTML entities."""
        # NOTE(review): the name-default guard and the None-check on the match
        # are not visible in this excerpt.
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)
    def _og_search_thumbnail(self, html, **kargs):
        # Non-fatal: missing og:image simply yields None.
        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
    def _og_search_description(self, html, **kargs):
        # Non-fatal: missing og:description simply yields None.
        return self._og_search_property('description', html, fatal=False, **kargs)
    def _og_search_title(self, html, **kargs):
        # Fatal by default (titles are mandatory in info dicts).
        return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        # NOTE(review): the guard for the secure variant is not visible here;
        # og:video:secure_url patterns are tried first when secure is set.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
    def _og_search_url(self, html, **kargs):
        # Canonical page URL from og:url.
        return self._og_search_property('url', html, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        # Find a <meta itemprop/name/property=NAME content=...> value.
        # NOTE(review): the display_name default assignment and the raw-string
        # opening of the regex are not visible in this excerpt.
        if display_name is None:
        return self._html_search_regex(
            (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
            [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal, **kwargs)
    def _dc_search_uploader(self, html):
        # Dublin Core creator meta tag -> uploader.
        return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # NOTE(review): the regex call tail and the numeric age-limit returns
        # are not visible in this excerpt.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                r' content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # NOTE(review): the RATING_TABLE mapping (rating string -> age limit)
        # is defined on lines not visible in this excerpt.
        return RATING_TABLE.get(rating.lower(), None)
    def _twitter_search_player(self, html):
        # Twitter Card player URL.
        return self._html_search_meta('twitter:player', html,
            'twitter card player')
    def _sort_formats(self, formats):
        # Sort formats in-place from worst to best using a composite key.
        # NOTE(review): the empty-check, the key-function def line and several
        # branch lines are not visible in this excerpt; indentation is
        # reconstructed.
            raise ExtractorError('No video formats found')

            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Derive a protocol-based preference when none was set explicitly.
            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                    ORDER = ['webm', 'flv', 'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0

            # Composite key: quality, resolution, bitrates, sizes, source pref.
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
        formats.sort(key=_formats_key)
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # NOTE(review): the surrounding return expression is not visible here.
            if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        # Turn a protocol-relative '//host/path' URL into an absolute one,
        # defaulting the scheme to http_scheme().
        # NOTE(review): the None-guards and return lines are not visible here.
        if url.startswith('//'):
                scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        """Announce a wait and sleep for timeout seconds."""
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        # NOTE(review): the to_screen()/time.sleep() calls follow on lines not
        # visible in this excerpt.
    def _extract_f4m_formats(self, manifest_url, video_id):
        # Parse an Adobe HDS (f4m) manifest into a sorted list of format dicts.
        # NOTE(review): the formats-list setup, dict continuation and return
        # are not visible in this excerpt.
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest')

        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            # Fall back to the list index when no bitrate is advertised.
            format_id = 'f4m-%d' % (i if tbr is None else tbr)
                'format_id': format_id,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
        self._sort_formats(formats)
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        # Parse an HLS master playlist into format dicts.
        # NOTE(review): list initialisation, several guard lines and the
        # return statement are not visible in this excerpt; indentation is
        # reconstructed.

        # First entry points at the master playlist itself.
            'format_id': 'm3u8-meta',
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',

        # Resolve relative variant URLs against the manifest URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(m3u8_url, video_id)
        # Attribute-list parser for #EXT-X-STREAM-INF lines.
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                if last_info is None:
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                codecs = last_info.get('CODECS')
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
        self._sort_formats(formats)
738 def _live_title(self, name):
739 """ Generate the title for a live video """
740 now = datetime.datetime.now()
741 now_str = now.strftime("%Y-%m-%d %H:%M")
742 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        """Convert v to int via int_or_none; on failure raise or warn."""
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): stray debug print — writes to stdout on every call
            # that passes get_attr; consider removing.
            print(getattr(v, kwargs['get_attr']))
        # NOTE(review): the res-is-None guard and the return are not visible
        # in this excerpt.
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        """Convert v to float via float_or_none; on failure raise or warn."""
        res = float_or_none(v, **kwargs)
        # NOTE(review): the res-is-None guard and the return are not visible
        # in this excerpt.
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
    def _make_valid_url(cls):
        # Pattern: "<search key><empty|N|all>:<query>".
        # NOTE(review): presumably a @classmethod — decorator not visible here.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
    def suitable(cls, url):
        # NOTE(review): presumably a @classmethod — decorator not visible here.
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse "<key>[N|all]:query" and dispatch to _get_n_results with the
        # requested result count.
        # NOTE(review): the None-guard, the numeric-prefix branch head and the
        # n parsing lines are not visible in this excerpt.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix: a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp oversized requests to the extractor's maximum.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Abstract hook: concrete search extractors must override this.
        raise NotImplementedError("This method must be implemented by subclasses")
    def SEARCH_KEY(self):
        # NOTE(review): presumably a @property — decorator not visible here.
        return self._SEARCH_KEY