1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
23 compat_urllib_parse_urlparse,
24 compat_urllib_request,
48 class InfoExtractor(object):
49 """Information Extractor class.
51 Information extractors are the classes that, given a URL, extract
52 information about the video (or videos) the URL refers to. This
53 information includes the real video URL, the video title, author and
54 others. The information is stored in a dictionary which is then
55 passed to the YoutubeDL. The YoutubeDL processes this
56 information possibly downloading the video to the file system, among
57 other possible outcomes.
59 The type field determines the type of the result.
60 By far the most common value (and the default if _type is missing) is
61 "video", which indicates a single video.
63 For a video, the dictionaries must include the following fields:
66 title: Video title, unescaped.
68 Additionally, it must contain either a formats entry or a url one:
70 formats: A list of dictionaries for each format available, ordered
71 from worst to best quality.
74 * url Mandatory. The URL of the video file
75 * ext Will be calculated from URL if missing
76 * format A human-readable description of the format
77 ("mp4 container with h264/opus").
78 Calculated from the format_id, width, height.
79 and format_note fields if missing.
80 * format_id A short description of the format
81 ("mp4_h264_opus" or "19").
82 Technically optional, but strongly recommended.
83 * format_note Additional info about the format
84 ("3D" or "DASH video")
85 * width Width of the video, if known
86 * height Height of the video, if known
87 * resolution Textual description of width and height
88 * tbr Average bitrate of audio and video in KBit/s
89 * abr Average audio bitrate in KBit/s
90 * acodec Name of the audio codec in use
91 * asr Audio sampling rate in Hertz
92 * vbr Average video bitrate in KBit/s
94 * vcodec Name of the video codec in use
95 * container Name of the container format
96 * filesize The number of bytes, if known in advance
97 * filesize_approx An estimate for the number of bytes
98 * player_url SWF Player URL (used for rtmpdump).
99 * protocol The protocol that will be used for the actual
100 download, lower-case.
101 "http", "https", "rtsp", "rtmp", "rtmpe",
102 "m3u8", or "m3u8_native".
103 * preference Order number of this format. If this field is
104 present and not None, the formats get sorted
105 by this field, regardless of all other values.
106 -1 for default (order by other properties),
107 -2 or smaller for less than default.
108 < -1000 to hide the format (if there is
109 another one which is strictly better)
110 * language_preference Is this in the correct requested
112 10 if it's what the URL is about,
113 -1 for default (don't know),
114 -10 otherwise, other values reserved for now.
115 * quality Order number of the video quality of this
116 format, irrespective of the file format.
117 -1 for default (order by other properties),
118 -2 or smaller for less than default.
119 * source_preference Order number for this video source
120 (quality takes higher priority)
121 -1 for default (order by other properties),
122 -2 or smaller for less than default.
123 * http_headers A dictionary of additional HTTP headers
124 to add to the request.
125 * stretched_ratio If given and not 1, indicates that the
126 video's pixels are not square.
127 width : height ratio as float.
128 * no_resume The server does not support resuming the
129 (HTTP or RTMP) download. Boolean.
131 url: Final video URL.
132 ext: Video filename extension.
133 format: The video format, defaults to ext (used for --get-format)
134 player_url: SWF Player URL (used for rtmpdump).
136 The following fields are optional:
138 alt_title: A secondary title of the video.
139 display_id An alternative identifier for the video, not necessarily
140 unique, but available before title. Typically, id is
141 something like "4234987", title "Dancing naked mole rats",
142 and display_id "dancing-naked-mole-rats"
143 thumbnails: A list of dictionaries, with the following entries:
144 * "id" (optional, string) - Thumbnail format ID
146 * "preference" (optional, int) - quality of the image
147 * "width" (optional, int)
148 * "height" (optional, int)
149                        * "resolution" (optional, string "{width}x{height}",
151 thumbnail: Full URL to a video thumbnail image.
152 description: Full video description.
153 uploader: Full name of the video uploader.
154 creator: The main artist who created the video.
155 timestamp: UNIX timestamp of the moment the video became available.
156 upload_date: Video upload date (YYYYMMDD).
157 If not explicitly set, calculated from timestamp.
158 uploader_id: Nickname or id of the video uploader.
159 location: Physical location where the video was filmed.
160 subtitles: The available subtitles as a dictionary in the format
161 {language: subformats}. "subformats" is a list sorted from
162 lower to higher preference, each element is a dictionary
163 with the "ext" entry and one of:
164 * "data": The subtitles file contents
165 * "url": A URL pointing to the subtitles file
166 automatic_captions: Like 'subtitles', used by the YoutubeIE for
167 automatically generated captions
168 duration: Length of the video in seconds, as an integer.
169 view_count: How many users have watched the video on the platform.
170 like_count: Number of positive ratings of the video
171 dislike_count: Number of negative ratings of the video
172     average_rating: Average rating given by users, the scale used depends on the webpage
173 comment_count: Number of comments on the video
174 comments: A list of comments, each with one or more of the following
175 properties (all but one of text or html optional):
176 * "author" - human-readable name of the comment author
177 * "author_id" - user ID of the comment author
179 * "html" - Comment as HTML
180 * "text" - Plain text of the comment
181 * "timestamp" - UNIX timestamp of comment
182 * "parent" - ID of the comment this one is replying to.
183 Set to "root" to indicate that this is a
184 comment to the original video.
185 age_limit: Age restriction for the video, as an integer (years)
186 webpage_url: The URL to the video webpage, if given to youtube-dl it
187 should allow to get the same result again. (It will be set
188 by YoutubeDL if it's missing)
189 categories: A list of categories that the video falls in, for example
191 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
192 is_live: True, False, or None (=unknown). Whether this video is a
193 live stream that goes on instead of a fixed-length video.
194 start_time: Time in seconds where the reproduction should start, as
195 specified in the URL.
196 end_time: Time in seconds where the reproduction should end, as
197 specified in the URL.
199 Unless mentioned otherwise, the fields should be Unicode strings.
201 Unless mentioned otherwise, None is equivalent to absence of information.
204 _type "playlist" indicates multiple videos.
205 There must be a key "entries", which is a list, an iterable, or a PagedList
206 object, each element of which is a valid dictionary by this specification.
208 Additionally, playlists can have "title", "description" and "id" attributes
209 with the same semantics as videos (see above).
212 _type "multi_video" indicates that there are multiple videos that
213     form a single show, for example, multiple acts of an opera or TV episode.
214 It must have an entries key like a playlist and contain all the keys
215 required for a video at the same time.
218 _type "url" indicates that the video must be extracted from another
219 location, possibly by a different extractor. Its only required key is:
220 "url" - the next URL to extract.
221 The key "ie_key" can be set to the class name (minus the trailing "IE",
222 e.g. "Youtube") if the extractor class is known in advance.
223 Additionally, the dictionary may have any properties of the resolved entity
224 known in advance, for example "title" if the title of the referred video is
228 _type "url_transparent" entities have the same specification as "url", but
229 indicate that the given additional information is more precise than the one
230 associated with the resolved URL.
231 This is useful when a site employs a video service that hosts the video and
232 its technical metadata, but that video service does not embed a useful
233 title, description etc.
236 Subclasses of this one should re-define the _real_initialize() and
237 _real_extract() methods and define a _VALID_URL regexp.
238 Probably, they should also be added to the list of extractors.
240 Finally, the _WORKING attribute should be set to False for broken IEs
241 in order to warn the users and skip the tests.
248 def __init__(self, downloader=None):
249 """Constructor. Receives an optional downloader."""
251 self.set_downloader(downloader)
254 def suitable(cls, url):
255 """Receives a URL and returns True if suitable for this IE."""
257 # This does not use has/getattr intentionally - we want to know whether
258 # we have cached the regexp for *this* class, whereas getattr would also
259 # match the superclass
260 if '_VALID_URL_RE' not in cls.__dict__:
261 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
262 return cls._VALID_URL_RE.match(url) is not None
265 def _match_id(cls, url):
266 if '_VALID_URL_RE' not in cls.__dict__:
267 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
268 m = cls._VALID_URL_RE.match(url)
274 """Getter method for _WORKING."""
277 def initialize(self):
278 """Initializes an instance (authentication, etc)."""
280 self._real_initialize()
283 def extract(self, url):
284 """Extracts URL information and returns it in list of dicts."""
287 return self._real_extract(url)
288 except ExtractorError:
290 except compat_http_client.IncompleteRead as e:
291 raise ExtractorError('A network error has occured.', cause=e, expected=True)
292 except (KeyError, StopIteration) as e:
293 raise ExtractorError('An extractor error has occured.', cause=e)
295 def set_downloader(self, downloader):
296 """Sets the downloader for this IE."""
297 self._downloader = downloader
299 def _real_initialize(self):
300 """Real initialization process. Redefine in subclasses."""
303 def _real_extract(self, url):
304 """Real extraction process. Redefine in subclasses."""
309 """A string for getting the InfoExtractor with get_info_extractor"""
310 return cls.__name__[:-2]
314 return type(self).__name__[:-2]
316 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
317 """ Returns the response handle """
319 self.report_download_webpage(video_id)
320 elif note is not False:
322 self.to_screen('%s' % (note,))
324 self.to_screen('%s: %s' % (video_id, note))
326 return self._downloader.urlopen(url_or_request)
327 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
331 errnote = 'Unable to download webpage'
332 errmsg = '%s: %s' % (errnote, compat_str(err))
334 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
336 self._downloader.report_warning(errmsg)
339 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
340 """ Returns a tuple (page content as string, URL handle) """
341 # Strip hashes from the URL (#1038)
342 if isinstance(url_or_request, (compat_str, str)):
343 url_or_request = url_or_request.partition('#')[0]
345 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
349 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
350 return (content, urlh)
353 def _guess_encoding_from_content(content_type, webpage_bytes):
354 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
356 encoding = m.group(1)
358 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
359 webpage_bytes[:1024])
361 encoding = m.group(1).decode('ascii')
362 elif webpage_bytes.startswith(b'\xff\xfe'):
369 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
370 content_type = urlh.headers.get('Content-Type', '')
371 webpage_bytes = urlh.read()
372 if prefix is not None:
373 webpage_bytes = prefix + webpage_bytes
375 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
376 if self._downloader.params.get('dump_intermediate_pages', False):
378 url = url_or_request.get_full_url()
379 except AttributeError:
381 self.to_screen('Dumping request to ' + url)
382 dump = base64.b64encode(webpage_bytes).decode('ascii')
383 self._downloader.to_screen(dump)
384 if self._downloader.params.get('write_pages', False):
386 url = url_or_request.get_full_url()
387 except AttributeError:
389 basen = '%s_%s' % (video_id, url)
391 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
392 basen = basen[:240 - len(h)] + h
393 raw_filename = basen + '.dump'
394 filename = sanitize_filename(raw_filename, restricted=True)
395 self.to_screen('Saving request to ' + filename)
396 # Working around MAX_PATH limitation on Windows (see
397 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
399 absfilepath = os.path.abspath(filename)
400 if len(absfilepath) > 259:
401 filename = '\\\\?\\' + absfilepath
402 with open(filename, 'wb') as outf:
403 outf.write(webpage_bytes)
406 content = webpage_bytes.decode(encoding, 'replace')
408 content = webpage_bytes.decode('utf-8', 'replace')
410 if ('<title>Access to this site is blocked</title>' in content and
411 'Websense' in content[:512]):
412 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
413 blocked_iframe = self._html_search_regex(
414 r'<iframe src="([^"]+)"', content,
415 'Websense information URL', default=None)
417 msg += ' Visit %s for more details' % blocked_iframe
418 raise ExtractorError(msg, expected=True)
419 if '<title>The URL you requested has been blocked</title>' in content[:512]:
421 'Access to this webpage has been blocked by Indian censorship. '
422 'Use a VPN or proxy server (with --proxy) to route around it.')
423 block_msg = self._html_search_regex(
424 r'</h1><p>(.*?)</p>',
425 content, 'block message', default=None)
427 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
428 raise ExtractorError(msg, expected=True)
432 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
433 """ Returns the data of the page as a string """
436 while success is False:
438 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
440 except compat_http_client.IncompleteRead as e:
442 if try_count >= tries:
444 self._sleep(timeout, video_id)
451 def _download_xml(self, url_or_request, video_id,
452 note='Downloading XML', errnote='Unable to download XML',
453 transform_source=None, fatal=True, encoding=None):
454 """Return the xml as an xml.etree.ElementTree.Element"""
455 xml_string = self._download_webpage(
456 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
457 if xml_string is False:
460 xml_string = transform_source(xml_string)
461 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
463 def _download_json(self, url_or_request, video_id,
464 note='Downloading JSON metadata',
465 errnote='Unable to download JSON metadata',
466 transform_source=None,
467 fatal=True, encoding=None):
468 json_string = self._download_webpage(
469 url_or_request, video_id, note, errnote, fatal=fatal,
471 if (not fatal) and json_string is False:
473 return self._parse_json(
474 json_string, video_id, transform_source=transform_source, fatal=fatal)
476 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
478 json_string = transform_source(json_string)
480 return json.loads(json_string)
481 except ValueError as ve:
482 errmsg = '%s: Failed to parse JSON ' % video_id
484 raise ExtractorError(errmsg, cause=ve)
486 self.report_warning(errmsg + str(ve))
488 def report_warning(self, msg, video_id=None):
489 idstr = '' if video_id is None else '%s: ' % video_id
490 self._downloader.report_warning(
491 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
493 def to_screen(self, msg):
494 """Print msg to screen, prefixing it with '[ie_name]'"""
495 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
497 def report_extraction(self, id_or_name):
498 """Report information extraction."""
499 self.to_screen('%s: Extracting information' % id_or_name)
501 def report_download_webpage(self, video_id):
502 """Report webpage download."""
503 self.to_screen('%s: Downloading webpage' % video_id)
505 def report_age_confirmation(self):
506 """Report attempt to confirm age."""
507 self.to_screen('Confirming age')
509 def report_login(self):
510 """Report attempt to log in."""
511 self.to_screen('Logging in')
514 def raise_login_required(msg='This video is only available for registered users'):
515 raise ExtractorError(
516 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
520 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
521 raise ExtractorError(
522 '%s. You might want to use --proxy to workaround.' % msg,
525 # Methods for following #608
527 def url_result(url, ie=None, video_id=None, video_title=None):
528 """Returns a URL that points to a page that should be processed"""
529 # TODO: ie should be the class used for getting the info
530 video_info = {'_type': 'url',
533 if video_id is not None:
534 video_info['id'] = video_id
535 if video_title is not None:
536 video_info['title'] = video_title
540 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
541 """Returns a playlist"""
542 video_info = {'_type': 'playlist',
545 video_info['id'] = playlist_id
547 video_info['title'] = playlist_title
548 if playlist_description:
549 video_info['description'] = playlist_description
552 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
554 Perform a regex search on the given string, using a single or a list of
555 patterns returning the first matching group.
556 In case of failure return a default value or raise a WARNING or a
557 RegexNotFoundError, depending on fatal, specifying the field name.
559 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
560 mobj = re.search(pattern, string, flags)
563 mobj = re.search(p, string, flags)
567 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
568 _name = '\033[0;34m%s\033[0m' % name
574 # return the first matching group
575 return next(g for g in mobj.groups() if g is not None)
577 return mobj.group(group)
578 elif default is not NO_DEFAULT:
581 raise RegexNotFoundError('Unable to extract %s' % _name)
583 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
586 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
588 Like _search_regex, but strips HTML tags and unescapes entities.
590 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
592 return clean_html(res).strip()
596 def _get_login_info(self):
598 Get the login info as (username, password)
599 It will look in the netrc file using the _NETRC_MACHINE value
600 If there's no info available, return (None, None)
602 if self._downloader is None:
607 downloader_params = self._downloader.params
609 # Attempt to use provided username and password or .netrc data
610 if downloader_params.get('username', None) is not None:
611 username = downloader_params['username']
612 password = downloader_params['password']
613 elif downloader_params.get('usenetrc', False):
615 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
620 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
621 except (IOError, netrc.NetrcParseError) as err:
622 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
624 return (username, password)
626 def _get_tfa_info(self, note='two-factor verification code'):
628 Get the two-factor authentication info
629 TODO - asking the user will be required for sms/phone verify
630 currently just uses the command line option
631 If there's no info available, return None
633 if self._downloader is None:
635 downloader_params = self._downloader.params
637 if downloader_params.get('twofactor', None) is not None:
638 return downloader_params['twofactor']
640 return compat_getpass('Type %s and press [Return]: ' % note)
642 # Helper functions for extracting OpenGraph info
644 def _og_regexes(prop):
645 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
646 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
647 template = r'<meta[^>]+?%s[^>]+?%s'
649 template % (property_re, content_re),
650 template % (content_re, property_re),
654 def _meta_regex(prop):
655 return r'''(?isx)<meta
656 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
657 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
659 def _og_search_property(self, prop, html, name=None, **kargs):
661 name = 'OpenGraph %s' % prop
662 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
665 return unescapeHTML(escaped)
667 def _og_search_thumbnail(self, html, **kargs):
668 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
670 def _og_search_description(self, html, **kargs):
671 return self._og_search_property('description', html, fatal=False, **kargs)
673 def _og_search_title(self, html, **kargs):
674 return self._og_search_property('title', html, **kargs)
676 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
677 regexes = self._og_regexes('video') + self._og_regexes('video:url')
679 regexes = self._og_regexes('video:secure_url') + regexes
680 return self._html_search_regex(regexes, html, name, **kargs)
682 def _og_search_url(self, html, **kargs):
683 return self._og_search_property('url', html, **kargs)
685 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
686 if display_name is None:
688 return self._html_search_regex(
689 self._meta_regex(name),
690 html, display_name, fatal=fatal, group='content', **kwargs)
692 def _dc_search_uploader(self, html):
693 return self._html_search_meta('dc.creator', html, 'uploader')
695 def _rta_search(self, html):
696 # See http://www.rtalabel.org/index.php?content=howtofaq#single
697 if re.search(r'(?ix)<meta\s+name="rating"\s+'
698 r' content="RTA-5042-1996-1400-1577-RTA"',
703 def _media_rating_search(self, html):
704 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
705 rating = self._html_search_meta('rating', html)
717 return RATING_TABLE.get(rating.lower(), None)
719 def _family_friendly_search(self, html):
720 # See http://schema.org/VideoObject
721 family_friendly = self._html_search_meta('isFamilyFriendly', html)
723 if not family_friendly:
732 return RATING_TABLE.get(family_friendly.lower(), None)
734 def _twitter_search_player(self, html):
735 return self._html_search_meta('twitter:player', html,
736 'twitter card player')
739 def _hidden_inputs(html):
740 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
742 for input in re.findall(r'(?i)<input([^>]+)>', html):
743 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
745 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
748 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
751 hidden_inputs[name.group('value')] = value.group('value')
754 def _form_hidden_inputs(self, form_id, html):
755 form = self._search_regex(
756 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
757 html, '%s form' % form_id, group='form')
758 return self._hidden_inputs(form)
760 def _sort_formats(self, formats, field_preference=None):
762 raise ExtractorError('No video formats found')
765 # TODO remove the following workaround
766 from ..utils import determine_ext
767 if not f.get('ext') and 'url' in f:
768 f['ext'] = determine_ext(f['url'])
770 if isinstance(field_preference, (list, tuple)):
771 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
773 preference = f.get('preference')
774 if preference is None:
775 proto = f.get('protocol')
777 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
779 preference = 0 if proto in ['http', 'https'] else -0.1
780 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
783 if f.get('vcodec') == 'none': # audio only
784 if self._downloader.params.get('prefer_free_formats'):
785 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
787 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
790 audio_ext_preference = ORDER.index(f['ext'])
792 audio_ext_preference = -1
794 if self._downloader.params.get('prefer_free_formats'):
795 ORDER = ['flv', 'mp4', 'webm']
797 ORDER = ['webm', 'flv', 'mp4']
799 ext_preference = ORDER.index(f['ext'])
802 audio_ext_preference = 0
806 f.get('language_preference') if f.get('language_preference') is not None else -1,
807 f.get('quality') if f.get('quality') is not None else -1,
808 f.get('tbr') if f.get('tbr') is not None else -1,
809 f.get('filesize') if f.get('filesize') is not None else -1,
810 f.get('vbr') if f.get('vbr') is not None else -1,
811 f.get('height') if f.get('height') is not None else -1,
812 f.get('width') if f.get('width') is not None else -1,
814 f.get('abr') if f.get('abr') is not None else -1,
815 audio_ext_preference,
816 f.get('fps') if f.get('fps') is not None else -1,
817 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
818 f.get('source_preference') if f.get('source_preference') is not None else -1,
819 f.get('format_id') if f.get('format_id') is not None else '',
821 formats.sort(key=_formats_key)
823 def _check_formats(self, formats, video_id):
826 lambda f: self._is_valid_url(
828 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
831 def _is_valid_url(self, url, video_id, item='video'):
832 url = self._proto_relative_url(url, scheme='http:')
833 # For now assume non HTTP(S) URLs always valid
834 if not (url.startswith('http://') or url.startswith('https://')):
837 self._request_webpage(url, video_id, 'Checking %s URL' % item)
839 except ExtractorError as e:
840 if isinstance(e.cause, compat_HTTPError):
842 '%s: %s URL is invalid, skipping' % (video_id, item))
846 def http_scheme(self):
847 """ Either "http:" or "https:", depending on the user's preferences """
850 if self._downloader.params.get('prefer_insecure', False)
853 def _proto_relative_url(self, url, scheme=None):
856 if url.startswith('//'):
858 scheme = self.http_scheme()
863 def _sleep(self, timeout, video_id, msg_template=None):
864 if msg_template is None:
865 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
866 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
870 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
871 transform_source=lambda s: fix_xml_ampersands(s).strip()):
872 manifest = self._download_xml(
873 manifest_url, video_id, 'Downloading f4m manifest',
874 'Unable to download f4m manifest',
875 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
876 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
877 transform_source=transform_source)
880 manifest_version = '1.0'
881 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
883 manifest_version = '2.0'
884 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
885 for i, media_el in enumerate(media_nodes):
886 if manifest_version == '2.0':
887 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
891 media_url if media_url.startswith('http://') or media_url.startswith('https://')
892 else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
893 # If media_url is itself a f4m manifest do the recursive extraction
894 # since bitrates in parent manifest (this one) and media_url manifest
895 # may differ leading to inability to resolve the format by requested
896 # bitrate in f4m downloader
897 if determine_ext(manifest_url) == 'f4m':
898 formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
900 tbr = int_or_none(media_el.attrib.get('bitrate'))
902 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
906 'width': int_or_none(media_el.attrib.get('width')),
907 'height': int_or_none(media_el.attrib.get('height')),
908 'preference': preference,
910 self._sort_formats(formats)
914 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
915 entry_protocol='m3u8', preference=None,
916 m3u8_id=None, note=None, errnote=None,
920 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
924 'preference': preference - 1 if preference else -1,
925 'resolution': 'multiple',
926 'format_note': 'Quality selection URL',
929 format_url = lambda u: (
931 if re.match(r'^https?://', u)
932 else compat_urlparse.urljoin(m3u8_url, u))
934 m3u8_doc = self._download_webpage(
936 note=note or 'Downloading m3u8 information',
937 errnote=errnote or 'Failed to download m3u8 information',
939 if m3u8_doc is False:
944 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
945 for line in m3u8_doc.splitlines():
946 if line.startswith('#EXT-X-STREAM-INF:'):
948 for m in kv_rex.finditer(line):
950 if v.startswith('"'):
952 last_info[m.group('key')] = v
953 elif line.startswith('#EXT-X-MEDIA:'):
955 for m in kv_rex.finditer(line):
957 if v.startswith('"'):
959 last_media[m.group('key')] = v
960 elif line.startswith('#') or not line.strip():
963 if last_info is None:
964 formats.append({'url': format_url(line)})
966 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
969 format_id.append(m3u8_id)
970 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
971 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
973 'format_id': '-'.join(format_id),
974 'url': format_url(line.strip()),
977 'protocol': entry_protocol,
978 'preference': preference,
980 codecs = last_info.get('CODECS')
982 # TODO: looks like video codec is not always necessarily goes first
983 va_codecs = codecs.split(',')
985 f['vcodec'] = va_codecs[0].partition('.')[0]
986 if len(va_codecs) > 1 and va_codecs[1]:
987 f['acodec'] = va_codecs[1].partition('.')[0]
988 resolution = last_info.get('RESOLUTION')
990 width_str, height_str = resolution.split('x')
991 f['width'] = int(width_str)
992 f['height'] = int(height_str)
993 if last_media is not None:
994 f['m3u8_media'] = last_media
998 self._sort_formats(formats)
1002 def _xpath_ns(path, namespace=None):
1006 for c in path.split('/'):
1007 if not c or c == '.':
1010 out.append('{%s}%s' % (namespace, c))
1011 return '/'.join(out)
1013 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1014 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1020 namespace = self._parse_smil_namespace(smil)
1022 return self._parse_smil_formats(
1023 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1025 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1026 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1029 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1031 def _download_smil(self, smil_url, video_id, fatal=True):
1032 return self._download_xml(
1033 smil_url, video_id, 'Downloading SMIL file',
1034 'Unable to download SMIL file', fatal=fatal)
1036 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1037 namespace = self._parse_smil_namespace(smil)
1039 formats = self._parse_smil_formats(
1040 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1041 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1043 video_id = os.path.splitext(url_basename(smil_url))[0]
1046 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1047 name = meta.attrib.get('name')
1048 content = meta.attrib.get('content')
1049 if not name or not content:
1051 if not title and name == 'title':
1053 elif not description and name in ('description', 'abstract'):
1054 description = content
1058 'title': title or video_id,
1059 'description': description,
1061 'subtitles': subtitles,
1064 def _parse_smil_namespace(self, smil):
1065 return self._search_regex(
1066 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1068 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1070 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1071 b = meta.get('base') or meta.get('httpBase')
1080 videos = smil.findall(self._xpath_ns('.//video', namespace))
1081 for video in videos:
1082 src = video.get('src')
1086 bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1087 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1088 width = int_or_none(video.get('width'))
1089 height = int_or_none(video.get('height'))
1090 proto = video.get('proto')
1091 ext = video.get('ext')
1092 src_ext = determine_ext(src)
1093 streamer = video.get('streamer') or base
1095 if proto == 'rtmp' or streamer.startswith('rtmp'):
1101 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1103 'filesize': filesize,
1107 if transform_rtmp_url:
1108 streamer, src = transform_rtmp_url(streamer, src)
1109 formats[-1].update({
1115 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1117 if proto == 'm3u8' or src_ext == 'm3u8':
1118 formats.extend(self._extract_m3u8_formats(
1119 src_url, video_id, ext or 'mp4', m3u8_id='hls'))
1122 if src_ext == 'f4m':
1127 'plugin': 'flowplayer-3.2.0.1',
1129 f4m_url += '&' if '?' in f4m_url else '?'
1130 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1131 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
1134 if src_url.startswith('http'):
1138 'ext': ext or src_ext or 'flv',
1139 'format_id': 'http-%d' % (bitrate or http_count),
1141 'filesize': filesize,
1147 self._sort_formats(formats)
1151 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1153 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1154 src = textstream.get('src')
1157 ext = textstream.get('ext') or determine_ext(src)
1159 type_ = textstream.get('type')
1163 'application/smptett+xml': 'tt',
1165 if type_ in SUBTITLES_TYPES:
1166 ext = SUBTITLES_TYPES[type_]
1167 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1168 subtitles.setdefault(lang, []).append({
1174 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1175 xspf = self._download_xml(
1176 playlist_url, playlist_id, 'Downloading xpsf playlist',
1177 'Unable to download xspf manifest', fatal=fatal)
1180 return self._parse_xspf(xspf, playlist_id)
1182 def _parse_xspf(self, playlist, playlist_id):
1184 'xspf': 'http://xspf.org/ns/0/',
1185 's1': 'http://static.streamone.nl/player/ns/0',
1189 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1191 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1192 description = xpath_text(
1193 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1194 thumbnail = xpath_text(
1195 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1196 duration = float_or_none(
1197 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1200 'url': location.text,
1201 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1202 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1203 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1204 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1205 self._sort_formats(formats)
1210 'description': description,
1211 'thumbnail': thumbnail,
1212 'duration': duration,
1217 def _live_title(self, name):
1218 """ Generate the title for a live video """
1219 now = datetime.datetime.now()
1220 now_str = now.strftime("%Y-%m-%d %H:%M")
1221 return name + ' ' + now_str
1223 def _int(self, v, name, fatal=False, **kwargs):
1224 res = int_or_none(v, **kwargs)
1225 if 'get_attr' in kwargs:
1226 print(getattr(v, kwargs['get_attr']))
1228 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1230 raise ExtractorError(msg)
1232 self._downloader.report_warning(msg)
1235 def _float(self, v, name, fatal=False, **kwargs):
1236 res = float_or_none(v, **kwargs)
1238 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1240 raise ExtractorError(msg)
1242 self._downloader.report_warning(msg)
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Inject a version-0 cookie for `domain` into the downloader's
        # cookie jar so that subsequent requests send it.
        # Positional Cookie args: version, name, value, port,
        # port_specified, domain, domain_specified, domain_initial_dot,
        # path ('/'), path_specified (True), secure (False),
        # expires (expire_time; None = session cookie), discard,
        # comment, comment_url, rest.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
1251 def _get_cookies(self, url):
1252 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1253 req = compat_urllib_request.Request(url)
1254 self._downloader.cookiejar.add_cookie_header(req)
1255 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1257 def get_testcases(self, include_onlymatching=False):
1258 t = getattr(self, '_TEST', None)
1260 assert not hasattr(self, '_TESTS'), \
1261 '%s has _TEST and _TESTS' % type(self).__name__
1264 tests = getattr(self, '_TESTS', [])
1266 if not include_onlymatching and t.get('only_matching', False):
1268 t['name'] = type(self).__name__[:-len('IE')]
1271 def is_suitable(self, age_limit):
1272 """ Test whether the extractor is generally suitable for the given
1273 age limit (i.e. pornographic sites are not, all others usually are) """
1275 any_restricted = False
1276 for tc in self.get_testcases(include_onlymatching=False):
1277 if 'playlist' in tc:
1278 tc = tc['playlist'][0]
1279 is_restricted = age_restricted(
1280 tc.get('info_dict', {}).get('age_limit'), age_limit)
1281 if not is_restricted:
1283 any_restricted = any_restricted or is_restricted
1284 return not any_restricted
1286 def extract_subtitles(self, *args, **kwargs):
1287 if (self._downloader.params.get('writesubtitles', False) or
1288 self._downloader.params.get('listsubtitles')):
1289 return self._get_subtitles(*args, **kwargs)
    def _get_subtitles(self, *args, **kwargs):
        # Override point: subclasses implement the actual subtitle
        # extraction here; extract_subtitles() gates the call on user options.
        raise NotImplementedError("This method must be implemented by subclasses")
1296 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1297 """ Merge subtitle items for one language. Items with duplicated URLs
1298 will be dropped. """
1299 list1_urls = set([item['url'] for item in subtitle_list1])
1300 ret = list(subtitle_list1)
1301 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1305 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1306 """ Merge two subtitle dictionaries, language by language. """
1307 ret = dict(subtitle_dict1)
1308 for lang in subtitle_dict2:
1309 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1312 def extract_automatic_captions(self, *args, **kwargs):
1313 if (self._downloader.params.get('writeautomaticsub', False) or
1314 self._downloader.params.get('listsubtitles')):
1315 return self._get_automatic_captions(*args, **kwargs)
    def _get_automatic_captions(self, *args, **kwargs):
        # Override point: subclasses implement automatic-caption extraction;
        # extract_automatic_captions() gates the call on user options.
        raise NotImplementedError("This method must be implemented by subclasses")
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (first result), 'all', or a positive number.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare search key: return just the top result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum, but tell the user.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY