2 from __future__ import unicode_literals
17 from ..compat import (
21 compat_etree_fromstring,
28 compat_urllib_parse_unquote,
29 compat_urllib_parse_urlencode,
30 compat_urllib_request,
32 compat_xml_parse_error,
34 from ..downloader.f4m import (
36 remove_encrypted_media,
62 parse_m3u8_attributes,
80 class InfoExtractor(object):
81 """Information Extractor class.
83 Information extractors are the classes that, given a URL, extract
84 information about the video (or videos) the URL refers to. This
85 information includes the real video URL, the video title, author and
86 others. The information is stored in a dictionary which is then
87 passed to the YoutubeDL. The YoutubeDL processes this
88 information possibly downloading the video to the file system, among
89 other possible outcomes.
91 The type field determines the type of the result.
92 By far the most common value (and the default if _type is missing) is
93 "video", which indicates a single video.
95 For a video, the dictionaries must include the following fields:
98 title: Video title, unescaped.
100 Additionally, it must contain either a formats entry or a url one:
102 formats: A list of dictionaries for each format available, ordered
103 from worst to best quality.
106 * url The mandatory URL representing the media:
107 for plain file media - HTTP URL of this file,
109 for HLS - URL of the M3U8 media playlist,
110 for HDS - URL of the F4M manifest,
112 - HTTP URL to plain file media (in case of
114 - URL of the MPD manifest or base URL
115 representing the media if MPD manifest
                                     is parsed from a string (in case of
118 for MSS - URL of the ISM manifest.
120 The URL of the manifest file in case of
122 for HLS - URL of the M3U8 master playlist,
123 for HDS - URL of the F4M manifest,
124 for DASH - URL of the MPD manifest,
125 for MSS - URL of the ISM manifest.
126 * ext Will be calculated from URL if missing
127 * format A human-readable description of the format
128 ("mp4 container with h264/opus").
129 Calculated from the format_id, width, height.
130 and format_note fields if missing.
131 * format_id A short description of the format
132 ("mp4_h264_opus" or "19").
133 Technically optional, but strongly recommended.
134 * format_note Additional info about the format
135 ("3D" or "DASH video")
136 * width Width of the video, if known
137 * height Height of the video, if known
138 * resolution Textual description of width and height
139 * tbr Average bitrate of audio and video in KBit/s
140 * abr Average audio bitrate in KBit/s
141 * acodec Name of the audio codec in use
142 * asr Audio sampling rate in Hertz
143 * vbr Average video bitrate in KBit/s
145 * vcodec Name of the video codec in use
146 * container Name of the container format
147 * filesize The number of bytes, if known in advance
148 * filesize_approx An estimate for the number of bytes
149 * player_url SWF Player URL (used for rtmpdump).
150 * protocol The protocol that will be used for the actual
151 download, lower-case.
152 "http", "https", "rtsp", "rtmp", "rtmpe",
153 "m3u8", "m3u8_native" or "http_dash_segments".
155 Base URL for fragments. Each fragment's path
156 value (if present) will be relative to
158 * fragments A list of fragments of a fragmented media.
159 Each fragment entry must contain either an url
160 or a path. If an url is present it should be
161 considered by a client. Otherwise both path and
162 fragment_base_url must be present. Here is
163 the list of all potential fields:
164 * "url" - fragment's URL
165 * "path" - fragment's path relative to
167 * "duration" (optional, int or float)
168 * "filesize" (optional, int)
169 * preference Order number of this format. If this field is
170 present and not None, the formats get sorted
171 by this field, regardless of all other values.
172 -1 for default (order by other properties),
173 -2 or smaller for less than default.
174 < -1000 to hide the format (if there is
175 another one which is strictly better)
176 * language Language code, e.g. "de" or "en-US".
177 * language_preference Is this in the language mentioned in
179 10 if it's what the URL is about,
180 -1 for default (don't know),
181 -10 otherwise, other values reserved for now.
182 * quality Order number of the video quality of this
183 format, irrespective of the file format.
184 -1 for default (order by other properties),
185 -2 or smaller for less than default.
186 * source_preference Order number for this video source
187 (quality takes higher priority)
188 -1 for default (order by other properties),
189 -2 or smaller for less than default.
190 * http_headers A dictionary of additional HTTP headers
191 to add to the request.
192 * stretched_ratio If given and not 1, indicates that the
193 video's pixels are not square.
194 width : height ratio as float.
195 * no_resume The server does not support resuming the
196 (HTTP or RTMP) download. Boolean.
197 * downloader_options A dictionary of downloader options as
198 described in FileDownloader
200 url: Final video URL.
201 ext: Video filename extension.
202 format: The video format, defaults to ext (used for --get-format)
203 player_url: SWF Player URL (used for rtmpdump).
205 The following fields are optional:
207 alt_title: A secondary title of the video.
208 display_id An alternative identifier for the video, not necessarily
209 unique, but available before title. Typically, id is
210 something like "4234987", title "Dancing naked mole rats",
211 and display_id "dancing-naked-mole-rats"
212 thumbnails: A list of dictionaries, with the following entries:
213 * "id" (optional, string) - Thumbnail format ID
215 * "preference" (optional, int) - quality of the image
216 * "width" (optional, int)
217 * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
220 * "filesize" (optional, int)
221 thumbnail: Full URL to a video thumbnail image.
222 description: Full video description.
223 uploader: Full name of the video uploader.
224 license: License name the video is licensed under.
225 creator: The creator of the video.
226 release_date: The date (YYYYMMDD) when the video was released.
227 timestamp: UNIX timestamp of the moment the video became available.
228 upload_date: Video upload date (YYYYMMDD).
229 If not explicitly set, calculated from timestamp.
230 uploader_id: Nickname or id of the video uploader.
231 uploader_url: Full URL to a personal webpage of the video uploader.
232 channel: Full name of the channel the video is uploaded on.
233 Note that channel fields may or may not repeat uploader
234 fields. This depends on a particular extractor.
235 channel_id: Id of the channel.
236 channel_url: Full URL to a channel webpage.
237 location: Physical location where the video was filmed.
238 subtitles: The available subtitles as a dictionary in the format
239 {tag: subformats}. "tag" is usually a language code, and
240 "subformats" is a list sorted from lower to higher
241 preference, each element is a dictionary with the "ext"
243 * "data": The subtitles file contents
244 * "url": A URL pointing to the subtitles file
245 "ext" will be calculated from URL if missing
246 automatic_captions: Like 'subtitles', used by the YoutubeIE for
247 automatically generated captions
248 duration: Length of the video in seconds, as an integer or float.
249 view_count: How many users have watched the video on the platform.
250 like_count: Number of positive ratings of the video
251 dislike_count: Number of negative ratings of the video
252 repost_count: Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
254 comment_count: Number of comments on the video
255 comments: A list of comments, each with one or more of the following
256 properties (all but one of text or html optional):
257 * "author" - human-readable name of the comment author
258 * "author_id" - user ID of the comment author
260 * "html" - Comment as HTML
261 * "text" - Plain text of the comment
262 * "timestamp" - UNIX timestamp of comment
263 * "parent" - ID of the comment this one is replying to.
264 Set to "root" to indicate that this is a
265 comment to the original video.
266 age_limit: Age restriction for the video, as an integer (years)
267 webpage_url: The URL to the video webpage, if given to youtube-dl it
268 should allow to get the same result again. (It will be set
269 by YoutubeDL if it's missing)
270 categories: A list of categories that the video falls in, for example
272 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
273 is_live: True, False, or None (=unknown). Whether this video is a
274 live stream that goes on instead of a fixed-length video.
275 start_time: Time in seconds where the reproduction should start, as
276 specified in the URL.
277 end_time: Time in seconds where the reproduction should end, as
278 specified in the URL.
279 chapters: A list of dictionaries, with the following entries:
280 * "start_time" - The start time of the chapter in seconds
281 * "end_time" - The end time of the chapter in seconds
282 * "title" (optional, string)
284 The following fields should only be used when the video belongs to some logical
287 chapter: Name or title of the chapter the video belongs to.
288 chapter_number: Number of the chapter the video belongs to, as an integer.
289 chapter_id: Id of the chapter the video belongs to, as a unicode string.
291 The following fields should only be used when the video is an episode of some
292 series, programme or podcast:
294 series: Title of the series or programme the video episode belongs to.
295 season: Title of the season the video episode belongs to.
296 season_number: Number of the season the video episode belongs to, as an integer.
297 season_id: Id of the season the video episode belongs to, as a unicode string.
298 episode: Title of the video episode. Unlike mandatory video title field,
299 this field should denote the exact title of the video episode
300 without any kind of decoration.
301 episode_number: Number of the video episode within a season, as an integer.
302 episode_id: Id of the video episode, as a unicode string.
304 The following fields should only be used when the media is a track or a part of
307 track: Title of the track.
308 track_number: Number of the track within an album or a disc, as an integer.
309 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
311 artist: Artist(s) of the track.
312 genre: Genre(s) of the track.
313 album: Title of the album the track belongs to.
314 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
315 album_artist: List of all artists appeared on the album (e.g.
316 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
318 disc_number: Number of the disc or other physical medium the track belongs to,
320 release_year: Year (YYYY) when the album was released.
322 Unless mentioned otherwise, the fields should be Unicode strings.
324 Unless mentioned otherwise, None is equivalent to absence of information.
327 _type "playlist" indicates multiple videos.
328 There must be a key "entries", which is a list, an iterable, or a PagedList
329 object, each element of which is a valid dictionary by this specification.
331 Additionally, playlists can have "id", "title", "description", "uploader",
332 "uploader_id", "uploader_url" attributes with the same semantics as videos
336 _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
338 It must have an entries key like a playlist and contain all the keys
339 required for a video at the same time.
342 _type "url" indicates that the video must be extracted from another
343 location, possibly by a different extractor. Its only required key is:
344 "url" - the next URL to extract.
345 The key "ie_key" can be set to the class name (minus the trailing "IE",
346 e.g. "Youtube") if the extractor class is known in advance.
347 Additionally, the dictionary may have any properties of the resolved entity
348 known in advance, for example "title" if the title of the referred video is
352 _type "url_transparent" entities have the same specification as "url", but
353 indicate that the given additional information is more precise than the one
354 associated with the resolved URL.
355 This is useful when a site employs a video service that hosts the video and
356 its technical metadata, but that video service does not embed a useful
357 title, description etc.
360 Subclasses of this one should re-define the _real_initialize() and
361 _real_extract() methods and define a _VALID_URL regexp.
362 Probably, they should also be added to the list of extractors.
364 _GEO_BYPASS attribute may be set to False in order to disable
365 geo restriction bypass mechanisms for a particular extractor.
366 Though it won't disable explicit geo restriction bypass based on
367 country code provided with geo_bypass_country.
369 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
370 countries for this extractor. One of these countries will be used by
371 geo restriction bypass mechanism right away in order to bypass
372 geo restriction, of course, if the mechanism is not disabled.
374 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
375 IP blocks in CIDR notation for this extractor. One of these IP blocks
376 will be used by geo restriction bypass mechanism similarly
379 Finally, the _WORKING attribute should be set to False for broken IEs
380 in order to warn the users and skip the tests.
385 _x_forwarded_for_ip = None
387 _GEO_COUNTRIES = None
388 _GEO_IP_BLOCKS = None
391 def __init__(self, downloader=None):
392 """Constructor. Receives an optional downloader."""
394 self._x_forwarded_for_ip = None
395 self.set_downloader(downloader)
398 def suitable(cls, url):
399 """Receives a URL and returns True if suitable for this IE."""
401 # This does not use has/getattr intentionally - we want to know whether
402 # we have cached the regexp for *this* class, whereas getattr would also
403 # match the superclass
404 if '_VALID_URL_RE' not in cls.__dict__:
405 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
406 return cls._VALID_URL_RE.match(url) is not None
409 def _match_id(cls, url):
410 if '_VALID_URL_RE' not in cls.__dict__:
411 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
412 m = cls._VALID_URL_RE.match(url)
414 return compat_str(m.group('id'))
418 """Getter method for _WORKING."""
421 def initialize(self):
422 """Initializes an instance (authentication, etc)."""
423 self._initialize_geo_bypass({
424 'countries': self._GEO_COUNTRIES,
425 'ip_blocks': self._GEO_IP_BLOCKS,
428 self._real_initialize()
431 def _initialize_geo_bypass(self, geo_bypass_context):
433 Initialize geo restriction bypass mechanism.
435 This method is used to initialize geo bypass mechanism based on faking
436 X-Forwarded-For HTTP header. A random country from provided country list
437 is selected and a random IP belonging to this country is generated. This
438 IP will be passed as X-Forwarded-For HTTP header in all subsequent
441 This method will be used for initial geo bypass mechanism initialization
442 during the instance initialization with _GEO_COUNTRIES and
445 You may also manually call it from extractor's code if geo bypass
446 information is not available beforehand (e.g. obtained during
447 extraction) or due to some other reason. In this case you should pass
448 this information in geo bypass context passed as first argument. It may
449 contain following fields:
451 countries: List of geo unrestricted countries (similar
453 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
454 (similar to _GEO_IP_BLOCKS)
457 if not self._x_forwarded_for_ip:
459 # Geo bypass mechanism is explicitly disabled by user
460 if not self._downloader.params.get('geo_bypass', True):
463 if not geo_bypass_context:
464 geo_bypass_context = {}
466 # Backward compatibility: previously _initialize_geo_bypass
467 # expected a list of countries, some 3rd party code may still use
469 if isinstance(geo_bypass_context, (list, tuple)):
470 geo_bypass_context = {
471 'countries': geo_bypass_context,
474 # The whole point of geo bypass mechanism is to fake IP
475 # as X-Forwarded-For HTTP header based on some IP block or
478 # Path 1: bypassing based on IP block in CIDR notation
480 # Explicit IP block specified by user, use it right away
481 # regardless of whether extractor is geo bypassable or not
482 ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
484 # Otherwise use random IP block from geo bypass context but only
485 # if extractor is known as geo bypassable
487 ip_blocks = geo_bypass_context.get('ip_blocks')
488 if self._GEO_BYPASS and ip_blocks:
489 ip_block = random.choice(ip_blocks)
492 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
493 if self._downloader.params.get('verbose', False):
494 self._downloader.to_screen(
495 '[debug] Using fake IP %s as X-Forwarded-For.'
496 % self._x_forwarded_for_ip)
499 # Path 2: bypassing based on country code
501 # Explicit country code specified by user, use it right away
502 # regardless of whether extractor is geo bypassable or not
503 country = self._downloader.params.get('geo_bypass_country', None)
505 # Otherwise use random country code from geo bypass context but
506 # only if extractor is known as geo bypassable
508 countries = geo_bypass_context.get('countries')
509 if self._GEO_BYPASS and countries:
510 country = random.choice(countries)
513 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
514 if self._downloader.params.get('verbose', False):
515 self._downloader.to_screen(
516 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
517 % (self._x_forwarded_for_ip, country.upper()))
519 def extract(self, url):
520 """Extracts URL information and returns it in list of dicts."""
525 ie_result = self._real_extract(url)
526 if self._x_forwarded_for_ip:
527 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
529 except GeoRestrictedError as e:
530 if self.__maybe_fake_ip_and_retry(e.countries):
533 except ExtractorError:
535 except compat_http_client.IncompleteRead as e:
536 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
537 except (KeyError, StopIteration) as e:
538 raise ExtractorError('An extractor error has occurred.', cause=e)
540 def __maybe_fake_ip_and_retry(self, countries):
541 if (not self._downloader.params.get('geo_bypass_country', None) and
543 self._downloader.params.get('geo_bypass', True) and
544 not self._x_forwarded_for_ip and
546 country_code = random.choice(countries)
547 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
548 if self._x_forwarded_for_ip:
550 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
551 % (self._x_forwarded_for_ip, country_code.upper()))
555 def set_downloader(self, downloader):
556 """Sets the downloader for this IE."""
557 self._downloader = downloader
559 def _real_initialize(self):
560 """Real initialization process. Redefine in subclasses."""
563 def _real_extract(self, url):
564 """Real extraction process. Redefine in subclasses."""
569 """A string for getting the InfoExtractor with get_info_extractor"""
570 return compat_str(cls.__name__[:-2])
574 return compat_str(type(self).__name__[:-2])
577 def __can_accept_status_code(err, expected_status):
578 assert isinstance(err, compat_urllib_error.HTTPError)
579 if expected_status is None:
581 if isinstance(expected_status, compat_integer_types):
582 return err.code == expected_status
583 elif isinstance(expected_status, (list, tuple)):
584 return err.code in expected_status
585 elif callable(expected_status):
586 return expected_status(err.code) is True
590 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
592 Return the response handle.
594 See _download_webpage docstring for arguments specification.
597 self.report_download_webpage(video_id)
598 elif note is not False:
600 self.to_screen('%s' % (note,))
602 self.to_screen('%s: %s' % (video_id, note))
604 # Some sites check X-Forwarded-For HTTP header in order to figure out
605 # the origin of the client behind proxy. This allows bypassing geo
606 # restriction by faking this header's value to IP that belongs to some
607 # geo unrestricted country. We will do so once we encounter any
608 # geo restriction error.
609 if self._x_forwarded_for_ip:
610 if 'X-Forwarded-For' not in headers:
611 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
613 if isinstance(url_or_request, compat_urllib_request.Request):
614 url_or_request = update_Request(
615 url_or_request, data=data, headers=headers, query=query)
618 url_or_request = update_url_query(url_or_request, query)
619 if data is not None or headers:
620 url_or_request = sanitized_Request(url_or_request, data, headers)
622 return self._downloader.urlopen(url_or_request)
623 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
624 if isinstance(err, compat_urllib_error.HTTPError):
625 if self.__can_accept_status_code(err, expected_status):
626 # Retain reference to error to prevent file object from
627 # being closed before it can be read. Works around the
628 # effects of <https://bugs.python.org/issue15002>
629 # introduced in Python 3.4.1.
636 errnote = 'Unable to download webpage'
638 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
640 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
642 self._downloader.report_warning(errmsg)
645 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
647 Return a tuple (page content as string, URL handle).
649 See _download_webpage docstring for arguments specification.
651 # Strip hashes from the URL (#1038)
652 if isinstance(url_or_request, (compat_str, str)):
653 url_or_request = url_or_request.partition('#')[0]
655 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
659 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
660 return (content, urlh)
663 def _guess_encoding_from_content(content_type, webpage_bytes):
664 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
666 encoding = m.group(1)
668 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
669 webpage_bytes[:1024])
671 encoding = m.group(1).decode('ascii')
672 elif webpage_bytes.startswith(b'\xff\xfe'):
679 def __check_blocked(self, content):
680 first_block = content[:512]
681 if ('<title>Access to this site is blocked</title>' in content and
682 'Websense' in first_block):
683 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
684 blocked_iframe = self._html_search_regex(
685 r'<iframe src="([^"]+)"', content,
686 'Websense information URL', default=None)
688 msg += ' Visit %s for more details' % blocked_iframe
689 raise ExtractorError(msg, expected=True)
690 if '<title>The URL you requested has been blocked</title>' in first_block:
692 'Access to this webpage has been blocked by Indian censorship. '
693 'Use a VPN or proxy server (with --proxy) to route around it.')
694 block_msg = self._html_search_regex(
695 r'</h1><p>(.*?)</p>',
696 content, 'block message', default=None)
698 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
699 raise ExtractorError(msg, expected=True)
700 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
701 'blocklist.rkn.gov.ru' in content):
702 raise ExtractorError(
703 'Access to this webpage has been blocked by decision of the Russian government. '
704 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
707 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
708 content_type = urlh.headers.get('Content-Type', '')
709 webpage_bytes = urlh.read()
710 if prefix is not None:
711 webpage_bytes = prefix + webpage_bytes
713 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
714 if self._downloader.params.get('dump_intermediate_pages', False):
715 self.to_screen('Dumping request to ' + urlh.geturl())
716 dump = base64.b64encode(webpage_bytes).decode('ascii')
717 self._downloader.to_screen(dump)
718 if self._downloader.params.get('write_pages', False):
719 basen = '%s_%s' % (video_id, urlh.geturl())
721 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
722 basen = basen[:240 - len(h)] + h
723 raw_filename = basen + '.dump'
724 filename = sanitize_filename(raw_filename, restricted=True)
725 self.to_screen('Saving request to ' + filename)
726 # Working around MAX_PATH limitation on Windows (see
727 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
728 if compat_os_name == 'nt':
729 absfilepath = os.path.abspath(filename)
730 if len(absfilepath) > 259:
731 filename = '\\\\?\\' + absfilepath
732 with open(filename, 'wb') as outf:
733 outf.write(webpage_bytes)
736 content = webpage_bytes.decode(encoding, 'replace')
738 content = webpage_bytes.decode('utf-8', 'replace')
740 self.__check_blocked(content)
744 def _download_webpage(
745 self, url_or_request, video_id, note=None, errnote=None,
746 fatal=True, tries=1, timeout=5, encoding=None, data=None,
747 headers={}, query={}, expected_status=None):
749 Return the data of the page as a string.
752 url_or_request -- plain text URL as a string or
753 a compat_urllib_request.Requestobject
754 video_id -- Video/playlist/item identifier (string)
757 note -- note printed before downloading (string)
758 errnote -- note printed in case of an error (string)
759 fatal -- flag denoting whether error should be considered fatal,
760 i.e. whether it should cause ExtractionError to be raised,
761 otherwise a warning will be reported and extraction continued
762 tries -- number of tries
763 timeout -- sleep interval between tries
764 encoding -- encoding for a page content decoding, guessed automatically
765 when not explicitly specified
766 data -- POST data (bytes)
767 headers -- HTTP headers (dict)
768 query -- URL query (dict)
769 expected_status -- allows to accept failed HTTP requests (non 2xx
770 status code) by explicitly specifying a set of accepted status
771 codes. Can be any of the following entities:
772 - an integer type specifying an exact failed status code to
774 - a list or a tuple of integer types specifying a list of
775 failed status codes to accept
776 - a callable accepting an actual failed status code and
777 returning True if it should be accepted
778 Note that this argument does not affect success status codes (2xx)
779 which are always accepted.
784 while success is False:
786 res = self._download_webpage_handle(
787 url_or_request, video_id, note, errnote, fatal,
788 encoding=encoding, data=data, headers=headers, query=query,
789 expected_status=expected_status)
791 except compat_http_client.IncompleteRead as e:
793 if try_count >= tries:
795 self._sleep(timeout, video_id)
802 def _download_xml_handle(
803 self, url_or_request, video_id, note='Downloading XML',
804 errnote='Unable to download XML', transform_source=None,
805 fatal=True, encoding=None, data=None, headers={}, query={},
806 expected_status=None):
808 Return a tuple (xml as an compat_etree_Element, URL handle).
810 See _download_webpage docstring for arguments specification.
812 res = self._download_webpage_handle(
813 url_or_request, video_id, note, errnote, fatal=fatal,
814 encoding=encoding, data=data, headers=headers, query=query,
815 expected_status=expected_status)
818 xml_string, urlh = res
819 return self._parse_xml(
820 xml_string, video_id, transform_source=transform_source,
824 self, url_or_request, video_id,
825 note='Downloading XML', errnote='Unable to download XML',
826 transform_source=None, fatal=True, encoding=None,
827 data=None, headers={}, query={}, expected_status=None):
829 Return the xml as an compat_etree_Element.
831 See _download_webpage docstring for arguments specification.
833 res = self._download_xml_handle(
834 url_or_request, video_id, note=note, errnote=errnote,
835 transform_source=transform_source, fatal=fatal, encoding=encoding,
836 data=data, headers=headers, query=query,
837 expected_status=expected_status)
838 return res if res is False else res[0]
840 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
842 xml_string = transform_source(xml_string)
844 return compat_etree_fromstring(xml_string.encode('utf-8'))
845 except compat_xml_parse_error as ve:
846 errmsg = '%s: Failed to parse XML ' % video_id
848 raise ExtractorError(errmsg, cause=ve)
850 self.report_warning(errmsg + str(ve))
852 def _download_json_handle(
853 self, url_or_request, video_id, note='Downloading JSON metadata',
854 errnote='Unable to download JSON metadata', transform_source=None,
855 fatal=True, encoding=None, data=None, headers={}, query={},
856 expected_status=None):
858 Return a tuple (JSON object, URL handle).
860 See _download_webpage docstring for arguments specification.
862 res = self._download_webpage_handle(
863 url_or_request, video_id, note, errnote, fatal=fatal,
864 encoding=encoding, data=data, headers=headers, query=query,
865 expected_status=expected_status)
868 json_string, urlh = res
869 return self._parse_json(
870 json_string, video_id, transform_source=transform_source,
874 self, url_or_request, video_id, note='Downloading JSON metadata',
875 errnote='Unable to download JSON metadata', transform_source=None,
876 fatal=True, encoding=None, data=None, headers={}, query={},
877 expected_status=None):
879 Return the JSON object as a dict.
881 See _download_webpage docstring for arguments specification.
883 res = self._download_json_handle(
884 url_or_request, video_id, note=note, errnote=errnote,
885 transform_source=transform_source, fatal=fatal, encoding=encoding,
886 data=data, headers=headers, query=query,
887 expected_status=expected_status)
888 return res if res is False else res[0]
890 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
892 json_string = transform_source(json_string)
894 return json.loads(json_string)
895 except ValueError as ve:
896 errmsg = '%s: Failed to parse JSON ' % video_id
898 raise ExtractorError(errmsg, cause=ve)
900 self.report_warning(errmsg + str(ve))
902 def report_warning(self, msg, video_id=None):
903 idstr = '' if video_id is None else '%s: ' % video_id
904 self._downloader.report_warning(
905 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
907 def to_screen(self, msg):
908 """Print msg to screen, prefixing it with '[ie_name]'"""
909 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
911 def report_extraction(self, id_or_name):
912 """Report information extraction."""
913 self.to_screen('%s: Extracting information' % id_or_name)
915 def report_download_webpage(self, video_id):
916 """Report webpage download."""
917 self.to_screen('%s: Downloading webpage' % video_id)
919 def report_age_confirmation(self):
920 """Report attempt to confirm age."""
921 self.to_screen('Confirming age')
923 def report_login(self):
924 """Report attempt to log in."""
925 self.to_screen('Logging in')
928 def raise_login_required(msg='This video is only available for registered users'):
929 raise ExtractorError(
930 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        """Raise GeoRestrictedError carrying the candidate country codes.

        `countries` is forwarded so extract() can retry with a faked
        X-Forwarded-For IP from one of them.
        NOTE(review): upstream marks this @staticmethod; the decorator is not
        visible in this chunk.
        """
        raise GeoRestrictedError(msg, countries=countries)
937 # Methods for following #608
def url_result(url, ie=None, video_id=None, video_title=None):
    """Returns a URL that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    # NOTE(review): upstream this is a @staticmethod; the decorator line is
    # missing from this extract.
    video_info = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    if video_id is not None:
        video_info['id'] = video_id
    if video_title is not None:
        video_info['title'] = video_title
    return video_info
# Build a playlist result from an iterable of regex matches, mapping each
# match to a url_result (optionally through `getter`).
# NOTE(review): extraction artifact — the wrapper collecting the generator
# below into `urls` (upstream: `urls = orderedSet(... for m in matches)`)
# is missing from this extract; code kept byte-identical.
951 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
953 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
955 return self.playlist_result(
956 urls, playlist_id=playlist_id, playlist_title=playlist_title)
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Returns a playlist"""
    # NOTE(review): upstream this is a @staticmethod; the decorator line is
    # missing from this extract.
    video_info = {
        '_type': 'playlist',
        'entries': entries,
    }
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    if playlist_description:
        video_info['description'] = playlist_description
    return video_info
# Core regex lookup helper: try one pattern (or each of a list of patterns)
# against `string` and return the first matching group; fall back to
# `default`, or raise/warn depending on `fatal`.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers and several interior lines (list-iteration branch, group
# handling, `return default`) are missing; code kept byte-identical.
971 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
973 Perform a regex search on the given string, using a single or a list of
974 patterns returning the first matching group.
975 In case of failure return a default value or raise a WARNING or a
976 RegexNotFoundError, depending on fatal, specifying the field name.
978 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
979 mobj = re.search(pattern, string, flags)
# presumably inside `else: for p in pattern:` — TODO confirm against upstream
982 mobj = re.search(p, string, flags)
# Colorize the field name (ANSI blue) on capable non-Windows terminals
986 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
987 _name = '\033[0;34m%s\033[0m' % name
993 # return the first matching group
994 return next(g for g in mobj.groups() if g is not None)
996 return mobj.group(group)
997 elif default is not NO_DEFAULT:
# presumably `return default` follows here — TODO confirm against upstream
1000 raise RegexNotFoundError('Unable to extract %s' % _name)
1002 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    # Falsy results (None / '') are passed through untouched.
    return clean_html(res).strip() if res else res
1015 def _get_netrc_login_info(self, netrc_machine=None):
1018 netrc_machine = netrc_machine or self._NETRC_MACHINE
1020 if self._downloader.params.get('usenetrc', False):
1022 info = netrc.netrc().authenticators(netrc_machine)
1023 if info is not None:
1027 raise netrc.NetrcParseError(
1028 'No authenticators for %s' % netrc_machine)
1029 except (IOError, netrc.NetrcParseError) as err:
1030 self._downloader.report_warning(
1031 'parsing .netrc: %s' % error_to_compat_str(err))
1033 return username, password
1035 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1037 Get the login info as (username, password)
1038 First look for the manually specified credentials using username_option
1039 and password_option as keys in params dictionary. If no such credentials
1040 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1042 If there's no info available, return (None, None)
1044 if self._downloader is None:
1047 downloader_params = self._downloader.params
1049 # Attempt to use provided username and password or .netrc data
1050 if downloader_params.get(username_option) is not None:
1051 username = downloader_params[username_option]
1052 password = downloader_params[password_option]
1054 username, password = self._get_netrc_login_info(netrc_machine)
1056 return username, password
1058 def _get_tfa_info(self, note='two-factor verification code'):
1060 Get the two-factor authentication info
1061 TODO - asking the user will be required for sms/phone verify
1062 currently just uses the command line option
1063 If there's no info available, return None
1065 if self._downloader is None:
1067 downloader_params = self._downloader.params
1069 if downloader_params.get('twofactor') is not None:
1070 return downloader_params['twofactor']
1072 return compat_getpass('Type %s and press [Return]: ' % note)
1074 # Helper functions for extracting OpenGraph info
1076 def _og_regexes(prop):
1077 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1078 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1079 % {'prop': re.escape(prop)})
1080 template = r'<meta[^>]+?%s[^>]+?%s'
1082 template % (property_re, content_re),
1083 template % (content_re, property_re),
1087 def _meta_regex(prop):
1088 return r'''(?isx)<meta
1089 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1090 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1092 def _og_search_property(self, prop, html, name=None, **kargs):
1093 if not isinstance(prop, (list, tuple)):
1096 name = 'OpenGraph %s' % prop[0]
1099 og_regexes.extend(self._og_regexes(p))
1100 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1103 return unescapeHTML(escaped)
# Convenience wrapper: og:image lookup, non-fatal.
1105 def _og_search_thumbnail(self, html, **kargs):
1106 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
# Convenience wrapper: og:description lookup, non-fatal.
1108 def _og_search_description(self, html, **kargs):
1109 return self._og_search_property('description', html, fatal=False, **kargs)
# Convenience wrapper: og:title lookup (fatal unless the caller overrides).
1111 def _og_search_title(self, html, **kargs):
1112 return self._og_search_property('title', html, **kargs)
# Look up the og:video URL (also og:video:url); when `secure`, the
# og:video:secure_url variant is tried first.
# NOTE(review): extraction artifact — the `if secure:` guard line before
# the prepend below is missing from this extract; code kept byte-identical.
1114 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1115 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1117 regexes = self._og_regexes('video:secure_url') + regexes
1118 return self._html_search_regex(regexes, html, name, **kargs)
# Convenience wrapper: og:url lookup.
1120 def _og_search_url(self, html, **kargs):
1121 return self._og_search_property('url', html, **kargs)
1123 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1124 if not isinstance(name, (list, tuple)):
1126 if display_name is None:
1127 display_name = name[0]
1128 return self._html_search_regex(
1129 [self._meta_regex(n) for n in name],
1130 html, display_name, fatal=fatal, group='content', **kwargs)
# Dublin Core creator meta tag -> uploader field.
1132 def _dc_search_uploader(self, html):
1133 return self._html_search_meta('dc.creator', html, 'uploader')
1135 def _rta_search(self, html):
1136 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1137 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1138 r' content="RTA-5042-1996-1400-1577-RTA"',
# Map an ICRA 'rating' meta value to an age limit.
# NOTE(review): extraction artifact — the RATING_TABLE literal (original
# lines 1146-1156) is missing from this extract; code kept byte-identical.
1143 def _media_rating_search(self, html):
1144 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1145 rating = self._html_search_meta('rating', html)
1157 return RATING_TABLE.get(rating.lower())
# Map a schema.org isFamilyFriendly meta value to an age limit.
# NOTE(review): extraction artifact — the RATING_TABLE literal and early
# return (original lines 1165-1172) are missing; code kept byte-identical.
1159 def _family_friendly_search(self, html):
1160 # See http://schema.org/VideoObject
1161 family_friendly = self._html_search_meta(
1162 'isFamilyFriendly', html, default=None)
1164 if not family_friendly:
1173 return RATING_TABLE.get(family_friendly.lower())
# Twitter card player meta tag -> embedded player URL.
1175 def _twitter_search_player(self, html):
1176 return self._html_search_meta('twitter:player', html,
1177 'twitter card player')
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
    """Locate a JSON-LD <script> block in html and parse it into an info dict.

    Honours _search_regex-style `default` and `fatal` kwargs: when a
    `default` is supplied the lookup is implicitly non-fatal and the
    default (or {}) is returned when nothing usable is found.
    """
    json_ld = self._search_regex(
        JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
    default = kwargs.get('default', NO_DEFAULT)
    if not json_ld:
        # NO_DEFAULT is a sentinel object: compare by identity, not equality,
        # so a default value with a permissive __eq__ cannot break this check.
        return default if default is not NO_DEFAULT else {}
    # JSON-LD may be malformed and thus `fatal` should be respected.
    # At the same time `default` may be passed that assumes `fatal=False`
    # for _search_regex. Let's simulate the same behavior here as well.
    fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
    return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
# Turn a JSON-LD document (string or already-parsed structure) into an
# info-dict with title/description/timestamps/counts etc., dispatching on
# the schema.org @type of each entry.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; several structural lines (list wrapping, `info = {}`, loop over
# entries, dict `update(...)` openers, continue/break statements) are
# missing from this extract; code kept byte-identical.
1191 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1192 if isinstance(json_ld, compat_str):
1193 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1197 if not isinstance(json_ld, (list, tuple, dict)):
1199 if isinstance(json_ld, dict):
# Maps schema.org InteractionCounter types onto info-dict count kinds.
1202 INTERACTION_TYPE_MAP = {
1203 'CommentAction': 'comment',
1204 'AgreeAction': 'like',
1205 'DisagreeAction': 'dislike',
1206 'LikeAction': 'like',
1207 'DislikeAction': 'dislike',
1208 'ListenAction': 'view',
1209 'WatchAction': 'view',
1210 'ViewAction': 'view',
1213 def extract_interaction_statistic(e):
1214 interaction_statistic = e.get('interactionStatistic')
1215 if not isinstance(interaction_statistic, list):
1217 for is_e in interaction_statistic:
1218 if not isinstance(is_e, dict):
1220 if is_e.get('@type') != 'InteractionCounter':
1222 interaction_type = is_e.get('interactionType')
1223 if not isinstance(interaction_type, compat_str):
1225 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1226 if interaction_count is None:
1228 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1231 count_key = '%s_count' % count_kind
1232 if info.get(count_key) is not None:
1234 info[count_key] = interaction_count
1236 def extract_video_object(e):
1237 assert e['@type'] == 'VideoObject'
1239 'url': url_or_none(e.get('contentUrl')),
1240 'title': unescapeHTML(e.get('name')),
1241 'description': unescapeHTML(e.get('description')),
1242 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1243 'duration': parse_duration(e.get('duration')),
1244 'timestamp': unified_timestamp(e.get('uploadDate')),
1245 'filesize': float_or_none(e.get('contentSize')),
1246 'tbr': int_or_none(e.get('bitrate')),
1247 'width': int_or_none(e.get('width')),
1248 'height': int_or_none(e.get('height')),
1249 'view_count': int_or_none(e.get('interactionCount')),
1251 extract_interaction_statistic(e)
# Only entries declaring the schema.org context are considered.
1254 if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1255 item_type = e.get('@type')
1256 if expected_type is not None and expected_type != item_type:
1258 if item_type in ('TVEpisode', 'Episode'):
1259 episode_name = unescapeHTML(e.get('name'))
1261 'episode': episode_name,
1262 'episode_number': int_or_none(e.get('episodeNumber')),
1263 'description': unescapeHTML(e.get('description')),
1265 if not info.get('title') and episode_name:
1266 info['title'] = episode_name
1267 part_of_season = e.get('partOfSeason')
1268 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1270 'season': unescapeHTML(part_of_season.get('name')),
1271 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1273 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1274 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1275 info['series'] = unescapeHTML(part_of_series.get('name'))
1276 elif item_type == 'Movie':
1278 'title': unescapeHTML(e.get('name')),
1279 'description': unescapeHTML(e.get('description')),
1280 'duration': parse_duration(e.get('duration')),
1281 'timestamp': unified_timestamp(e.get('dateCreated')),
1283 elif item_type in ('Article', 'NewsArticle'):
1285 'timestamp': parse_iso8601(e.get('datePublished')),
1286 'title': unescapeHTML(e.get('headline')),
1287 'description': unescapeHTML(e.get('articleBody')),
1289 elif item_type == 'VideoObject':
1290 extract_video_object(e)
1292 video = e.get('video')
1293 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1294 extract_video_object(video)
# Drop all None-valued fields from the assembled result.
1296 return dict((k, v) for k, v in info.items() if v is not None)
# Collect name->value pairs of hidden/submit <input> tags from html
# (comments stripped first so commented-out inputs are ignored).
# NOTE(review): extraction artifact — the `hidden_inputs = {}` initializer
# and a couple of guard/continue lines are missing; code kept byte-identical.
# Upstream this is a @staticmethod.
1299 def _hidden_inputs(html):
1300 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1302 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1303 attrs = extract_attributes(input)
1306 if attrs.get('type') not in ('hidden', 'submit'):
1308 name = attrs.get('name') or attrs.get('id')
1309 value = attrs.get('value')
1310 if name and value is not None:
1311 hidden_inputs[name] = value
1312 return hidden_inputs
1314 def _form_hidden_inputs(self, form_id, html):
1315 form = self._search_regex(
1316 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1317 html, '%s form' % form_id, group='form')
1318 return self._hidden_inputs(form)
# Sort `formats` in place from worst to best quality using a composite key
# (protocol, language/quality/tbr/filesize/resolution, codec and extension
# preferences, ...), or by the caller-supplied field_preference tuple.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; many structural lines (the `if not formats:` guard, loop headers,
# try/except around ORDER.index, parts of the key tuple) are missing from
# this extract; code kept byte-identical.
1320 def _sort_formats(self, formats, field_preference=None):
1322 raise ExtractorError('No video formats found')
1325 # Automatically determine tbr when missing based on abr and vbr (improves
1326 # formats sorting in some cases)
1327 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1328 f['tbr'] = f['abr'] + f['vbr']
1330 def _formats_key(f):
1331 # TODO remove the following workaround
1332 from ..utils import determine_ext
1333 if not f.get('ext') and 'url' in f:
1334 f['ext'] = determine_ext(f['url'])
1336 if isinstance(field_preference, (list, tuple)):
1339 if f.get(field) is not None
1340 else ('' if field == 'format_id' else -1)
1341 for field in field_preference)
1343 preference = f.get('preference')
1344 if preference is None:
1346 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1349 protocol = f.get('protocol') or determine_protocol(f)
1350 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1352 if f.get('vcodec') == 'none': # audio only
1354 if self._downloader.params.get('prefer_free_formats'):
1355 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1357 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1360 audio_ext_preference = ORDER.index(f['ext'])
1362 audio_ext_preference = -1
1364 if f.get('acodec') == 'none': # video only
1366 if self._downloader.params.get('prefer_free_formats'):
1367 ORDER = ['flv', 'mp4', 'webm']
1369 ORDER = ['webm', 'flv', 'mp4']
1371 ext_preference = ORDER.index(f['ext'])
1374 audio_ext_preference = 0
# Composite sort key: None-valued fields sort as -1 ('' for format_id).
1378 f.get('language_preference') if f.get('language_preference') is not None else -1,
1379 f.get('quality') if f.get('quality') is not None else -1,
1380 f.get('tbr') if f.get('tbr') is not None else -1,
1381 f.get('filesize') if f.get('filesize') is not None else -1,
1382 f.get('vbr') if f.get('vbr') is not None else -1,
1383 f.get('height') if f.get('height') is not None else -1,
1384 f.get('width') if f.get('width') is not None else -1,
1387 f.get('abr') if f.get('abr') is not None else -1,
1388 audio_ext_preference,
1389 f.get('fps') if f.get('fps') is not None else -1,
1390 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1391 f.get('source_preference') if f.get('source_preference') is not None else -1,
1392 f.get('format_id') if f.get('format_id') is not None else '',
1394 formats.sort(key=_formats_key)
1396 def _check_formats(self, formats, video_id):
1398 formats[:] = filter(
1399 lambda f: self._is_valid_url(
1401 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1405 def _remove_duplicate_formats(formats):
1409 if f['url'] not in format_urls:
1410 format_urls.add(f['url'])
1411 unique_formats.append(f)
1412 formats[:] = unique_formats
# Probe `url` with a HEAD-style request and report whether it is reachable;
# non-HTTP(S) URLs are assumed valid.
# NOTE(review): extraction artifact — the `return True` lines, the `try:`
# opener, the to_screen call opener, `return False` and the re-raise are
# missing from this extract; code kept byte-identical. The mutable default
# `headers={}` is never mutated here, so it is harmless as written.
1414 def _is_valid_url(self, url, video_id, item='video', headers={}):
1415 url = self._proto_relative_url(url, scheme='http:')
1416 # For now assume non HTTP(S) URLs always valid
1417 if not (url.startswith('http://') or url.startswith('https://')):
1420 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1422 except ExtractorError as e:
1423 if isinstance(e.cause, compat_urllib_error.URLError):
1425 '%s: %s URL is invalid, skipping' % (video_id, item))
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    prefer_insecure = self._downloader.params.get('prefer_insecure', False)
    return 'http:' if prefer_insecure else 'https:'
1436 def _proto_relative_url(self, url, scheme=None):
1439 if url.startswith('//'):
1441 scheme = self.http_scheme()
1446 def _sleep(self, timeout, video_id, msg_template=None):
1447 if msg_template is None:
1448 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1449 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
# Download an Adobe HDS (f4m) manifest and delegate parsing to
# _parse_f4m_formats; returns [] via the `manifest is False` branch when a
# non-fatal download failed.
# NOTE(review): extraction artifact — the closing args of _download_xml
# (fatal=..., original lines 1462-1463) and the early-return body are
# missing from this extract; code kept byte-identical.
1453 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1454 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1455 fatal=True, m3u8_id=None):
1456 manifest = self._download_xml(
1457 manifest_url, video_id, 'Downloading f4m manifest',
1458 'Unable to download f4m manifest',
1459 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1460 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1461 transform_source=transform_source,
1464 if manifest is False:
1467 return self._parse_f4m_formats(
1468 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1469 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
# Parse an already-downloaded f4m (HDS) manifest element into a list of
# format dicts, recursing into stream-level manifests referenced by
# set-level ones and falling back to HLS for .m3u8 media entries.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; many structural lines (early returns, `formats = []`, loop
# continues, dict openers/closers, width/height/vcodec fields) are missing
# from this extract; code kept byte-identical.
1471 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1472 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1473 fatal=True, m3u8_id=None):
1474 if not isinstance(manifest, compat_etree_Element) and not fatal:
1477 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1478 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1479 if akamai_pv is not None and ';' in akamai_pv.text:
1480 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1481 if playerVerificationChallenge.strip() != '':
1485 manifest_version = '1.0'
1486 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1488 manifest_version = '2.0'
1489 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1490 # Remove unsupported DRM protected media from final formats
1491 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1492 media_nodes = remove_encrypted_media(media_nodes)
1496 manifest_base_url = get_base_url(manifest)
1498 bootstrap_info = xpath_element(
1499 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1500 'bootstrap info', default=None)
1503 mime_type = xpath_text(
1504 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1505 'base URL', default=None)
1506 if mime_type and mime_type.startswith('audio/'):
1509 for i, media_el in enumerate(media_nodes):
1510 tbr = int_or_none(media_el.attrib.get('bitrate'))
1511 width = int_or_none(media_el.attrib.get('width'))
1512 height = int_or_none(media_el.attrib.get('height'))
1513 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1514 # If <bootstrapInfo> is present, the specified f4m is a
1515 # stream-level manifest, and only set-level manifests may refer to
1516 # external resources. See section 11.4 and section 4 of F4M spec
1517 if bootstrap_info is None:
1519 # @href is introduced in 2.0, see section 11.6 of F4M spec
1520 if manifest_version == '2.0':
1521 media_url = media_el.attrib.get('href')
1522 if media_url is None:
1523 media_url = media_el.attrib.get('url')
1527 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1528 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1529 # If media_url is itself a f4m manifest do the recursive extraction
1530 # since bitrates in parent manifest (this one) and media_url manifest
1531 # may differ leading to inability to resolve the format by requested
1532 # bitrate in f4m downloader
1533 ext = determine_ext(manifest_url)
1535 f4m_formats = self._extract_f4m_formats(
1536 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1537 transform_source=transform_source, fatal=fatal)
1538 # Sometimes stream-level manifest contains single media entry that
1539 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1540 # At the same time parent's media entry in set-level manifest may
1541 # contain it. We will copy it from parent in such cases.
1542 if len(f4m_formats) == 1:
1545 'tbr': f.get('tbr') or tbr,
1546 'width': f.get('width') or width,
1547 'height': f.get('height') or height,
1548 'format_id': f.get('format_id') if not tbr else format_id,
1551 formats.extend(f4m_formats)
1554 formats.extend(self._extract_m3u8_formats(
1555 manifest_url, video_id, 'mp4', preference=preference,
1556 m3u8_id=m3u8_id, fatal=fatal))
1559 'format_id': format_id,
1560 'url': manifest_url,
1561 'manifest_url': manifest_url,
1562 'ext': 'flv' if bootstrap_info is not None else None,
1568 'preference': preference,
1572 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1574 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1578 'preference': preference - 100 if preference else -100,
1579 'resolution': 'multiple',
1580 'format_note': 'Quality selection URL',
# Download an HLS (m3u8) playlist and delegate parsing to
# _parse_m3u8_formats; the final playlist URL is taken from the response
# handle so redirects are honoured.
# NOTE(review): extraction artifact — the _download_webpage_handle url arg
# line, its fatal kwarg and the `res is False` early return (original lines
# 1588, 1591-1595, 1598) are missing; code kept byte-identical.
1583 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1584 entry_protocol='m3u8', preference=None,
1585 m3u8_id=None, note=None, errnote=None,
1586 fatal=True, live=False):
1587 res = self._download_webpage_handle(
1589 note=note or 'Downloading m3u8 information',
1590 errnote=errnote or 'Failed to download m3u8 information',
1596 m3u8_doc, urlh = res
1597 m3u8_url = urlh.geturl()
1599 return self._parse_m3u8_formats(
1600 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1601 preference=preference, m3u8_id=m3u8_id, live=live)
# Parse an HLS playlist document: media playlists are returned as a single
# format; master playlists are expanded into one format per
# EXT-X-STREAM-INF variant plus EXT-X-MEDIA renditions.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; many structural lines (DRM early returns, `formats = []`,
# `groups = {}`, dict openers/closers, continue statements, codec
# assignment) are missing from this extract; code kept byte-identical.
1603 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1604 entry_protocol='m3u8', preference=None,
1605 m3u8_id=None, live=False):
1606 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1609 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1614 format_url = lambda u: (
1616 if re.match(r'^https?://', u)
1617 else compat_urlparse.urljoin(m3u8_url, u))
1620 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1621 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1622 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1624 # We should try extracting formats only from master playlists [1, 4.3.4],
1625 # i.e. playlists that describe available qualities. On the other hand
1626 # media playlists [1, 4.3.3] should be returned as is since they contain
1627 # just the media without qualities renditions.
1628 # Fortunately, master playlist can be easily distinguished from media
1629 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1630 # master playlist tags MUST NOT appear in a media playist and vice versa.
1631 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1632 # media playlist and MUST NOT appear in master playlist thus we can
1633 # clearly detect media playlist with this criterion.
1635 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1638 'format_id': m3u8_id,
1640 'protocol': entry_protocol,
1641 'preference': preference,
1645 last_stream_inf = {}
1647 def extract_media(x_media_line):
1648 media = parse_m3u8_attributes(x_media_line)
1649 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1650 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1651 if not (media_type and group_id and name):
1653 groups.setdefault(group_id, []).append(media)
1654 if media_type not in ('VIDEO', 'AUDIO'):
1656 media_url = media.get('URI')
1659 for v in (m3u8_id, group_id, name):
1663 'format_id': '-'.join(format_id),
1664 'url': format_url(media_url),
1665 'manifest_url': m3u8_url,
1666 'language': media.get('LANGUAGE'),
1668 'protocol': entry_protocol,
1669 'preference': preference,
1671 if media_type == 'AUDIO':
1672 f['vcodec'] = 'none'
1675 def build_stream_name():
1676 # Despite specification does not mention NAME attribute for
1677 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1678 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1679 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1680 stream_name = last_stream_inf.get('NAME')
1683 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1684 # from corresponding rendition group
1685 stream_group_id = last_stream_inf.get('VIDEO')
1686 if not stream_group_id:
1688 stream_group = groups.get(stream_group_id)
1689 if not stream_group:
1690 return stream_group_id
1691 rendition = stream_group[0]
1692 return rendition.get('NAME') or stream_group_id
1694 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
1695 # chance to detect video only formats when EXT-X-STREAM-INF tags
1696 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
1697 for line in m3u8_doc.splitlines():
1698 if line.startswith('#EXT-X-MEDIA:'):
1701 for line in m3u8_doc.splitlines():
1702 if line.startswith('#EXT-X-STREAM-INF:'):
1703 last_stream_inf = parse_m3u8_attributes(line)
1704 elif line.startswith('#') or not line.strip():
1707 tbr = float_or_none(
1708 last_stream_inf.get('AVERAGE-BANDWIDTH') or
1709 last_stream_inf.get('BANDWIDTH'), scale=1000)
1712 format_id.append(m3u8_id)
1713 stream_name = build_stream_name()
1714 # Bandwidth of live streams may differ over time thus making
1715 # format_id unpredictable. So it's better to keep provided
1718 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1719 manifest_url = format_url(line.strip())
1721 'format_id': '-'.join(format_id),
1722 'url': manifest_url,
1723 'manifest_url': m3u8_url,
1726 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1727 'protocol': entry_protocol,
1728 'preference': preference,
1730 resolution = last_stream_inf.get('RESOLUTION')
1732 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1734 f['width'] = int(mobj.group('width'))
1735 f['height'] = int(mobj.group('height'))
1736 # Unified Streaming Platform
1738 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1740 abr, vbr = mobj.groups()
1741 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1746 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1748 audio_group_id = last_stream_inf.get('AUDIO')
1749 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1750 # references a rendition group MUST have a CODECS attribute.
1751 # However, this is not always respected, for example, [2]
1752 # contains EXT-X-STREAM-INF tag which references AUDIO
1753 # rendition group but does not have CODECS and despite
1754 # referencing an audio group it represents a complete
1755 # (with audio and video) format. So, for such cases we will
1756 # ignore references to rendition groups and treat them
1757 # as complete formats.
1758 if audio_group_id and codecs and f.get('vcodec') != 'none':
1759 audio_group = groups.get(audio_group_id)
1760 if audio_group and audio_group[0].get('URI'):
1761 # TODO: update acodec for audio only formats with
1763 f['acodec'] = 'none'
1765 last_stream_inf = {}
1769 def _xpath_ns(path, namespace=None):
1773 for c in path.split('/'):
1774 if not c or c == '.':
1777 out.append('{%s}%s' % (namespace, c))
1778 return '/'.join(out)
1780 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1781 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1787 namespace = self._parse_smil_namespace(smil)
1789 return self._parse_smil_formats(
1790 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1792 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1793 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1796 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1798 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1799 return self._download_xml(
1800 smil_url, video_id, 'Downloading SMIL file',
1801 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
# Build the full info dict (formats, subtitles, title/description/
# upload_date, thumbnails) from a parsed SMIL document.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; initializers (title/description/upload_date), the _sort_formats
# call, thumbnail list opener and the result-dict opener are missing from
# this extract; code kept byte-identical.
1803 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1804 namespace = self._parse_smil_namespace(smil)
1806 formats = self._parse_smil_formats(
1807 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1808 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
# Fallback video id derived from the SMIL file name.
1810 video_id = os.path.splitext(url_basename(smil_url))[0]
1814 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1815 name = meta.attrib.get('name')
1816 content = meta.attrib.get('content')
1817 if not name or not content:
1819 if not title and name == 'title':
1821 elif not description and name in ('description', 'abstract'):
1822 description = content
1823 elif not upload_date and name == 'date':
1824 upload_date = unified_strdate(content)
1827 'id': image.get('type'),
1828 'url': image.get('src'),
1829 'width': int_or_none(image.get('width')),
1830 'height': int_or_none(image.get('height')),
1831 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1835 'title': title or video_id,
1836 'description': description,
1837 'upload_date': upload_date,
1838 'thumbnails': thumbnails,
1840 'subtitles': subtitles,
1843 def _parse_smil_namespace(self, smil):
1844 return self._search_regex(
1845 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
# Walk the <video>/<audio> media entries of a SMIL document and build
# format dicts, dispatching per protocol/extension: RTMP, HLS, HDS, DASH,
# MSS and plain HTTP.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; many structural lines (base/srcs initializers, counters, dict
# openers/closers, the f4m query preparation) are missing from this
# extract; code kept byte-identical.
1847 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1849 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1850 b = meta.get('base') or meta.get('httpBase')
1861 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1862 for medium in media:
1863 src = medium.get('src')
1864 if not src or src in srcs:
1868 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1869 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1870 width = int_or_none(medium.get('width'))
1871 height = int_or_none(medium.get('height'))
1872 proto = medium.get('proto')
1873 ext = medium.get('ext')
1874 src_ext = determine_ext(src)
1875 streamer = medium.get('streamer') or base
1877 if proto == 'rtmp' or streamer.startswith('rtmp'):
1883 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1885 'filesize': filesize,
1889 if transform_rtmp_url:
1890 streamer, src = transform_rtmp_url(streamer, src)
1891 formats[-1].update({
1897 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1898 src_url = src_url.strip()
1900 if proto == 'm3u8' or src_ext == 'm3u8':
1901 m3u8_formats = self._extract_m3u8_formats(
1902 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1903 if len(m3u8_formats) == 1:
1905 m3u8_formats[0].update({
1906 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1911 formats.extend(m3u8_formats)
1912 elif src_ext == 'f4m':
1917 'plugin': 'flowplayer-3.2.0.1',
1919 f4m_url += '&' if '?' in f4m_url else '?'
1920 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1921 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1922 elif src_ext == 'mpd':
1923 formats.extend(self._extract_mpd_formats(
1924 src_url, video_id, mpd_id='dash', fatal=False))
1925 elif re.search(r'\.ism/[Mm]anifest', src_url):
1926 formats.extend(self._extract_ism_formats(
1927 src_url, video_id, ism_id='mss', fatal=False))
1928 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1932 'ext': ext or src_ext or 'flv',
1933 'format_id': 'http-%d' % (bitrate or http_count),
1935 'filesize': filesize,
# Collect subtitle tracks from the <textstream> elements of a SMIL
# document, grouped by language.
# NOTE(review): extraction artifact — the `urls`/`subtitles` initializers,
# continue statements and the appended dict's body/return are missing from
# this extract; code kept byte-identical.
1942 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1945 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1946 src = textstream.get('src')
1947 if not src or src in urls:
1950 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1951 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1952 subtitles.setdefault(lang, []).append({
    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        # Download an XSPF playlist document and delegate to _parse_xspf().
        # NOTE(review): the failed-download guard between the two calls is
        # elided in this excerpt.
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        # NOTE(review): 'xpsf' in the progress note above looks like a typo
        # for 'xspf' (user-visible message only).
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        # Turn an XSPF playlist document into a list of entries with formats.
        # NS_MAP maps the XSPF core namespace and the StreamOne extension
        # (s1:) used for per-location width/height/label metadata.
        # NOTE(review): some original lines are elided in this excerpt.
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # XSPF durations are expressed in milliseconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                    'manifest_url': xspf_url,
                    # The s1: attributes come from the StreamOne extension.
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            self._sort_formats(formats)
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        # Download a DASH MPD manifest and parse its formats.
        # NOTE(review): `formats_dict={}` is a mutable default argument; it
        # is passed through read-only, but a None default would be safer.
        # NOTE(review): the failed-download guard and the unpacking of `res`
        # into (mpd_doc, urlh) are elided in this excerpt.
        res = self._download_xml_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
        # Base URL for resolving relative BaseURL elements — taken from the
        # final (post-redirect) manifest URL.
        mpd_base_url = base_url(urlh.geturl())

        return self._parse_mpd_formats(
            mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
            formats_dict=formats_dict, mpd_url=mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        # NOTE(review): `formats_dict={}` is a mutable default argument; it
        # is only read from here, but a None default would be safer.
        # NOTE(review): some original lines are elided in this excerpt.
        # Live ("dynamic") manifests are not supported by this parser.
        if mpd_doc.get('type') == 'dynamic':

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        # Qualify an XPath with the manifest's XML namespace.
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Child elements inherit (and may override) the parent's
            # multisegment info — hence the copy.
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                        ms_info['total_number'] = 0
                            # @r is a repeat count: one S entry stands for 1+r segments.
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                        ms_info['initialization'] = initialization
                        extract_Initialization(segment_template)

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # DRM-protected renditions are skipped entirely.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type in ('video', 'audio'):
                        # Walk up the element hierarchy assembling BaseURL
                        # until an absolute URL is formed.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # The yt-namespaced contentLength attribute is YouTube-specific.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            # @bandwidth is bits/s; tbr is expressed in kbit/s.
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            # "no linguistic content"-style tags count as no language.
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            tmpl = representation_ms_info[template_name]
                            # First of, % characters outside $...$ templates
                            # must be escaped by doubling for proper processing
                            # by % operator string formatting used further (see
                            # https://github.com/ytdl-org/youtube-dl/issues/16867).
                                    in_template = not in_template
                                elif c == '%' and not in_template:
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): the result of str.replace below is
                            # discarded — as written this line is a no-op;
                            # presumably `t = t.replace('$$', '$')` was intended.
                            t.replace('$$', '$')

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/ytdl-org/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                                # $Time$ shall not be included for @initialization thus
                                # only $Bandwidth$ remains
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,

                        # Fragment dicts use 'url' for absolute and 'path' for
                        # relative locations.
                        def location_key(location):
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    # @t restarts the timeline; otherwise continue from previous.
                                    segment_time = s.get('t') or segment_time
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                # r+1 occurrences of each S entry consume URLs in order.
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/ytdl-org/youtube-dl/pull/14844
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                    location_key(segment_url): segment_url,
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # If there is a fragments key available then we correctly recognized fragmented media.
                        # Otherwise we will assume unfragmented media with direct access. Technically, such
                        # assumption is not necessarily correct since we may simply have no support for
                        # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                        if 'fragments' in representation_ms_info:
                                # NB: mpd_url may be empty when MPD manifest is parsed from a string
                                'url': mpd_url or base_url,
                                'fragment_base_url': base_url,
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                # Initialization fragment goes first, then media fragments.
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                            # Assuming direct URL to unfragmented media.
                        # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
                        # is not necessarily unique within a Period thus formats with
                        # the same `format_id` are quite possible. There are numerous examples
                        # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
                        # https://github.com/ytdl-org/youtube-dl/issues/13919)
                        full_info = formats_dict.get(representation_id, {}).copy()
                        formats.append(full_info)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
        # Download an ISM (Smooth Streaming) manifest and parse its formats.
        # NOTE(review): the failed-download guard and the unpacking of `res`
        # into (ism_doc, urlh) are elided in this excerpt.
        res = self._download_xml_handle(
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        # NOTE(review): some original lines are elided in this excerpt.
        # Live and DRM-protected streams are not supported.
        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:

        duration = int(ism_doc.attrib['Duration'])
        # Default timescale per [MS-SSTR] is 10,000,000 (100-ns units).
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio'):
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            for track in stream.findall('QualityLevel'):
                # AudioTag 255 denotes AAC; FourCC may then be absent.
                fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                # Bitrate attribute is in bits/s; tbr is kbit/s.
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        # Derive the duration from the next fragment's start time.
                        # NOTE(review): this indexes `stream_fragment` (an
                        # Element) rather than `stream_fragments` (the list)
                        # — verify this is intentional.
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        fragment_ctx['time'] += fragment_ctx['duration']

                    format_id.append(ism_id)
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))

                    'format_id': '-'.join(format_id),
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'fragments': fragments,
                    # Parameters consumed by the ISM downloader.
                    '_download_params': {
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        # Extract <video>/<audio> tags (and AMP equivalents) with their
        # <source> and <track> children from a webpage into entries.
        # NOTE(review): some original lines are elided in this excerpt.
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            if not content_type:
            # Split "type/subtype; codecs=..." into MIME type and codec list.
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)

        # NOTE(review): `type_info={}` is a mutable default argument; it is
        # only read from, but a None default would be safer.
        def _media_formats(src, cur_media_type, type_info={}):
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
                # Manifest URLs expand into multiple formats.
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                        # res attribute is not standard but seen several times
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        f.update(formats[0])
                        media_info['formats'].append(f)
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
            # Some servers require a Referer to serve the media.
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        # Derive both HDS (f4m) and HLS (m3u8) format lists from an Akamai
        # manifest URL, optionally rewriting the host per protocol.
        # NOTE(review): `hosts={}` is a mutable default argument; it is only
        # read from, but a None default would be safer.
        # NOTE(review): some original lines are elided in this excerpt.
        hdcore_sign = 'hdcore=3.7.0'
        # Akamai URL convention: /i/ serves HLS, /z/ serves HDS.
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # Propagate the hdcore signature to each fragment request.
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        # Probe the standard Wowza manifest endpoints (HLS, HDS, DASH, and
        # SMIL-derived RTMP/RTSP) for a stream URL.
        # NOTE(review): `skip_protocols=[]` is a mutable default argument;
        # it is only read from, but a None/() default would be safer.
        # NOTE(review): some original lines are elided in this excerpt.
        query = compat_urlparse.urlparse(url).query
        # Strip any explicit manifest suffix; it is re-added per protocol.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        # Preserve the secure-scheme marker ("s") when rebuilding as http(s).
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

        def manifest_url(manifest):
            m_url = '%s/%s' % (http_base_url, manifest)
                m_url += '?%s' % query

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    # RTSP takes the full path (url + play_path) as one URL.
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        # Locate a jwplayer("...").setup({...}) call in the page and parse
        # its options object; yields the parsed dict on success.
        # NOTE(review): some original lines are elided in this excerpt.
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 transform_source=transform_source)
            except ExtractorError:
                # Only a dict-shaped options object is usable.
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data
2609 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2610 jwplayer_data = self._find_jwplayer_data(
2611 webpage, video_id, transform_source=js_to_json)
2612 return self._parse_jwplayer_data(
2613 jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Convert a jwplayer setup/config dict into info-dict entries.
        # NOTE(review): some original lines are elided in this excerpt.
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                    # Only caption/subtitle tracks are collected.
                    if track_kind.lower() not in ('captions', 'subtitles'):
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)

                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                # A single YouTube URL is delegated to the YouTube extractor.
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Convert jwplayer "sources" entries into format dicts.
        # NOTE(review): some original lines are elided in this excerpt.
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            # De-duplicate by resolved URL.
            if not source_url or source_url in urls:
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                    'width': int_or_none(source.get('width')),
                    'tbr': int_or_none(source.get('bitrate')),
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                            'play_path': prefix + play_path,
                    a_format.update(rtmp_params)
                formats.append(a_format)
2747 def _live_title(self, name):
2748 """ Generate the title for a live video """
2749 now = datetime.datetime.now()
2750 now_str = now.strftime('%Y-%m-%d %H:%M')
2751 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        # Parse `v` as an integer; on failure either raise an ExtractorError
        # (fatal=True) or emit a downloader warning.
        # NOTE(review): some original lines (None check, branch keywords,
        # return) are elided in this excerpt.
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): stray debug print to stdout whenever get_attr is
            # passed — looks like leftover debugging; consider removing.
            print(getattr(v, kwargs['get_attr']))
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        raise ExtractorError(msg)
        self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        # Parse `v` as a float; on failure either raise an ExtractorError
        # (fatal=True) or emit a downloader warning.
        # NOTE(review): some original lines (None check, branch keywords,
        # return) are elided in this excerpt.
        res = float_or_none(v, **kwargs)
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        raise ExtractorError(msg)
        self._downloader.report_warning(msg)
2775 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2776 path='/', secure=False, discard=False, rest={}, **kwargs):
2777 cookie = compat_cookiejar.Cookie(
2778 0, name, value, port, port is not None, domain, True,
2779 domain.startswith('.'), path, True, secure, expire_time,
2780 discard, None, None, rest)
2781 self._downloader.cookiejar.set_cookie(cookie)
2783 def _get_cookies(self, url):
2784 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2785 req = sanitized_Request(url)
2786 self._downloader.cookiejar.add_cookie_header(req)
2787 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        # Yield this extractor's test cases, taken from _TEST or _TESTS.
        # NOTE(review): some original lines (else branch, loop header,
        # continue, yield) are elided in this excerpt.
        t = getattr(self, '_TEST', None)
        if t:
            # An extractor must define either _TEST or _TESTS, never both.
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            if not include_onlymatching and t.get('only_matching', False):
            # Test name is derived from the class name minus the 'IE' suffix.
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        # NOTE(review): some original lines are elided in this excerpt.
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # For playlist test cases, inspect the first entry instead.
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        # Suitable only if no test case was age-restricted.
        return not any_restricted
2818 def extract_subtitles(self, *args, **kwargs):
2819 if (self._downloader.params.get('writesubtitles', False) or
2820 self._downloader.params.get('listsubtitles')):
2821 return self._get_subtitles(*args, **kwargs)
2824 def _get_subtitles(self, *args, **kwargs):
2825 raise NotImplementedError('This method must be implemented by subclasses')
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        # NOTE(review): the decorator and the return statement are elided in
        # this excerpt.
        # Items from list1 win; list2 items are appended only for new URLs.
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        # NOTE(review): the decorator and the return statement are elided in
        # this excerpt.
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            # Per-language merge de-duplicates by URL (see _merge_subtitle_items).
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2844 def extract_automatic_captions(self, *args, **kwargs):
2845 if (self._downloader.params.get('writeautomaticsub', False) or
2846 self._downloader.params.get('listsubtitles')):
2847 return self._get_automatic_captions(*args, **kwargs)
2850 def _get_automatic_captions(self, *args, **kwargs):
2851 raise NotImplementedError('This method must be implemented by subclasses')
2853 def mark_watched(self, *args, **kwargs):
2854 if (self._downloader.params.get('mark_watched', False) and
2855 (self._get_login_info()[0] is not None or
2856 self._downloader.params.get('cookiefile') is not None)):
2857 self._mark_watched(*args, **kwargs)
2859 def _mark_watched(self, *args, **kwargs):
2860 raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        # Build request headers that route geo-verification traffic through
        # the user-configured proxy, if any.
        # NOTE(review): the initialisation and return of `headers` are
        # elided in this excerpt.
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
2869 def _generic_id(self, url):
2870 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2872 def _generic_title(self, url):
2873 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
2884 def _make_valid_url(cls):
2885 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2888 def suitable(cls, url):
2889 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Dispatch a search query of the form KEY[N|all]:terms to
        # _get_n_results with the requested number of results.
        # NOTE(review): some original lines (match guard, branch keywords,
        # int conversion of the prefix) are elided in this excerpt.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # An empty prefix means a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp the request to the extractor's maximum, with a warning.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
2911 def _get_n_results(self, query, n):
2912 """Get a specified number of results for a query"""
2913 raise NotImplementedError('This method must be implemented by subclasses')
    def SEARCH_KEY(self):
        # Accessor exposing the class's search key (the URL scheme prefix).
        # NOTE(review): the @property decorator appears to be elided in this
        # excerpt.
        return self._SEARCH_KEY