1 from __future__ import unicode_literals
15 from ..compat import (
18 compat_etree_fromstring,
24 compat_urllib_parse_unquote,
25 compat_urllib_parse_urlencode,
26 compat_urllib_request,
29 from ..downloader.f4m import remove_encrypted_media
58 parse_m3u8_attributes,
64 class InfoExtractor(object):
65 """Information Extractor class.
67 Information extractors are the classes that, given a URL, extract
68 information about the video (or videos) the URL refers to. This
69 information includes the real video URL, the video title, author and
70 others. The information is stored in a dictionary which is then
71 passed to the YoutubeDL. The YoutubeDL processes this
72 information possibly downloading the video to the file system, among
73 other possible outcomes.
75 The type field determines the type of the result.
76 By far the most common value (and the default if _type is missing) is
77 "video", which indicates a single video.
79 For a video, the dictionaries must include the following fields:
82 title: Video title, unescaped.
84 Additionally, it must contain either a formats entry or a url one:
86 formats: A list of dictionaries for each format available, ordered
87 from worst to best quality.
90 * url Mandatory. The URL of the video file
92 The URL of the manifest file in case of
93 fragmented media (DASH, hls, hds)
94 * ext Will be calculated from URL if missing
95 * format A human-readable description of the format
96 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
98 and format_note fields if missing.
99 * format_id A short description of the format
100 ("mp4_h264_opus" or "19").
101 Technically optional, but strongly recommended.
102 * format_note Additional info about the format
103 ("3D" or "DASH video")
104 * width Width of the video, if known
105 * height Height of the video, if known
106 * resolution Textual description of width and height
107 * tbr Average bitrate of audio and video in KBit/s
108 * abr Average audio bitrate in KBit/s
109 * acodec Name of the audio codec in use
110 * asr Audio sampling rate in Hertz
111 * vbr Average video bitrate in KBit/s
113 * vcodec Name of the video codec in use
114 * container Name of the container format
115 * filesize The number of bytes, if known in advance
116 * filesize_approx An estimate for the number of bytes
117 * player_url SWF Player URL (used for rtmpdump).
118 * protocol The protocol that will be used for the actual
119 download, lower-case.
120 "http", "https", "rtsp", "rtmp", "rtmpe",
121 "m3u8", "m3u8_native" or "http_dash_segments".
122 * fragments A list of fragments of the fragmented media,
123 with the following entries:
124 * "url" (mandatory) - fragment's URL
125 * "duration" (optional, int or float)
126 * "filesize" (optional, int)
127 * preference Order number of this format. If this field is
128 present and not None, the formats get sorted
129 by this field, regardless of all other values.
130 -1 for default (order by other properties),
131 -2 or smaller for less than default.
132 < -1000 to hide the format (if there is
133 another one which is strictly better)
134 * language Language code, e.g. "de" or "en-US".
135 * language_preference Is this in the language mentioned in
137 10 if it's what the URL is about,
138 -1 for default (don't know),
139 -10 otherwise, other values reserved for now.
140 * quality Order number of the video quality of this
141 format, irrespective of the file format.
142 -1 for default (order by other properties),
143 -2 or smaller for less than default.
144 * source_preference Order number for this video source
145 (quality takes higher priority)
146 -1 for default (order by other properties),
147 -2 or smaller for less than default.
148 * http_headers A dictionary of additional HTTP headers
149 to add to the request.
150 * stretched_ratio If given and not 1, indicates that the
151 video's pixels are not square.
152 width : height ratio as float.
153 * no_resume The server does not support resuming the
154 (HTTP or RTMP) download. Boolean.
156 url: Final video URL.
157 ext: Video filename extension.
158 format: The video format, defaults to ext (used for --get-format)
159 player_url: SWF Player URL (used for rtmpdump).
161 The following fields are optional:
163 alt_title: A secondary title of the video.
164 display_id An alternative identifier for the video, not necessarily
165 unique, but available before title. Typically, id is
166 something like "4234987", title "Dancing naked mole rats",
167 and display_id "dancing-naked-mole-rats"
168 thumbnails: A list of dictionaries, with the following entries:
169 * "id" (optional, string) - Thumbnail format ID
171 * "preference" (optional, int) - quality of the image
172 * "width" (optional, int)
173 * "height" (optional, int)
* "resolution" (optional, string "{width}x{height}",
176 * "filesize" (optional, int)
177 thumbnail: Full URL to a video thumbnail image.
178 description: Full video description.
179 uploader: Full name of the video uploader.
180 license: License name the video is licensed under.
181 creator: The creator of the video.
182 release_date: The date (YYYYMMDD) when the video was released.
183 timestamp: UNIX timestamp of the moment the video became available.
184 upload_date: Video upload date (YYYYMMDD).
185 If not explicitly set, calculated from timestamp.
186 uploader_id: Nickname or id of the video uploader.
187 uploader_url: Full URL to a personal webpage of the video uploader.
188 location: Physical location where the video was filmed.
189 subtitles: The available subtitles as a dictionary in the format
190 {language: subformats}. "subformats" is a list sorted from
191 lower to higher preference, each element is a dictionary
192 with the "ext" entry and one of:
193 * "data": The subtitles file contents
194 * "url": A URL pointing to the subtitles file
195 "ext" will be calculated from URL if missing
196 automatic_captions: Like 'subtitles', used by the YoutubeIE for
197 automatically generated captions
198 duration: Length of the video in seconds, as an integer or float.
199 view_count: How many users have watched the video on the platform.
200 like_count: Number of positive ratings of the video
201 dislike_count: Number of negative ratings of the video
202 repost_count: Number of reposts of the video
203 average_rating: Average rating give by users, the scale used depends on the webpage
204 comment_count: Number of comments on the video
205 comments: A list of comments, each with one or more of the following
206 properties (all but one of text or html optional):
207 * "author" - human-readable name of the comment author
208 * "author_id" - user ID of the comment author
210 * "html" - Comment as HTML
211 * "text" - Plain text of the comment
212 * "timestamp" - UNIX timestamp of comment
213 * "parent" - ID of the comment this one is replying to.
214 Set to "root" to indicate that this is a
215 comment to the original video.
216 age_limit: Age restriction for the video, as an integer (years)
217 webpage_url: The URL to the video webpage, if given to youtube-dl it
218 should allow to get the same result again. (It will be set
219 by YoutubeDL if it's missing)
220 categories: A list of categories that the video falls in, for example
222 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
223 is_live: True, False, or None (=unknown). Whether this video is a
224 live stream that goes on instead of a fixed-length video.
225 start_time: Time in seconds where the reproduction should start, as
226 specified in the URL.
227 end_time: Time in seconds where the reproduction should end, as
228 specified in the URL.
230 The following fields should only be used when the video belongs to some logical
233 chapter: Name or title of the chapter the video belongs to.
234 chapter_number: Number of the chapter the video belongs to, as an integer.
235 chapter_id: Id of the chapter the video belongs to, as a unicode string.
237 The following fields should only be used when the video is an episode of some
238 series, programme or podcast:
240 series: Title of the series or programme the video episode belongs to.
241 season: Title of the season the video episode belongs to.
242 season_number: Number of the season the video episode belongs to, as an integer.
243 season_id: Id of the season the video episode belongs to, as a unicode string.
244 episode: Title of the video episode. Unlike mandatory video title field,
245 this field should denote the exact title of the video episode
246 without any kind of decoration.
247 episode_number: Number of the video episode within a season, as an integer.
248 episode_id: Id of the video episode, as a unicode string.
250 The following fields should only be used when the media is a track or a part of
253 track: Title of the track.
254 track_number: Number of the track within an album or a disc, as an integer.
255 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
257 artist: Artist(s) of the track.
258 genre: Genre(s) of the track.
259 album: Title of the album the track belongs to.
260 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
261 album_artist: List of all artists appeared on the album (e.g.
262 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
264 disc_number: Number of the disc or other physical medium the track belongs to,
266 release_year: Year (YYYY) when the album was released.
268 Unless mentioned otherwise, the fields should be Unicode strings.
270 Unless mentioned otherwise, None is equivalent to absence of information.
273 _type "playlist" indicates multiple videos.
274 There must be a key "entries", which is a list, an iterable, or a PagedList
275 object, each element of which is a valid dictionary by this specification.
277 Additionally, playlists can have "title", "description" and "id" attributes
278 with the same semantics as videos (see above).
281 _type "multi_video" indicates that there are multiple videos that
282 form a single show, for examples multiple acts of an opera or TV episode.
283 It must have an entries key like a playlist and contain all the keys
284 required for a video at the same time.
287 _type "url" indicates that the video must be extracted from another
288 location, possibly by a different extractor. Its only required key is:
289 "url" - the next URL to extract.
290 The key "ie_key" can be set to the class name (minus the trailing "IE",
291 e.g. "Youtube") if the extractor class is known in advance.
292 Additionally, the dictionary may have any properties of the resolved entity
293 known in advance, for example "title" if the title of the referred video is
297 _type "url_transparent" entities have the same specification as "url", but
298 indicate that the given additional information is more precise than the one
299 associated with the resolved URL.
300 This is useful when a site employs a video service that hosts the video and
301 its technical metadata, but that video service does not embed a useful
302 title, description etc.
305 Subclasses of this one should re-define the _real_initialize() and
306 _real_extract() methods and define a _VALID_URL regexp.
307 Probably, they should also be added to the list of extractors.
309 Finally, the _WORKING attribute should be set to False for broken IEs
310 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): upstream also initializes a `self._ready = False` flag
    # here that initialize() consults; that line is not visible in this
    # copy — TODO confirm against the original file.
    self.set_downloader(downloader)
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Fix: the method takes `cls` and is called on extractor classes, so it
    # must be a classmethod; the decorator was missing.

    # This does not use has/getattr intentionally - we want to know whether
    # we have cached the regexp for *this* class, whereas getattr would also
    # match the superclass
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cls._VALID_URL_RE.match(url) is not None
@classmethod
def _match_id(cls, url):
    """Return the 'id' group of _VALID_URL matched against *url*.

    Fix: the visible code computed the match but never returned it (and
    lacked @classmethod); restored the assert + return of the id group.
    """
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    m = cls._VALID_URL_RE.match(url)
    assert m
    return compat_str(m.group('id'))
# NOTE(review): only this docstring line of the working() accessor survives
# in this copy; the `def` line and its `return` body appear to be missing —
# TODO restore from upstream.
"""Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # NOTE(review): upstream guards this call with a `self._ready` flag so
    # that _real_initialize() runs only once per instance; that guard is
    # not visible in this copy — TODO confirm against the original file.
    self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts.

    Fix: restored the `try:`, the initialize() call and the bare `raise`
    that were missing from this copy, making the exception chain valid:
    ExtractorError passes through, network/parse errors are wrapped.
    """
    try:
        self.initialize()
        return self._real_extract(url)
    except ExtractorError:
        # Already a user-facing error: propagate unchanged.
        raise
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e)
def set_downloader(self, downloader):
    """Attach the YoutubeDL instance this extractor reports/downloads through."""
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    pass
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    pass
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    # Fix: the decorator and `def` line were missing from this copy;
    # restored the classmethod wrapper around the surviving body.
    return compat_str(cls.__name__[:-2])
@property
def IE_NAME(self):
    # Fix: restored the missing @property/def lines around this surviving
    # return. Extractor name = class name minus the trailing "IE".
    return compat_str(type(self).__name__[:-2])
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """ Returns the response handle

    Fix: restored the if/elif/else, try/except and fatal/non-fatal
    scaffolding that was missing from this copy. On failure, raises
    ExtractorError when fatal, else warns and returns False.
    NOTE(review): reconstruction — verify against upstream youtube-dl.
    """
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen('%s' % (note,))
        else:
            self.to_screen('%s: %s' % (video_id, note))
    if isinstance(url_or_request, compat_urllib_request.Request):
        url_or_request = update_Request(
            url_or_request, data=data, headers=headers, query=query)
    else:
        if query:
            url_or_request = update_url_query(url_or_request, query)
        if data is not None or headers:
            url_or_request = sanitized_Request(url_or_request, data, headers)
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
        if fatal:
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            self._downloader.report_warning(errmsg)
            return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
    """ Returns a tuple (page content as string, URL handle)

    Fix: restored the missing `urlh is False` guard — without it a
    non-fatal failed request (which returns False) would crash in
    _webpage_read_content instead of propagating False.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
    if urlh is False:
        # _request_webpage only returns False in the non-fatal case
        assert not fatal
        return False
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (content, urlh)
@staticmethod
def _guess_encoding_from_content(content_type, webpage_bytes):
    """Guess a page's text encoding.

    Order: charset from the Content-Type header, then a <meta charset>
    in the first 1KB of the body, then a UTF-16 LE BOM, else UTF-8.
    Fix: restored the missing @staticmethod, the if/else branch lines
    and the final return that were lost in this copy.
    """
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            # UTF-16 little-endian byte order mark
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'

    return encoding
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read a response handle into a decoded string.

    Honors --dump-pages / --write-pages, guesses encoding when not given,
    and raises for known censorship/filtering block pages.
    Fix: restored missing try/except/if scaffolding and the final
    `return content` lost in this copy.
    NOTE(review): reconstruction — verify against upstream youtube-dl.
    """
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    if not encoding:
        encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen('Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        basen = '%s_%s' % (video_id, url)
        if len(basen) > 240:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Unknown codec name: fall back to UTF-8
        content = webpage_bytes.decode('utf-8', 'replace')

    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in content[:512]):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
    if '<title>The URL you requested has been blocked</title>' in content[:512]:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        block_msg = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if block_msg:
            msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)

    return content
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
    """ Returns the data of the page as a string

    Retries up to *tries* times on IncompleteRead, sleeping *timeout*
    seconds between attempts. Fix: restored the loop/try scaffolding and
    the final unpacking/return that were missing from this copy.
    """
    success = False
    try_count = 0
    while success is False:
        try:
            res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
            success = True
        except compat_http_client.IncompleteRead as e:
            try_count += 1
            if try_count >= tries:
                raise e
            self._sleep(timeout, video_id)
    if res is False:
        # Non-fatal download failure: propagate False to the caller
        return res
    else:
        content, _ = res
        return content
def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
    """Return the xml as an xml.etree.ElementTree.Element

    Fix: restored the missing `is False` early return (non-fatal download
    failure) and the `if transform_source:` guard that were lost in this
    copy — without the guard a None transform_source would be called.
    """
    xml_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
    if xml_string is False:
        return xml_string
    if transform_source:
        xml_string = transform_source(xml_string)
    return compat_etree_fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
                   fatal=True, encoding=None, data=None, headers={}, query={}):
    """Download a URL and parse it as JSON.

    Fix: restored the missing `return None` under the non-fatal failure
    check — as written, a failed non-fatal download fell through and
    passed False into _parse_json.
    """
    json_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding, data=data, headers=headers, query=query)
    if (not fatal) and json_string is False:
        return None
    return self._parse_json(
        json_string, video_id, transform_source=transform_source, fatal=fatal)
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
    """Parse *json_string*, optionally pre-processing it with
    *transform_source*. On parse failure: raise ExtractorError when
    fatal, else warn and return None.

    Fix: restored the missing `if transform_source:`, `try:` and
    fatal/non-fatal branch lines lost in this copy.
    """
    if transform_source:
        json_string = transform_source(json_string)
    try:
        return json.loads(json_string)
    except ValueError as ve:
        errmsg = '%s: Failed to parse JSON ' % video_id
        if fatal:
            raise ExtractorError(errmsg, cause=ve)
        else:
            self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with the IE name and,
    when given, the video id."""
    idstr = '' if video_id is None else '%s: ' % video_id
    self._downloader.report_warning('[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    self.to_screen('%s: Extracting information' % id_or_name)
def report_download_webpage(self, video_id):
    """Announce that the webpage download has started."""
    self.to_screen('%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
    """Announce an attempt to confirm the user's age."""
    self.to_screen('Confirming age')
def report_login(self):
    """Announce a login attempt."""
    self.to_screen('Logging in')
@staticmethod
def raise_login_required(msg='This video is only available for registered users'):
    """Raise a user-facing (expected) error asking for credentials.

    Fix: restored the missing @staticmethod and the `expected=True)`
    closing line — as visible, the call was syntactically unterminated.
    """
    raise ExtractorError(
        '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
        expected=True)
@staticmethod
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
    """Raise a user-facing (expected) geo-restriction error.

    Fix: restored the missing @staticmethod and the `expected=True)`
    closing line lost in this copy.
    """
    raise ExtractorError(
        '%s. You might want to use --proxy to workaround.' % msg,
        expected=True)
603 # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
    """Returns a URL that points to a page that should be processed

    Fix: restored the missing @staticmethod, the 'url'/'ie_key' dict
    entries and the final `return video_info` lost in this copy.
    """
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    if video_id is not None:
        video_info['id'] = video_id
    if video_title is not None:
        video_info['title'] = video_title
    return video_info
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Returns a playlist

    Fix: restored the missing @staticmethod, the 'entries' dict entry and
    the `if playlist_id:` / `if playlist_title:` guards and final return
    lost in this copy.
    """
    video_info = {'_type': 'playlist',
                  'entries': entries}
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    if playlist_description:
        video_info['description'] = playlist_description
    return video_info
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    # Fix: restored the docstring delimiters, the multi-pattern loop with
    # its break, the mobj/default/fatal branch structure and the returns,
    # all missing from this copy.
    # NOTE(review): reconstruction — verify against upstream youtube-dl.
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    # Fix: restored the docstring delimiters and the `if res:`/`else`
    # tail — as visible, a falsy result (None/'' from a non-fatal miss)
    # would have been passed to clean_html and crashed.
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if res:
        return clean_html(res).strip()
    else:
        return res
def _get_netrc_login_info(self, netrc_machine=None):
    """Look up (username, password) for *netrc_machine* in ~/.netrc.

    Only consulted when the user passed --netrc. Returns (None, None)
    when netrc is disabled or lookup/parsing fails (a warning is issued).
    Fix: restored the missing initializers, `try:` and the
    `if info is not None:` / `else:` branch lines lost in this copy.
    """
    username = None
    password = None
    netrc_machine = netrc_machine or self._NETRC_MACHINE

    if self._downloader.params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(netrc_machine)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError(
                    'No authenticators for %s' % netrc_machine)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.report_warning(
                'parsing .netrc: %s' % error_to_compat_str(err))

    return username, password
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
    """
    Get the login info as (username, password)
    First look for the manually specified credentials using username_option
    and password_option as keys in params dictionary. If no such credentials
    available look in the netrc file using the netrc_machine or _NETRC_MACHINE
    value.
    If there's no info available, return (None, None)
    """
    # Fix: restored the docstring delimiters, the (None, None) defaults,
    # the downloader-is-None early return and the `else:` fallback to
    # netrc, all missing from this copy.
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get(username_option) is not None:
        username = downloader_params[username_option]
        password = downloader_params[password_option]
    else:
        username, password = self._get_netrc_login_info(netrc_machine)

    return username, password
def _get_tfa_info(self, note='two-factor verification code'):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    """
    # Fix: restored the docstring delimiters and the missing
    # `return None` for the no-downloader case.
    if self._downloader is None:
        return None
    downloader_params = self._downloader.params

    if downloader_params.get('twofactor') is not None:
        return downloader_params['twofactor']

    return compat_getpass('Type %s and press [Return]: ' % note)
733 # Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
    """Build the two regexes matching an OpenGraph <meta> tag for *prop*
    (property-before-content and content-before-property orderings).

    Fix: restored the missing @staticmethod and the `return [` … `]`
    wrapper around the two template expansions lost in this copy.
    """
    content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
    property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                   % {'prop': re.escape(prop)})
    template = r'<meta[^>]+?%s[^>]+?%s'
    return [
        template % (property_re, content_re),
        template % (content_re, property_re),
    ]
@staticmethod
def _meta_regex(prop):
    """Return a verbose regex matching a <meta> tag whose name/property/
    itemprop/id/http-equiv equals *prop*, capturing its content attribute.

    Fix: restored the missing @staticmethod — the function takes no self.
    """
    return r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search *html* for the OpenGraph property (or list of property
    fallbacks) *prop* and return its unescaped content.

    Fix: restored the missing `prop = [prop]` normalization, the
    `og_regexes = []` init, the accumulation loop header and the
    `escaped is None` guard lost in this copy.
    """
    if not isinstance(prop, (list, tuple)):
        prop = [prop]
    if name is None:
        name = 'OpenGraph %s' % prop[0]
    og_regexes = []
    for p in prop:
        og_regexes.extend(self._og_regexes(p))
    escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
    if escaped is None:
        return None
    return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Extract the og:image property as the thumbnail URL (non-fatal)."""
    return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    """Extract the og:description property (non-fatal)."""
    return self._og_search_property('description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    """Extract the og:title property."""
    return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Extract the og:video URL, preferring og:video:secure_url when
    *secure* is set.

    Fix: restored the missing `if secure:` guard — as visible, the
    secure_url regexes were prepended unconditionally.
    """
    regexes = self._og_regexes('video') + self._og_regexes('video:url')
    if secure:
        regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    """Extract the og:url property."""
    return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Search *html* for a <meta> tag named *name* (or any of a list of
    names) and return its content attribute.

    Fix: restored the missing `name = [name]` normalization — without it
    a plain-string name would be iterated character by character below.
    """
    if not isinstance(name, (list, tuple)):
        name = [name]
    if display_name is None:
        display_name = name[0]
    return self._html_search_regex(
        [self._meta_regex(n) for n in name],
        html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
    """Extract the uploader from the Dublin Core dc.creator <meta> tag."""
    return self._html_search_meta('dc.creator', html, 'uploader')
def _rta_search(self, html):
    """Return age limit 18 if the page carries the RTA adult-content
    label, else 0.

    Fix: restored the missing `html):` call closer and the
    `return 18` / `return 0` lines lost in this copy.
    """
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    if re.search(r'(?ix)<meta\s+name="rating"\s+'
                 r'     content="RTA-5042-1996-1400-1577-RTA"',
                 html):
        return 18
    return 0
def _media_rating_search(self, html):
    """Map an ICRA-style 'rating' <meta> value to an age limit, or None.

    Fix: restored the `if not rating:` guard and the RATING_TABLE dict
    missing from this copy.
    NOTE(review): table values reconstructed from upstream — verify.
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)

    if not rating:
        return None

    RATING_TABLE = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    return RATING_TABLE.get(rating.lower())
def _family_friendly_search(self, html):
    """Map a schema.org isFamilyFriendly <meta> value to an age limit
    (0 when family friendly, 18 otherwise), or None when absent.

    Fix: restored the `return None` under the guard and the RATING_TABLE
    dict missing from this copy.
    NOTE(review): table values reconstructed from upstream — verify.
    """
    # See http://schema.org/VideoObject
    family_friendly = self._html_search_meta('isFamilyFriendly', html)

    if not family_friendly:
        return None

    RATING_TABLE = {
        '1': 0,
        'true': 0,
        '0': 18,
        'false': 18,
    }
    return RATING_TABLE.get(family_friendly.lower())
def _twitter_search_player(self, html):
    """Extract the player URL from the twitter:player card <meta> tag."""
    return self._html_search_meta(
        'twitter:player', html, 'twitter card player')
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
    """Find an application/ld+json <script> in *html* and parse it via
    _json_ld. Respects a caller-supplied `default` the way _search_regex
    does.

    Fix: restored the missing `if not json_ld:` guard in front of the
    default/{} early return lost in this copy.
    """
    json_ld = self._search_regex(
        r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
        html, 'JSON-LD', group='json_ld', **kwargs)
    default = kwargs.get('default', NO_DEFAULT)
    if not json_ld:
        return default if default is not NO_DEFAULT else {}
    # JSON-LD may be malformed and thus `fatal` should be respected.
    # At the same time `default` may be passed that assumes `fatal=False`
    # for _search_regex. Let's simulate the same behavior here as well.
    fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
    return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """Convert parsed JSON-LD (string, dict or list of dicts) into an
    info-dict fragment for recognized schema.org types.

    Fix: restored the loop/branch scaffolding (`if not json_ld:`,
    `info = {}`, list normalization, `for e in json_ld:`, `continue`,
    the `info.update({` openers and the final filtered return) that was
    missing from this copy.
    NOTE(review): reconstruction — verify against upstream youtube-dl.
    """
    if isinstance(json_ld, compat_str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    info = {}
    if not isinstance(json_ld, (list, tuple, dict)):
        return info
    if isinstance(json_ld, dict):
        json_ld = [json_ld]
    for e in json_ld:
        if e.get('@context') == 'http://schema.org':
            item_type = e.get('@type')
            if expected_type is not None and expected_type != item_type:
                continue
            if item_type == 'TVEpisode':
                info.update({
                    'episode': unescapeHTML(e.get('name')),
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                part_of_season = e.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody')),
                })
            elif item_type == 'VideoObject':
                info.update({
                    'url': e.get('contentUrl'),
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'thumbnail': e.get('thumbnailUrl'),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('uploadDate')),
                    'filesize': float_or_none(e.get('contentSize')),
                    'tbr': int_or_none(e.get('bitrate')),
                    'width': int_or_none(e.get('width')),
                    'height': int_or_none(e.get('height')),
                })
            break
    # Drop keys whose value could not be determined
    return dict((k, v) for k, v in info.items() if v is not None)
@staticmethod
def _hidden_inputs(html):
    """Collect {name_or_id: value} for hidden/submit <input> elements in
    *html* (HTML comments are stripped first to avoid false matches).

    Fix: restored the missing @staticmethod, the result-dict init, the
    `continue` under the type check and the final return lost in this copy.
    """
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input)
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
def _form_hidden_inputs(self, form_id, html):
    """Return the hidden inputs of the <form> whose id equals *form_id*."""
    pattern = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form = self._search_regex(pattern, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form)
def _sort_formats(self, formats, field_preference=None):
    """Sort *formats* in place from worst to best quality.

    When *field_preference* is given, sorts by those fields only;
    otherwise uses the standard preference/quality/bitrate/size key.
    Fix: restored the empty-list guard, the `_formats_key` closure
    definition, its try/except blocks and the return-tuple opener, all
    missing from this copy.
    NOTE(review): reconstruction — verify against upstream youtube-dl.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    for f in formats:
        # Automatically determine tbr when missing based on abr and vbr (improves
        # formats sorting in some cases)
        if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
            f['tbr'] = f['abr'] + f['vbr']

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        if isinstance(field_preference, (list, tuple)):
            return tuple(
                f.get(field)
                if f.get(field) is not None
                else ('' if field == 'format_id' else -1)
                for field in field_preference)

        preference = f.get('preference')
        if preference is None:
            preference = 0
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        protocol = f.get('protocol') or determine_protocol(f)
        proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

        if f.get('vcodec') == 'none':  # audio only
            preference -= 50
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if f.get('acodec') == 'none':  # video only
                preference -= 40
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        return (
            preference,
            f.get('language_preference') if f.get('language_preference') is not None else -1,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            proto_preference,
            ext_preference,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('fps') if f.get('fps') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('source_preference') if f.get('source_preference') is not None else -1,
            f.get('format_id') if f.get('format_id') is not None else '',
        )
    formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        """Filter *formats* in place, dropping entries whose URL is invalid."""
                lambda f: self._is_valid_url(
                    # Use the format_id in the progress message when available.
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1006 def _remove_duplicate_formats(formats):
1010 if f['url'] not in format_urls:
1011 format_urls.add(f['url'])
1012 unique_formats.append(f)
1013 formats[:] = unique_formats
    def _is_valid_url(self, url, video_id, item='video'):
        """Probe *url* with a request to decide whether it is reachable."""
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
        except ExtractorError as e:
            # Network-level failures mean the URL is dead; other extractor
            # errors are not treated as invalidity here.
            if isinstance(e.cause, compat_urllib_error.URLError):
                '%s: %s URL is invalid, skipping' % (video_id, item))
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
            # --prefer-insecure selects plain http.
            if self._downloader.params.get('prefer_insecure', False)
1037 def _proto_relative_url(self, url, scheme=None):
1040 if url.startswith('//'):
1042 scheme = self.http_scheme()
1047 def _sleep(self, timeout, video_id, msg_template=None):
1048 if msg_template is None:
1049 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1050 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1054 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1055 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1056 fatal=True, m3u8_id=None):
1057 manifest = self._download_xml(
1058 manifest_url, video_id, 'Downloading f4m manifest',
1059 'Unable to download f4m manifest',
1060 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1061 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1062 transform_source=transform_source,
1065 if manifest is False:
1068 return self._parse_f4m_formats(
1069 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1070 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Build a formats list from an already-parsed f4m (HDS) manifest element."""
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # NOTE(review): xpath_text name argument says 'base URL' here but the
        # lookup is for mimeType — message-only inconsistency.
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # format_id: '<f4m_id>-<tbr>' or '<f4m_id>-<index>' when no bitrate.
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                    if media_url is None:
                        media_url = media_el.attrib.get('url')
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                    formats.extend(f4m_formats)
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                # flv only when a bootstrapInfo marks this as a stream-level manifest.
                'ext': 'flv' if bootstrap_info is not None else None,
                'preference': preference,
1172 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1174 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1178 'preference': preference - 100 if preference else -100,
1179 'resolution': 'multiple',
1180 'format_note': 'Quality selection URL',
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):
        """Download an HLS (m3u8) playlist and return the formats it describes."""
        res = self._download_webpage_handle(
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
        m3u8_doc, urlh = res
        # Follow redirects: resolve variant URLs against the final URL.
        m3u8_url = urlh.geturl()

        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
                'format_id': m3u8_id,
                'protocol': entry_protocol,
                'preference': preference,
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                media = parse_m3u8_attributes(line)
                media_type = media.get('TYPE')
                if media_type in ('VIDEO', 'AUDIO'):
                    media_url = media.get('URI')
                        for v in (media.get('GROUP-ID'), media.get('NAME')):
                            'format_id': '-'.join(format_id),
                            'url': format_url(media_url),
                            'language': media.get('LANGUAGE'),
                            'vcodec': 'none' if media_type == 'AUDIO' else None,
                            'protocol': entry_protocol,
                            'preference': preference,
                        # When there is no URI in EXT-X-MEDIA let this tag's
                        # data be used by regular URI lines below
            elif line.startswith('#') or not line.strip():
                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
                    format_id.append(m3u8_id)
                # Despite specification does not mention NAME attribute for
                # EXT-X-STREAM-INF it still sometimes may be present
                stream_name = last_info.get('NAME') or last_media.get('NAME')
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': manifest_url,
                    'fps': float_or_none(last_info.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                # Unified Streaming Platform
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                f.update(parse_codecs(last_info.get('CODECS')))
1302 def _xpath_ns(path, namespace=None):
1306 for c in path.split('/'):
1307 if not c or c == '.':
1310 out.append('{%s}%s' % (namespace, c))
1311 return '/'.join(out)
1313 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1314 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1320 namespace = self._parse_smil_namespace(smil)
1322 return self._parse_smil_formats(
1323 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1325 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1326 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1329 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1331 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1332 return self._download_xml(
1333 smil_url, video_id, 'Downloading SMIL file',
1334 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Turn a parsed SMIL document into an info dict (formats, subtitles, metadata)."""
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # Fall back to the SMIL file's basename as the video id.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
1376 def _parse_smil_namespace(self, smil):
1377 return self._search_regex(
1378 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Build a formats list from <video>/<audio> nodes of a SMIL document."""
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')

        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'filesize': filesize,
                # Allow the caller to rewrite streamer/play-path pairs
                # (some sites need tweaked RTMP URLs).
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                formats.extend(m3u8_formats)

            if src_ext == 'f4m':
                        'plugin': 'flowplayer-3.2.0.1',
                    f4m_url += '&' if '?' in f4m_url else '?'
                    f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))

            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'filesize': filesize,
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        """Collect subtitle tracks from <textstream> nodes of a SMIL document."""
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            # Language falls back through several attribute spellings, then
            # to the caller-supplied default.
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
1490 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1491 xspf = self._download_xml(
1492 playlist_url, playlist_id, 'Downloading xpsf playlist',
1493 'Unable to download xspf manifest', fatal=fatal)
1496 return self._parse_xspf(xspf, playlist_id)
    def _parse_xspf(self, playlist, playlist_id):
        """Turn a parsed XSPF playlist document into a list of entry dicts."""
            'xspf': 'http://xspf.org/ns/0/',
            # StreamOne extension attributes (label/width/height on <location>).
            's1': 'http://static.streamone.nl/player/ns/0',

        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
                'url': location.text,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)

            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
1533 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1534 res = self._download_webpage_handle(
1536 note=note or 'Downloading MPD manifest',
1537 errnote=errnote or 'Failed to download MPD manifest',
1542 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1544 return self._parse_mpd_formats(
1545 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1546 formats_dict=formats_dict, mpd_url=mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        # NOTE(review): formats_dict={} is a mutable default; it appears to be
        # read-only here, but consider switching to None + normalization.
        # Live ('dynamic') manifests are not supported.
        if mpd_doc.get('type') == 'dynamic':

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Child inherits the parent's segment info and overrides locally.
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                        ms_info['total_number'] = 0
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = int(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media_template = segment_template.get('media')
                        ms_info['media_template'] = media_template
                    initialization = segment_template.get('initialization')
                        ms_info['initialization_url'] = initialization
                        extract_Initialization(segment_template)

        def combine_url(base_url, target_url):
            if re.match(r'^https?://', target_url):
            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # DRM-protected renditions cannot be downloaded; skip them.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type == 'video' or content_type == 'audio':
                        # BaseURL elements compose hierarchically, innermost first.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
                            media_template = representation_ms_info['media_template']
                            media_template = media_template.replace('$RepresentationID$', representation_id)
                            media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
                            media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
                            # NOTE(review): BUG — str.replace returns a new string;
                            # this result is discarded. Should be
                            # media_template = media_template.replace('$$', '$')
                            media_template.replace('$$', '$')

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                # NOTE(review): BUG — "and 'segment_duration'" tests a
                                # constant truthy string; almost certainly meant
                                # "and 'segment_duration' in representation_ms_info".
                                if 'total_number' not in representation_ms_info and 'segment_duration':
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    'url': media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': representation_attrib.get('bandwidth'),
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': representation_attrib.get('bandwidth'),
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    # @r repeats the same-duration segment r more times.
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            for segment_url in representation_ms_info['segment_urls']:
                                s = representation_ms_info['s'][s_num]
                                for r in range(s.get('r', 0) + 1):
                                        'duration': float_or_none(s['d'], representation_ms_info['timescale']),
                            representation_ms_info['fragments'] = fragments
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({'url': initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                            for fragment in f['fragments']:
                                fragment['url'] = combine_url(base_url, fragment['url'])
                            # Merge into an existing format with the same id
                            # (e.g. audio+video halves of one representation).
                            existing_format = next(
                                fo for fo in formats
                                if fo['format_id'] == representation_id)
                        except StopIteration:
                            full_info = formats_dict.get(representation_id, {}).copy()
                            formats.append(full_info)
                            existing_format.update(f)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
        """Find HTML5 <video>/<audio> tags in *webpage* and build entry dicts
        (formats, thumbnail, subtitles) from their src/<source>/<track> children."""
        def absolute_url(video_url):
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Parse a MIME type attribute, optionally with a codecs= parameter,
            # into partial format info (ext/vcodec/acodec).
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)

        def _media_formats(src, cur_media_type):
            # Returns (is_plain_url, formats): m3u8 sources expand to several
            # formats; plain URLs yield a single entry.
            full_url = absolute_url(src)
            if determine_ext(full_url) == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # Self-closing tags have no inner content; paired tags may contain
        # <source>/<track> children captured as media_content.
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    is_plain_url, formats = _media_formats(src, media_type)
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
1857 def _extract_akamai_formats(self, manifest_url, video_id):
1859 hdcore_sign = 'hdcore=3.7.0'
1860 f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
1861 if 'hdcore=' not in f4m_url:
1862 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
1863 f4m_formats = self._extract_f4m_formats(
1864 f4m_url, video_id, f4m_id='hds', fatal=False)
1865 for entry in f4m_formats:
1866 entry.update({'extra_param_to_segment_url': hdcore_sign})
1867 formats.extend(f4m_formats)
1868 m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
1869 formats.extend(self._extract_m3u8_formats(
1870 m3u8_url, video_id, 'mp4', 'm3u8_native',
1871 m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe the standard Wowza manifest endpoints (HLS/HDS/DASH/SMIL/RTMP/RTSP)
        derived from *url* and collect all resulting formats.

        NOTE(review): skip_protocols=[] is a mutable default argument; it looks
        read-only here, but consider None + normalization.
        """
        # Strip any concrete manifest filename to get the stream base URL.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
        http_base_url = 'http' + url_base
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                http_base_url + '/playlist.m3u8', video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                http_base_url + '/manifest.f4m',
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                http_base_url + '/manifest.mpd',
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    http_base_url + '/jwplayer.smil',
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    # Derive a matching RTSP format from each RTMP one.
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                        'url': protocol + url_base,
                        'format_id': protocol,
                        'protocol': protocol,
1917 def _live_title(self, name):
1918 """ Generate the title for a live video """
1919 now = datetime.datetime.now()
1920 now_str = now.strftime('%Y-%m-%d %H:%M')
1921 return name + ' ' + now_str
1923 def _int(self, v, name, fatal=False, **kwargs):
1924 res = int_or_none(v, **kwargs)
1925 if 'get_attr' in kwargs:
1926 print(getattr(v, kwargs['get_attr']))
1928 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1930 raise ExtractorError(msg)
1932 self._downloader.report_warning(msg)
1935 def _float(self, v, name, fatal=False, **kwargs):
1936 res = float_or_none(v, **kwargs)
1938 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1940 raise ExtractorError(msg)
1942 self._downloader.report_warning(msg)
1945 def _set_cookie(self, domain, name, value, expire_time=None):
1946 cookie = compat_cookiejar.Cookie(
1947 0, name, value, None, None, domain, None,
1948 None, '/', True, False, expire_time, '', None, None, None)
1949 self._downloader.cookiejar.set_cookie(cookie)
1951 def _get_cookies(self, url):
1952 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1953 req = sanitized_Request(url)
1954 self._downloader.cookiejar.add_cookie_header(req)
1955 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1957 def get_testcases(self, include_onlymatching=False):
1958 t = getattr(self, '_TEST', None)
1960 assert not hasattr(self, '_TESTS'), \
1961 '%s has _TEST and _TESTS' % type(self).__name__
1964 tests = getattr(self, '_TESTS', [])
1966 if not include_onlymatching and t.get('only_matching', False):
1968 t['name'] = type(self).__name__[:-len('IE')]
1971 def is_suitable(self, age_limit):
1972 """ Test whether the extractor is generally suitable for the given
1973 age limit (i.e. pornographic sites are not, all others usually are) """
1975 any_restricted = False
1976 for tc in self.get_testcases(include_onlymatching=False):
1977 if tc.get('playlist', []):
1978 tc = tc['playlist'][0]
1979 is_restricted = age_restricted(
1980 tc.get('info_dict', {}).get('age_limit'), age_limit)
1981 if not is_restricted:
1983 any_restricted = any_restricted or is_restricted
1984 return not any_restricted
1986 def extract_subtitles(self, *args, **kwargs):
1987 if (self._downloader.params.get('writesubtitles', False) or
1988 self._downloader.params.get('listsubtitles')):
1989 return self._get_subtitles(*args, **kwargs)
    def _get_subtitles(self, *args, **kwargs):
        # Subclass hook invoked by extract_subtitles(); concrete extractors
        # override this to actually fetch the subtitles.
        raise NotImplementedError('This method must be implemented by subclasses')
1996 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1997 """ Merge subtitle items for one language. Items with duplicated URLs
1998 will be dropped. """
1999 list1_urls = set([item['url'] for item in subtitle_list1])
2000 ret = list(subtitle_list1)
2001 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2005 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2006 """ Merge two subtitle dictionaries, language by language. """
2007 ret = dict(subtitle_dict1)
2008 for lang in subtitle_dict2:
2009 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2012 def extract_automatic_captions(self, *args, **kwargs):
2013 if (self._downloader.params.get('writeautomaticsub', False) or
2014 self._downloader.params.get('listsubtitles')):
2015 return self._get_automatic_captions(*args, **kwargs)
    def _get_automatic_captions(self, *args, **kwargs):
        # Subclass hook invoked by extract_automatic_captions(); concrete
        # extractors override this to actually fetch automatic captions.
        raise NotImplementedError('This method must be implemented by subclasses')
2021 def mark_watched(self, *args, **kwargs):
2022 if (self._downloader.params.get('mark_watched', False) and
2023 (self._get_login_info()[0] is not None or
2024 self._downloader.params.get('cookiefile') is not None)):
2025 self._mark_watched(*args, **kwargs)
    def _mark_watched(self, *args, **kwargs):
        # Subclass hook invoked by mark_watched() once the user option and
        # session checks have passed; concrete extractors override this.
        raise NotImplementedError('This method must be implemented by subclasses')
2030 def geo_verification_headers(self):
2032 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2033 if geo_verification_proxy:
2034 headers['Ytdl-request-proxy'] = geo_verification_proxy
2037 def _generic_id(self, url):
2038 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2040 def _generic_title(self, url):
2041 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    # Fix: restored the docstring delimiters, the @classmethod/@property
    # decorators and the missing control-flow lines in _real_extract
    # (mobj None check, prefix dispatch, n <= 0 guard).

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key>all:<query>" or "<key><n>:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Dispatch a search query to _get_n_results with the requested
        result count (1 by default, _MAX_RESULTS for 'all', capped n
        otherwise)."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Warn and clamp instead of failing on an oversized request.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY