1 from __future__ import unicode_literals
15 from ..compat import (
18 compat_etree_fromstring,
55 class InfoExtractor(object):
56 """Information Extractor class.
58 Information extractors are the classes that, given a URL, extract
59 information about the video (or videos) the URL refers to. This
60 information includes the real video URL, the video title, author and
61 others. The information is stored in a dictionary which is then
62 passed to the YoutubeDL. The YoutubeDL processes this
63 information possibly downloading the video to the file system, among
64 other possible outcomes.
66 The type field determines the type of the result.
67 By far the most common value (and the default if _type is missing) is
68 "video", which indicates a single video.
70 For a video, the dictionaries must include the following fields:
73 title: Video title, unescaped.
75 Additionally, it must contain either a formats entry or a url one:
77 formats: A list of dictionaries for each format available, ordered
78 from worst to best quality.
81 * url Mandatory. The URL of the video file
82 * ext Will be calculated from URL if missing
83 * format A human-readable description of the format
84 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
and format_note fields if missing.
87 * format_id A short description of the format
88 ("mp4_h264_opus" or "19").
89 Technically optional, but strongly recommended.
90 * format_note Additional info about the format
91 ("3D" or "DASH video")
92 * width Width of the video, if known
93 * height Height of the video, if known
94 * resolution Textual description of width and height
95 * tbr Average bitrate of audio and video in KBit/s
96 * abr Average audio bitrate in KBit/s
97 * acodec Name of the audio codec in use
98 * asr Audio sampling rate in Hertz
99 * vbr Average video bitrate in KBit/s
101 * vcodec Name of the video codec in use
102 * container Name of the container format
103 * filesize The number of bytes, if known in advance
104 * filesize_approx An estimate for the number of bytes
105 * player_url SWF Player URL (used for rtmpdump).
106 * protocol The protocol that will be used for the actual
107 download, lower-case.
108 "http", "https", "rtsp", "rtmp", "rtmpe",
109 "m3u8", "m3u8_native" or "http_dash_segments".
110 * preference Order number of this format. If this field is
111 present and not None, the formats get sorted
112 by this field, regardless of all other values.
113 -1 for default (order by other properties),
114 -2 or smaller for less than default.
115 < -1000 to hide the format (if there is
116 another one which is strictly better)
117 * language Language code, e.g. "de" or "en-US".
118 * language_preference Is this in the language mentioned in
120 10 if it's what the URL is about,
121 -1 for default (don't know),
122 -10 otherwise, other values reserved for now.
123 * quality Order number of the video quality of this
124 format, irrespective of the file format.
125 -1 for default (order by other properties),
126 -2 or smaller for less than default.
127 * source_preference Order number for this video source
128 (quality takes higher priority)
129 -1 for default (order by other properties),
130 -2 or smaller for less than default.
131 * http_headers A dictionary of additional HTTP headers
132 to add to the request.
133 * stretched_ratio If given and not 1, indicates that the
134 video's pixels are not square.
135 width : height ratio as float.
136 * no_resume The server does not support resuming the
137 (HTTP or RTMP) download. Boolean.
139 url: Final video URL.
140 ext: Video filename extension.
141 format: The video format, defaults to ext (used for --get-format)
142 player_url: SWF Player URL (used for rtmpdump).
144 The following fields are optional:
146 alt_title: A secondary title of the video.
147 display_id An alternative identifier for the video, not necessarily
148 unique, but available before title. Typically, id is
149 something like "4234987", title "Dancing naked mole rats",
150 and display_id "dancing-naked-mole-rats"
151 thumbnails: A list of dictionaries, with the following entries:
152 * "id" (optional, string) - Thumbnail format ID
154 * "preference" (optional, int) - quality of the image
155 * "width" (optional, int)
156 * "height" (optional, int)
* "resolution" (optional, string "{width}x{height}",
159 thumbnail: Full URL to a video thumbnail image.
160 description: Full video description.
161 uploader: Full name of the video uploader.
162 license: License name the video is licensed under.
163 creator: The main artist who created the video.
164 release_date: The date (YYYYMMDD) when the video was released.
165 timestamp: UNIX timestamp of the moment the video became available.
166 upload_date: Video upload date (YYYYMMDD).
167 If not explicitly set, calculated from timestamp.
168 uploader_id: Nickname or id of the video uploader.
169 uploader_url: Full URL to a personal webpage of the video uploader.
170 location: Physical location where the video was filmed.
171 subtitles: The available subtitles as a dictionary in the format
172 {language: subformats}. "subformats" is a list sorted from
173 lower to higher preference, each element is a dictionary
174 with the "ext" entry and one of:
175 * "data": The subtitles file contents
176 * "url": A URL pointing to the subtitles file
177 "ext" will be calculated from URL if missing
178 automatic_captions: Like 'subtitles', used by the YoutubeIE for
179 automatically generated captions
180 duration: Length of the video in seconds, as an integer or float.
181 view_count: How many users have watched the video on the platform.
182 like_count: Number of positive ratings of the video
183 dislike_count: Number of negative ratings of the video
184 repost_count: Number of reposts of the video
average_rating: Average rating given by users, the scale used depends on the webpage
186 comment_count: Number of comments on the video
187 comments: A list of comments, each with one or more of the following
188 properties (all but one of text or html optional):
189 * "author" - human-readable name of the comment author
190 * "author_id" - user ID of the comment author
192 * "html" - Comment as HTML
193 * "text" - Plain text of the comment
194 * "timestamp" - UNIX timestamp of comment
195 * "parent" - ID of the comment this one is replying to.
196 Set to "root" to indicate that this is a
197 comment to the original video.
198 age_limit: Age restriction for the video, as an integer (years)
199 webpage_url: The URL to the video webpage, if given to youtube-dl it
200 should allow to get the same result again. (It will be set
201 by YoutubeDL if it's missing)
202 categories: A list of categories that the video falls in, for example
204 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
205 is_live: True, False, or None (=unknown). Whether this video is a
206 live stream that goes on instead of a fixed-length video.
207 start_time: Time in seconds where the reproduction should start, as
208 specified in the URL.
209 end_time: Time in seconds where the reproduction should end, as
210 specified in the URL.
212 The following fields should only be used when the video belongs to some logical
215 chapter: Name or title of the chapter the video belongs to.
216 chapter_number: Number of the chapter the video belongs to, as an integer.
217 chapter_id: Id of the chapter the video belongs to, as a unicode string.
219 The following fields should only be used when the video is an episode of some
222 series: Title of the series or programme the video episode belongs to.
223 season: Title of the season the video episode belongs to.
224 season_number: Number of the season the video episode belongs to, as an integer.
225 season_id: Id of the season the video episode belongs to, as a unicode string.
226 episode: Title of the video episode. Unlike mandatory video title field,
227 this field should denote the exact title of the video episode
228 without any kind of decoration.
229 episode_number: Number of the video episode within a season, as an integer.
230 episode_id: Id of the video episode, as a unicode string.
232 Unless mentioned otherwise, the fields should be Unicode strings.
234 Unless mentioned otherwise, None is equivalent to absence of information.
237 _type "playlist" indicates multiple videos.
238 There must be a key "entries", which is a list, an iterable, or a PagedList
239 object, each element of which is a valid dictionary by this specification.
241 Additionally, playlists can have "title", "description" and "id" attributes
242 with the same semantics as videos (see above).
245 _type "multi_video" indicates that there are multiple videos that
form a single show, for example multiple acts of an opera or TV episode.
247 It must have an entries key like a playlist and contain all the keys
248 required for a video at the same time.
251 _type "url" indicates that the video must be extracted from another
252 location, possibly by a different extractor. Its only required key is:
253 "url" - the next URL to extract.
254 The key "ie_key" can be set to the class name (minus the trailing "IE",
255 e.g. "Youtube") if the extractor class is known in advance.
256 Additionally, the dictionary may have any properties of the resolved entity
257 known in advance, for example "title" if the title of the referred video is
261 _type "url_transparent" entities have the same specification as "url", but
262 indicate that the given additional information is more precise than the one
263 associated with the resolved URL.
264 This is useful when a site employs a video service that hosts the video and
265 its technical metadata, but that video service does not embed a useful
266 title, description etc.
269 Subclasses of this one should re-define the _real_initialize() and
270 _real_extract() methods and define a _VALID_URL regexp.
271 Probably, they should also be added to the list of extractors.
273 Finally, the _WORKING attribute should be set to False for broken IEs
274 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        downloader: the owning YoutubeDL-style object used for all network
        access and console output, or None (it can be set later through
        set_downloader()).
        """
        # NOTE(review): an instance-state initialization line between the
        # docstring and this call is not visible in this chunk.
        self.set_downloader(downloader)
287 def suitable(cls, url):
288 """Receives a URL and returns True if suitable for this IE."""
290 # This does not use has/getattr intentionally - we want to know whether
291 # we have cached the regexp for *this* class, whereas getattr would also
292 # match the superclass
293 if '_VALID_URL_RE' not in cls.__dict__:
294 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
295 return cls._VALID_URL_RE.match(url) is not None
    def _match_id(cls, url):
        """Apply the cached _VALID_URL regexp to *url*."""
        # Same per-class caching discipline as suitable(): check cls.__dict__
        # directly so a subclass never reuses its parent's compiled pattern.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        # NOTE(review): the lines that extract and return the id group from
        # `m` are not visible in this chunk.
307 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard against repeated initialization is presumably
        # present here but not visible in this chunk -- confirm before editing.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the `try:` opening this handler chain (and possibly an
        # initialization call) is not visible in this chunk.
            return self._real_extract(url)
        # ExtractorError raised by the extractor itself is propagated as-is.
        except ExtractorError:
        # Network-level truncation is wrapped as an "expected" error so the
        # user is not asked to file a bug report for it.
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        # Common symptoms of a broken extractor become an ExtractorError too.
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        downloader: the owning YoutubeDL-style object (or None) through which
        this extractor performs network access and console output.
        """
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        The base implementation is a no-op; subclasses override it to perform
        login or other one-time setup.
        """
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Must return an info dict (or playlist/url result) as described in the
        class docstring.
        """
342 """A string for getting the InfoExtractor with get_info_extractor"""
343 return compat_str(cls.__name__[:-2])
347 return compat_str(type(self).__name__[:-2])
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
        """ Returns the response handle

        note=None prints the default "Downloading webpage" message;
        note=False suppresses all screen output.
        NOTE(review): several control-flow lines of this method are not
        visible in this chunk; the fragments below are kept verbatim.
        """
            self.report_download_webpage(video_id)
        elif note is not False:
                # No video_id available: print the note on its own.
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))
        # data, headers and query params will be ignored for `Request` objects
        if isinstance(url_or_request, compat_str):
                url_or_request = update_url_query(url_or_request, query)
            # Plain string URLs are wrapped into a sanitized Request carrying
            # the supplied POST data and headers.
            url_or_request = sanitized_Request(url_or_request, data, headers or {})
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
                # Fatal failures re-raise with the original traceback attached.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            # Non-fatal failures only warn; the return value for this path is
            # outside this view -- presumably False, TODO confirm.
            self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
        # NOTE(review): the short-circuit handling for a failed (non-fatal)
        # request between these two calls is not visible in this chunk.
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
    def _guess_encoding_from_content(content_type, webpage_bytes):
        """Guess the text encoding of *webpage_bytes*.

        Tried in order: the charset parameter of the Content-Type header, a
        <meta charset> declaration near the top of the document, then a
        UTF-16 LE byte-order mark. NOTE(review): the branch structure and the
        fallback/return lines are partly outside this view.
        """
        # 1) charset from the HTTP Content-Type header
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # 2) <meta ... charset=...> within the first 1 KiB of the document
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
        # 3) UTF-16 little-endian byte-order mark
        elif webpage_bytes.startswith(b'\xff\xfe'):
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of *urlh* and decode it to text.

        Optionally dumps/saves the raw page (debug options) and raises an
        "expected" ExtractorError for two known censorship/filtering block
        pages. NOTE(review): several lines of this method are not visible in
        this chunk; the fragments below are kept verbatim.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
            # Encoding not supplied by the caller: sniff it from the content.
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            # base64 keeps binary-ish payloads printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
                # Overlong names are truncated and made unique via an md5 tail.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

            content = webpage_bytes.decode(encoding, 'replace')
            # Unknown codec name: fall back to UTF-8 with replacement chars.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Known block page emitted by Websense filtering appliances.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Known block page used for Indian government censorship.
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
        """ Returns the data of the page as a string

        Retries up to *tries* times on truncated reads, sleeping *timeout*
        seconds between attempts. NOTE(review): the loop bookkeeping
        (success/try_count) and the final return are outside this view.
        """
        while success is False:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
            # Truncated responses are retried rather than failing immediately.
            except compat_http_client.IncompleteRead as e:
                if try_count >= tries:
                self._sleep(timeout, video_id)
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
        """Return the xml as an xml.etree.ElementTree.Element

        transform_source, when given, is applied to the raw document before
        parsing. NOTE(review): the early-return on download failure and the
        transform_source guard are not visible in this chunk.
        """
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
        if xml_string is False:
            xml_string = transform_source(xml_string)
        # Parse from bytes so the XML declaration's encoding cannot conflict.
        return compat_etree_fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None, data=None, headers=None, query=None):
        """Download a document and parse it as JSON (see _parse_json)."""
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query)
        # Non-fatal download failure short-circuits; the return value for this
        # path is outside this view -- presumably None, TODO confirm.
        if (not fatal) and json_string is False:
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        """Parse *json_string*, optionally transformed first.

        On a parse error, raises an ExtractorError when fatal, otherwise only
        warns. NOTE(review): the transform_source guard, the `try:` line and
        the fatal/non-fatal branch keywords are not visible in this chunk.
        """
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
                raise ExtractorError(errmsg, cause=ve)
                self.report_warning(errmsg + str(ve))
528 def report_warning(self, msg, video_id=None):
529 idstr = '' if video_id is None else '%s: ' % video_id
530 self._downloader.report_warning(
531 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
533 def to_screen(self, msg):
534 """Print msg to screen, prefixing it with '[ie_name]'"""
535 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
537 def report_extraction(self, id_or_name):
538 """Report information extraction."""
539 self.to_screen('%s: Extracting information' % id_or_name)
541 def report_download_webpage(self, video_id):
542 """Report webpage download."""
543 self.to_screen('%s: Downloading webpage' % video_id)
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        # Status line only; the actual confirmation is done by the caller.
        self.to_screen('Confirming age')
    def report_login(self):
        """Report attempt to log in."""
        # Status line only; the actual login is done by the caller.
        self.to_screen('Logging in')
    def raise_login_required(msg='This video is only available for registered users'):
        """Abort extraction, telling the user account credentials are required."""
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            # NOTE(review): the remaining argument(s) closing this call are
            # not visible in this chunk.
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        """Abort extraction, telling the user the video is geo-blocked."""
        raise ExtractorError(
            '%s. You might want to use --proxy to workaround.' % msg,
            # NOTE(review): the remaining argument(s) closing this call are
            # not visible in this chunk.
565 # Methods for following #608
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed

        Builds a "_type": "url" info dict; id/title are attached only when
        provided. NOTE(review): the continuation of the dict literal and the
        return statement are not visible in this chunk.
        """
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist

        Builds a "_type": "playlist" info dict; id/title/description are
        attached only when truthy. NOTE(review): the continuation of the dict
        literal, two guard lines and the return are not visible in this chunk.
        """
        video_info = {'_type': 'playlist',
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.

        NOTE(review): several branch lines of this method are not visible in
        this chunk; the fragments below are kept verbatim.
        """
        # A single pattern (string or precompiled) is searched directly ...
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
                # ... a list of patterns is tried one by one.
                mobj = re.search(p, string, flags)

        # Colorize the field name for error output on capable terminals.
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name

            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
            # Explicit group requested by the caller.
            return mobj.group(group)
        elif default is not NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.

        NOTE(review): the guard on a missing/empty result is not visible in
        this chunk.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)

        NOTE(review): the early return, local defaults and parts of the netrc
        branch are not visible in this chunk.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username') is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    # Machine entry exists but has no credentials.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            # A broken or missing .netrc only warns; it never aborts.
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))

        return (username, password)
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None

        NOTE(review): the early-return line after the downloader check is not
        visible in this chunk.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Prefer the value passed via the --twofactor command line option.
        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        # Otherwise prompt interactively (input is hidden like a password).
        return compat_getpass('Type %s and press [Return]: ' % note)
682 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        """Build regexes matching the OpenGraph <meta> tag for *prop*.

        Two orderings are produced because the property and content
        attributes may appear in either order inside the tag.
        NOTE(review): the `return [` / `]` lines wrapping the two template
        expansions are not visible in this chunk.
        """
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),
    def _meta_regex(prop):
        """Build a verbose regex matching a <meta> tag for *prop*, capturing
        its content attribute in the 'content' named group.

        The lookahead accepts the property name in any of the usual attribute
        slots (itemprop/name/property/id/http-equiv), optionally quoted.
        """
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search *html* for the OpenGraph property *prop* and return its
        unescaped value.

        NOTE(review): the default-name guard and the None short-circuit
        around `escaped` are not visible in this chunk.
        """
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)
708 def _og_search_thumbnail(self, html, **kargs):
709 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
711 def _og_search_description(self, html, **kargs):
712 return self._og_search_property('description', html, fatal=False, **kargs)
714 def _og_search_title(self, html, **kargs):
715 return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Return the OpenGraph video URL from *html*.

        NOTE(review): the guard that prepends the video:secure_url regexes
        (presumably `if secure:`) is not visible in this chunk.
        """
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
723 def _og_search_url(self, html, **kargs):
724 return self._og_search_property('url', html, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """Return the content attribute of the <meta> tag *name* in *html*.

        NOTE(review): the assignment that defaults display_name to name is
        not visible in this chunk.
        """
        if display_name is None:
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
733 def _dc_search_uploader(self, html):
734 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        """Detect the RTA (Restricted To Adults) label in *html*.

        NOTE(review): the closing argument of re.search and the return
        statements (age-limit values) are not visible in this chunk.
        """
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        """Map the page's ICRA-style rating <meta> tag to an age limit.

        NOTE(review): the RATING_TABLE mapping and the None guard are not
        visible in this chunk.
        """
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        return RATING_TABLE.get(rating.lower())
    def _family_friendly_search(self, html):
        """Map the schema.org isFamilyFriendly <meta> tag to an age limit.

        NOTE(review): the RATING_TABLE mapping and the missing-tag return are
        not visible in this chunk.
        """
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
        return RATING_TABLE.get(family_friendly.lower())
775 def _twitter_search_player(self, html):
776 return self._html_search_meta('twitter:player', html,
777 'twitter card player')
    def _search_json_ld(self, html, video_id, **kwargs):
        """Find an application/ld+json <script> block in *html* and parse it.

        NOTE(review): the guard between the regex search and the _json_ld
        call is not visible in this chunk.
        """
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
    def _json_ld(self, json_ld, video_id, fatal=True):
        """Convert a JSON-LD object (or its string form) into info-dict fields.

        Currently recognizes the schema.org TVEpisode and Article types.
        NOTE(review): the `info = {...}` construction lines are not visible
        in this chunk; the field fragments below are kept verbatim.
        """
        # Accept either an already-parsed dict or the raw JSON string.
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if json_ld.get('@context') == 'http://schema.org':
            item_type = json_ld.get('@type')
            if item_type == 'TVEpisode':
                    'episode': unescapeHTML(json_ld.get('name')),
                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
                    'description': unescapeHTML(json_ld.get('description')),
                part_of_season = json_ld.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = json_ld.get('partOfSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
                    'title': unescapeHTML(json_ld.get('headline')),
                    'description': unescapeHTML(json_ld.get('articleBody')),
        # Drop fields that stayed None so they don't mask other sources.
        return dict((k, v) for k, v in info.items() if v is not None)
    def _hidden_inputs(html):
        """Collect name -> value pairs of hidden/submit <input> tags in *html*.

        NOTE(review): the accumulator initialization, the continue guards and
        the return are not visible in this chunk.
        """
        # Strip HTML comments first so commented-out forms are ignored.
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        for input in re.findall(r'(?i)<input([^>]+)>', html):
            # Only hidden/submit inputs are of interest.
            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
            hidden_inputs[name.group('value')] = value.group('value')
831 def _form_hidden_inputs(self, form_id, html):
832 form = self._search_regex(
833 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
834 html, '%s form' % form_id, group='form')
835 return self._hidden_inputs(form)
    def _sort_formats(self, formats, field_preference=None):
        """Sort *formats* in place from worst to best quality.

        field_preference, when given as a list/tuple, overrides the default
        multi-criteria key with a plain field lookup. NOTE(review): many
        structural lines of this method (guards, try/except, else branches,
        the inner key function's def line and parts of the key tuple) are not
        visible in this chunk; the fragments below are kept verbatim.
        """
            raise ExtractorError('No video formats found')

            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering wins over everything else.
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                if f.get('ext') in ['f4f', 'f4m']: # Not yet supported

            # HTTP(S) delivery is slightly preferred over other protocols.
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            if f.get('vcodec') == 'none': # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if f.get('acodec') == 'none': # video only
                    if self._downloader.params.get('prefer_free_formats'):
                        ORDER = ['flv', 'mp4', 'webm']
                        ORDER = ['webm', 'flv', 'mp4']
                        ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = 0

                # Missing numeric fields sort as -1 so unknown values lose.
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        """Filter *formats*, dropping entries whose URL fails validation.

        NOTE(review): the surrounding filter(...) call and the URL argument
        line of _is_valid_url are not visible in this chunk.
        """
                lambda f: self._is_valid_url(
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
    def _remove_duplicate_formats(formats):
        """Drop formats whose URL was already seen, keeping the first
        occurrence and preserving order.

        NOTE(review): the initialization of format_urls/unique_formats and the
        loop header are not visible in this chunk.
        """
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        # Mutate in place so callers holding a reference see the result.
        formats[:] = unique_formats
    def _is_valid_url(self, url, video_id, item='video'):
        """Probe *url* with a request and report whether it is reachable.

        NOTE(review): the `return True` short-circuits, the `try:` line and
        the trailing return/raise lines are not visible in this chunk.
        """
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
        except ExtractorError as e:
            # Only plain network failures mark the URL invalid; other
            # extraction errors propagate.
            if isinstance(e.cause, compat_urllib_error.URLError):
                    '%s: %s URL is invalid, skipping' % (video_id, item))
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # NOTE(review): the return expression wrapping this conditional (the
        # 'http:'/'https:' literals) is not visible in this chunk.
            if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        """Prefix a protocol-relative URL ("//host/...") with a scheme.

        scheme defaults to the user's preferred scheme (see http_scheme()).
        NOTE(review): the None guard, the scheme-is-None check and the return
        statements are not visible in this chunk.
        """
        if url.startswith('//'):
                scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        """Pause for *timeout* seconds, telling the user why.

        msg_template may use the %(video_id)s and %(timeout)s placeholders.
        NOTE(review): the screen-output and actual sleep lines are not
        visible in this chunk.
        """
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
        """Download an f4m manifest and parse it into a list of formats.

        NOTE(review): the closing parameters of the signature and of the
        _download_xml call, plus the failure early-return, are not visible in
        this chunk.
        """
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,

        if manifest is False:

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal)
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
        """Turn a parsed f4m manifest element into a sorted list of formats.

        Handles both the 1.0 and 2.0 f4m namespaces and recurses into media
        entries that are themselves f4m manifests. NOTE(review): several
        structural lines (signature tail, version check, guards, the
        formats.append({ opener and parts of the dict) are not visible in
        this chunk; the fragments below are kept verbatim.
        """
        # Default to the 1.0 namespace; switch to 2.0 when 1.0 yields nothing.
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
            base_url = base_url.strip()
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                    # Relative URLs are resolved against baseURL, falling back
                    # to the manifest's own directory.
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal))
            tbr = int_or_none(media_el.attrib.get('bitrate'))
                # Format id falls back to the positional index when no bitrate.
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
        self._sort_formats(formats)
# Download an HLS (m3u8) playlist and turn it into format dicts.
# A synthetic "meta" format pointing at the master playlist is emitted
# first (with reduced preference) so format selection can fall back to it.
1028 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1029 entry_protocol='m3u8', preference=None,
1030 m3u8_id=None, note=None, errnote=None,
1034 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1038 'preference': preference - 1 if preference else -1,
1039 'resolution': 'multiple',
1040 'format_note': 'Quality selection URL',
# Resolve playlist-relative entries against the (possibly redirected) URL.
1043 format_url = lambda u: (
1045 if re.match(r'^https?://', u)
1046 else compat_urlparse.urljoin(m3u8_url, u))
1048 res = self._download_webpage_handle(
1050 note=note or 'Downloading m3u8 information',
1051 errnote=errnote or 'Failed to download m3u8 information',
1055 m3u8_doc, urlh = res
# Use the final URL after redirects as the join base.
1056 m3u8_url = urlh.geturl()
1058 # We should try extracting formats only from master playlists [1], i.e.
1059 # playlists that describe available qualities. On the other hand media
1060 # playlists [2] should be returned as is since they contain just the media
1061 # without qualities renditions.
1062 # Fortunately, master playlist can be easily distinguished from media
1063 # playlist based on particular tags availability. As of [1, 2] master
1064 # playlist tags MUST NOT appear in a media playist and vice versa.
1065 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1066 # and MUST NOT appear in master playlist thus we can clearly detect media
1067 # playlist with this criterion.
1068 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1069 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1070 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1071 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1074 'format_id': m3u8_id,
1076 'protocol': entry_protocol,
1077 'preference': preference,
# Attribute-list parser for #EXT-X-* tag lines: key=value pairs, values
# optionally double-quoted.
1081 kv_rex = re.compile(
1082 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1083 for line in m3u8_doc.splitlines():
1084 if line.startswith('#EXT-X-STREAM-INF:'):
1086 for m in kv_rex.finditer(line):
1088 if v.startswith('"'):
1090 last_info[m.group('key')] = v
1091 elif line.startswith('#EXT-X-MEDIA:'):
1093 for m in kv_rex.finditer(line):
1095 if v.startswith('"'):
1097 last_media[m.group('key')] = v
1098 elif line.startswith('#') or not line.strip():
# Non-comment, non-blank line: a variant stream URI for the preceding
# #EXT-X-STREAM-INF attributes (if any).
1101 if last_info is None:
1102 formats.append({'url': format_url(line)})
1104 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1107 format_id.append(m3u8_id)
# Prefer the rendition NAME for the id (unless it's a subtitle track),
# else the bitrate, else a running index.
1108 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1109 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1111 'format_id': '-'.join(format_id),
1112 'url': format_url(line.strip()),
1115 'protocol': entry_protocol,
1116 'preference': preference,
1118 resolution = last_info.get('RESOLUTION')
1120 width_str, height_str = resolution.split('x')
1121 f['width'] = int(width_str)
1122 f['height'] = int(height_str)
1123 codecs = last_info.get('CODECS')
1125 vcodec, acodec = [None] * 2
1126 va_codecs = codecs.split(',')
1127 if len(va_codecs) == 1:
1128 # Audio only entries usually come with single codec and
1129 # no resolution. For more robustness we also check it to
1131 if not resolution and va_codecs[0].startswith('mp4a'):
1132 vcodec, acodec = 'none', va_codecs[0]
1134 vcodec = va_codecs[0]
1136 vcodec, acodec = va_codecs[:2]
1141 if last_media is not None:
1142 f['m3u8_media'] = last_media
1146 self._sort_formats(formats)
# Qualify each component of an XPath with the given namespace
# ('a/b' -> '{ns}a/{ns}b'); empty components and '.' pass through
# unqualified. NOTE(review): no self parameter — presumably decorated
# @staticmethod on a line not shown in this listing; confirm in full source.
1150 def _xpath_ns(path, namespace=None):
1154 for c in path.split('/'):
1155 if not c or c == '.':
1158 out.append('{%s}%s' % (namespace, c))
1159 return '/'.join(out)
# Download a SMIL document and return only its formats (no metadata).
1161 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1162 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1168 namespace = self._parse_smil_namespace(smil)
1170 return self._parse_smil_formats(
1171 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
# Download a SMIL document and return a full info dict (formats plus
# title/description/thumbnails metadata) via _parse_smil.
1173 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1174 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1177 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch the SMIL document at smil_url and return it parsed as XML."""
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)
# Build a full info dict from a parsed SMIL document: formats, subtitles,
# and <head>/<meta> metadata (title, description/abstract, date), plus
# <image> thumbnails. The video id defaults to the SMIL file's basename.
1184 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1185 namespace = self._parse_smil_namespace(smil)
1187 formats = self._parse_smil_formats(
1188 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1189 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1191 video_id = os.path.splitext(url_basename(smil_url))[0]
1195 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1196 name = meta.attrib.get('name')
1197 content = meta.attrib.get('content')
1198 if not name or not content:
# First non-empty value wins for each metadata field.
1200 if not title and name == 'title':
1202 elif not description and name in ('description', 'abstract'):
1203 description = content
1204 elif not upload_date and name == 'date':
1205 upload_date = unified_strdate(content)
1208 'id': image.get('type'),
1209 'url': image.get('src'),
1210 'width': int_or_none(image.get('width')),
1211 'height': int_or_none(image.get('height')),
1212 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1216 'title': title or video_id,
1217 'description': description,
1218 'upload_date': upload_date,
1219 'thumbnails': thumbnails,
1221 'subtitles': subtitles,
def _parse_smil_namespace(self, smil):
    """Return the XML namespace URI of the root <smil> element, or None."""
    root_tag = smil.tag
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)
# Extract format dicts from a SMIL document's <video> nodes. Handles
# RTMP streams, HLS (m3u8) and HDS (f4m) sub-manifests, and plain HTTP
# progressive downloads; duplicate src values are skipped.
1228 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
# Base URL comes from <meta base=...> or <meta httpBase=...>.
1230 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1231 b = meta.get('base') or meta.get('httpBase')
1242 videos = smil.findall(self._xpath_ns('.//video', namespace))
1243 for video in videos:
1244 src = video.get('src')
1245 if not src or src in srcs:
1249 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1250 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1251 width = int_or_none(video.get('width'))
1252 height = int_or_none(video.get('height'))
1253 proto = video.get('proto')
1254 ext = video.get('ext')
1255 src_ext = determine_ext(src)
1256 streamer = video.get('streamer') or base
# RTMP: detected from explicit proto or an rtmp:// streamer/base.
1258 if proto == 'rtmp' or streamer.startswith('rtmp'):
1264 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1266 'filesize': filesize,
# Optional caller hook rewrites streamer/src (e.g. akamai auth tokens).
1270 if transform_rtmp_url:
1271 streamer, src = transform_rtmp_url(streamer, src)
1272 formats[-1].update({
1278 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1279 src_url = src_url.strip()
1281 if proto == 'm3u8' or src_ext == 'm3u8':
1282 m3u8_formats = self._extract_m3u8_formats(
1283 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
# A single-entry m3u8 result is the media playlist itself; tag it with
# this node's bitrate so it sorts correctly.
1284 if len(m3u8_formats) == 1:
1286 m3u8_formats[0].update({
1287 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1292 formats.extend(m3u8_formats)
1295 if src_ext == 'f4m':
1300 'plugin': 'flowplayer-3.2.0.1',
1302 f4m_url += '&' if '?' in f4m_url else '?'
1303 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1304 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
# Plain HTTP download: validate reachability before emitting.
1307 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1311 'ext': ext or src_ext or 'flv',
1312 'format_id': 'http-%d' % (bitrate or http_count),
1314 'filesize': filesize,
1320 self._sort_formats(formats)
# Collect subtitle entries from SMIL <textstream> nodes, grouped by
# language (several system* attributes tried, falling back to
# subtitles_lang); duplicate src URLs are skipped.
1324 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1327 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1328 src = textstream.get('src')
1329 if not src or src in urls:
1332 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1333 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1334 subtitles.setdefault(lang, []).append({
def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
    """Download an XSPF playlist and parse it into entries.

    Returns an empty list when the download fails and fatal is False.
    """
    xspf = self._download_xml(
        # Fix: note string previously read 'Downloading xpsf playlist' (typo).
        playlist_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    # _download_xml returns False (not None) on non-fatal failure.
    if xspf is False:
        return []
    return self._parse_xspf(xspf, playlist_id)
# Parse an XSPF playlist document into entry dicts. Uses the xspf core
# namespace plus StreamOne's s1 extension attributes (label/width/height)
# for per-<location> format info.
1348 def _parse_xspf(self, playlist, playlist_id):
1350 'xspf': 'http://xspf.org/ns/0/',
1351 's1': 'http://static.streamone.nl/player/ns/0',
1355 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1357 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1358 description = xpath_text(
1359 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1360 thumbnail = xpath_text(
1361 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
# Duration is given in milliseconds; scale to seconds.
1362 duration = float_or_none(
1363 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1366 'url': location.text,
1367 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1368 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1369 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1370 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1371 self._sort_formats(formats)
1376 'description': description,
1377 'thumbnail': thumbnail,
1378 'duration': duration,
# Download a DASH MPD manifest and delegate parsing to _parse_mpd_formats.
# NOTE(review): formats_dict={} is a mutable default argument; it is only
# read (via .get().copy()) in _parse_mpd_formats, so it is benign today,
# but a None sentinel would be safer.
1383 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1384 res = self._download_webpage_handle(
1386 note=note or 'Downloading MPD manifest',
1387 errnote=errnote or 'Failed to download MPD manifest',
# Base URL for relative segment references: scheme://host/.../ of the
# final (post-redirect) manifest URL.
1392 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1394 return self._parse_mpd_formats(
1395 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
# Parse a DASH MPD document into format dicts. DRM-protected adaptation
# sets/representations are skipped; segment information (SegmentList /
# SegmentTemplate) is inherited Period -> AdaptationSet -> Representation.
# NOTE(review): formats_dict={} mutable default — read-only here, but fragile.
1397 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
# Live ('dynamic') manifests are not supported by this parser.
1398 if mpd_doc.get('type') == 'dynamic':
1401 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1404 return self._xpath_ns(path, namespace)
1406 def is_drm_protected(element):
1407 return element.find(_add_ns('ContentProtection')) is not None
# Merge multi-segment info from an element over its parent's info.
1409 def extract_multisegment_info(element, ms_parent_info):
1410 ms_info = ms_parent_info.copy()
1411 segment_list = element.find(_add_ns('SegmentList'))
1412 if segment_list is not None:
1413 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1415 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1416 initialization = segment_list.find(_add_ns('Initialization'))
1417 if initialization is not None:
1418 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1420 segment_template = element.find(_add_ns('SegmentTemplate'))
1421 if segment_template is not None:
1422 start_number = segment_template.get('startNumber')
1424 ms_info['start_number'] = int(start_number)
1425 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1426 if segment_timeline is not None:
1427 s_e = segment_timeline.findall(_add_ns('S'))
1429 ms_info['total_number'] = 0
# Each <S> contributes 1 segment plus its repeat count @r.
1431 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1433 timescale = segment_template.get('timescale')
1435 ms_info['timescale'] = int(timescale)
1436 segment_duration = segment_template.get('duration')
1437 if segment_duration:
1438 ms_info['segment_duration'] = int(segment_duration)
1439 media_template = segment_template.get('media')
1441 ms_info['media_template'] = media_template
1442 initialization = segment_template.get('initialization')
1444 ms_info['initialization_url'] = initialization
1446 initialization = segment_template.find(_add_ns('Initialization'))
1447 if initialization is not None:
1448 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1451 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1453 for period in mpd_doc.findall(_add_ns('Period')):
1454 period_duration = parse_duration(period.get('duration')) or mpd_duration
1455 period_ms_info = extract_multisegment_info(period, {
1459 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1460 if is_drm_protected(adaptation_set):
1462 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1463 for representation in adaptation_set.findall(_add_ns('Representation')):
1464 if is_drm_protected(representation):
# Representation attributes override inherited AdaptationSet attributes.
1466 representation_attrib = adaptation_set.attrib.copy()
1467 representation_attrib.update(representation.attrib)
1468 # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1469 mime_type = representation_attrib['mimeType']
1470 content_type = mime_type.split('/')[0]
1471 if content_type == 'text':
1472 # TODO implement WebVTT downloading
1474 elif content_type == 'video' or content_type == 'audio':
# Accumulate BaseURL text from the innermost element outward, stopping
# once an absolute URL is formed.
1476 for element in (representation, adaptation_set, period, mpd_doc):
1477 base_url_e = element.find(_add_ns('BaseURL'))
1478 if base_url_e is not None:
1479 base_url = base_url_e.text + base_url
1480 if re.match(r'^https?://', base_url):
1482 if mpd_base_url and not re.match(r'^https?://', base_url):
1483 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1485 base_url = mpd_base_url + base_url
1486 representation_id = representation_attrib.get('id')
1487 lang = representation_attrib.get('lang')
1488 url_el = representation.find(_add_ns('BaseURL'))
1489 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1491 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1493 'ext': mimetype2ext(mime_type),
1494 'width': int_or_none(representation_attrib.get('width')),
1495 'height': int_or_none(representation_attrib.get('height')),
1496 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1497 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1498 'fps': int_or_none(representation_attrib.get('frameRate')),
1499 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1500 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1501 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1502 'format_note': 'DASH %s' % content_type,
1503 'filesize': filesize,
1505 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1506 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
# NOTE(review): "and 'segment_duration'" is a constant truthy string, so
# this condition is just "'total_number' not in ..."; it was likely meant
# to be "and 'segment_duration' in representation_ms_info" — as written,
# a manifest without @duration can raise KeyError on the next line.
1507 if 'total_number' not in representation_ms_info and 'segment_duration':
1508 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1509 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1510 media_template = representation_ms_info['media_template']
1511 media_template = media_template.replace('$RepresentationID$', representation_id)
1512 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
# NOTE(review): str.replace returns a new string; this result is
# discarded, so "$$" escapes are never collapsed to "$". Likely should be
# media_template = media_template.replace('$$', '$').
1513 media_template.replace('$$', '$')
1514 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1515 if 'segment_urls' in representation_ms_info:
1517 'segment_urls': representation_ms_info['segment_urls'],
1518 'protocol': 'http_dash_segments',
1520 if 'initialization_url' in representation_ms_info:
1521 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1523 'initialization_url': initialization_url,
1525 if not f.get('url'):
1526 f['url'] = initialization_url
# Merge with an already-emitted format of the same representation id
# (e.g. from formats_dict) instead of duplicating it.
1528 existing_format = next(
1529 fo for fo in formats
1530 if fo['format_id'] == representation_id)
1531 except StopIteration:
1532 full_info = formats_dict.get(representation_id, {}).copy()
1534 formats.append(full_info)
1536 existing_format.update(f)
1538 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1539 self._sort_formats(formats)
1542 def _live_title(self, name):
1543 """ Generate the title for a live video """
1544 now = datetime.datetime.now()
1545 now_str = now.strftime('%Y-%m-%d %H:%M')
1546 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    """Parse *v* as an integer via int_or_none.

    Extra kwargs (scale, get_attr, ...) are forwarded to int_or_none.
    On failure, raises ExtractorError when fatal is True, otherwise emits
    a warning and returns None.
    """
    # Fix: removed a leftover debug statement that printed
    # getattr(v, kwargs['get_attr']) to stdout whenever get_attr was passed;
    # int_or_none already handles the get_attr kwarg itself.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
# Parse v as a float via float_or_none (extra kwargs forwarded); on
# failure either raise ExtractorError (fatal=True) or warn and return
# None. Mirrors the _int helper above.
1560 def _float(self, v, name, fatal=False, **kwargs):
1561 res = float_or_none(v, **kwargs)
1563 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1565 raise ExtractorError(msg)
1567 self._downloader.report_warning(msg)
def _set_cookie(self, domain, name, value, expire_time=None):
    """Insert a cookie (path '/', secure) into the downloader's cookie jar."""
    # Cookie() takes positional fields: version, name, value, port,
    # port_specified, domain, domain_specified, domain_initial_dot, path,
    # path_specified, secure, expires, discard, comment, comment_url, rest.
    self._downloader.cookiejar.set_cookie(compat_cookiejar.Cookie(
        0, name, value, None, None, domain, None,
        None, '/', True, False, expire_time, '', None, None, None))
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    request = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
# Yield this extractor's test cases, normalizing the single _TEST dict
# and the _TESTS list into one stream; only_matching tests are filtered
# out unless requested. Each test gets a 'name' derived from the class
# name with the 'IE' suffix stripped.
1582 def get_testcases(self, include_onlymatching=False):
1583 t = getattr(self, '_TEST', None)
# Defining both _TEST and _TESTS is an authoring error.
1585 assert not hasattr(self, '_TESTS'), \
1586 '%s has _TEST and _TESTS' % type(self).__name__
1589 tests = getattr(self, '_TESTS', [])
1591 if not include_onlymatching and t.get('only_matching', False):
1593 t['name'] = type(self).__name__[:-len('IE')]
1596 def is_suitable(self, age_limit):
1597 """ Test whether the extractor is generally suitable for the given
1598 age limit (i.e. pornographic sites are not, all others usually are) """
1600 any_restricted = False
1601 for tc in self.get_testcases(include_onlymatching=False):
# Playlist tests are judged by their first entry's info_dict.
1602 if 'playlist' in tc:
1603 tc = tc['playlist'][0]
1604 is_restricted = age_restricted(
1605 tc.get('info_dict', {}).get('age_limit'), age_limit)
1606 if not is_restricted:
1608 any_restricted = any_restricted or is_restricted
# Suitable only if no test case was age-restricted beyond the limit.
1609 return not any_restricted
# Public entry point: fetch subtitles only when the user asked to write
# or list them; delegates to the subclass hook _get_subtitles.
1611 def extract_subtitles(self, *args, **kwargs):
1612 if (self._downloader.params.get('writesubtitles', False) or
1613 self._downloader.params.get('listsubtitles')):
1614 return self._get_subtitles(*args, **kwargs)
# Subclass hook: return a subtitles dict for the video. Called only via
# extract_subtitles when the user requested subtitles.
1617 def _get_subtitles(self, *args, **kwargs):
1618 raise NotImplementedError('This method must be implemented by subclasses')
# NOTE(review): no self/cls parameter — presumably @staticmethod on a
# line not shown in this listing.
1621 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1622 """ Merge subtitle items for one language. Items with duplicated URLs
1623 will be dropped. """
1624 list1_urls = set([item['url'] for item in subtitle_list1])
1625 ret = list(subtitle_list1)
# Keep list1 entries as-is; append only list2 entries with new URLs.
1626 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
# NOTE(review): takes cls — presumably @classmethod on a line not shown
# in this listing.
1630 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1631 """ Merge two subtitle dictionaries, language by language. """
1632 ret = dict(subtitle_dict1)
1633 for lang in subtitle_dict2:
1634 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
# Public entry point: fetch automatic captions only when the user asked
# to write or list subtitles; delegates to _get_automatic_captions.
1637 def extract_automatic_captions(self, *args, **kwargs):
1638 if (self._downloader.params.get('writeautomaticsub', False) or
1639 self._downloader.params.get('listsubtitles')):
1640 return self._get_automatic_captions(*args, **kwargs)
# Subclass hook: return an automatic-captions dict for the video.
1643 def _get_automatic_captions(self, *args, **kwargs):
1644 raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
    """Mark the video watched if the user opted in via --mark-watched and
    is either logged in or supplied a cookie file."""
    params = self._downloader.params
    if not params.get('mark_watched', False):
        return
    # Login check first (short-circuit), then cookie file fallback —
    # same evaluation order as the original boolean expression.
    if self._get_login_info()[0] is not None or params.get('cookiefile') is not None:
        self._mark_watched(*args, **kwargs)
# Subclass hook invoked by mark_watched once the opt-in checks pass.
1652 def _mark_watched(self, *args, **kwargs):
1653 raise NotImplementedError('This method must be implemented by subclasses')
1656 class SearchInfoExtractor(InfoExtractor):
1658 Base class for paged search queries extractors.
1659 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1660 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Build the regex matching '<key>:<query>', '<key><n>:<query>' or
# '<key>all:<query>'. NOTE(review): takes cls — presumably @classmethod
# on a line not shown in this listing.
1664 def _make_valid_url(cls):
1665 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
# NOTE(review): also presumably @classmethod.
1668 def suitable(cls, url):
1669 return re.match(cls._make_valid_url(), url) is not None
# Dispatch on the prefix: empty -> 1 result, 'all' -> _MAX_RESULTS,
# numeric -> that many results, capped at _MAX_RESULTS with a warning.
1671 def _real_extract(self, query):
1672 mobj = re.match(self._make_valid_url(), query)
1674 raise ExtractorError('Invalid search query "%s"' % query)
1676 prefix = mobj.group('prefix')
1677 query = mobj.group('query')
1679 return self._get_n_results(query, 1)
1680 elif prefix == 'all':
1681 return self._get_n_results(query, self._MAX_RESULTS)
1685 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1686 elif n > self._MAX_RESULTS:
1687 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1688 n = self._MAX_RESULTS
1689 return self._get_n_results(query, n)
# Subclass hook performing the actual search.
1691 def _get_n_results(self, query, n):
1692 """Get a specified number of results for a query"""
1693 raise NotImplementedError('This method must be implemented by subclasses')
# NOTE(review): presumably decorated @property on a line not shown here.
1696 def SEARCH_KEY(self):
1697 return self._SEARCH_KEY