1 from __future__ import unicode_literals
15 from ..compat import (
24 compat_etree_fromstring,
53 class InfoExtractor(object):
54 """Information Extractor class.
56 Information extractors are the classes that, given a URL, extract
57 information about the video (or videos) the URL refers to. This
58 information includes the real video URL, the video title, author and
59 others. The information is stored in a dictionary which is then
60 passed to the YoutubeDL. The YoutubeDL processes this
61 information possibly downloading the video to the file system, among
62 other possible outcomes.
64 The type field determines the type of the result.
65 By far the most common value (and the default if _type is missing) is
66 "video", which indicates a single video.
68 For a video, the dictionaries must include the following fields:
71 title: Video title, unescaped.
73 Additionally, it must contain either a formats entry or a url one:
75 formats: A list of dictionaries for each format available, ordered
76 from worst to best quality.
79 * url Mandatory. The URL of the video file
80 * ext Will be calculated from URL if missing
81 * format A human-readable description of the format
82 ("mp4 container with h264/opus").
83 Calculated from the format_id, width, height,
84 and format_note fields if missing.
85 * format_id A short description of the format
86 ("mp4_h264_opus" or "19").
87 Technically optional, but strongly recommended.
88 * format_note Additional info about the format
89 ("3D" or "DASH video")
90 * width Width of the video, if known
91 * height Height of the video, if known
92 * resolution Textual description of width and height
93 * tbr Average bitrate of audio and video in KBit/s
94 * abr Average audio bitrate in KBit/s
95 * acodec Name of the audio codec in use
96 * asr Audio sampling rate in Hertz
97 * vbr Average video bitrate in KBit/s
99 * vcodec Name of the video codec in use
100 * container Name of the container format
101 * filesize The number of bytes, if known in advance
102 * filesize_approx An estimate for the number of bytes
103 * player_url SWF Player URL (used for rtmpdump).
104 * protocol The protocol that will be used for the actual
105 download, lower-case.
106 "http", "https", "rtsp", "rtmp", "rtmpe",
107 "m3u8", or "m3u8_native".
108 * preference Order number of this format. If this field is
109 present and not None, the formats get sorted
110 by this field, regardless of all other values.
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
113 < -1000 to hide the format (if there is
114 another one which is strictly better)
115 * language Language code, e.g. "de" or "en-US".
116 * language_preference Is this in the language mentioned in
118 10 if it's what the URL is about,
119 -1 for default (don't know),
120 -10 otherwise, other values reserved for now.
121 * quality Order number of the video quality of this
122 format, irrespective of the file format.
123 -1 for default (order by other properties),
124 -2 or smaller for less than default.
125 * source_preference Order number for this video source
126 (quality takes higher priority)
127 -1 for default (order by other properties),
128 -2 or smaller for less than default.
129 * http_headers A dictionary of additional HTTP headers
130 to add to the request.
131 * stretched_ratio If given and not 1, indicates that the
132 video's pixels are not square.
133 width : height ratio as float.
134 * no_resume The server does not support resuming the
135 (HTTP or RTMP) download. Boolean.
137 url: Final video URL.
138 ext: Video filename extension.
139 format: The video format, defaults to ext (used for --get-format)
140 player_url: SWF Player URL (used for rtmpdump).
142 The following fields are optional:
144 alt_title: A secondary title of the video.
145 display_id An alternative identifier for the video, not necessarily
146 unique, but available before title. Typically, id is
147 something like "4234987", title "Dancing naked mole rats",
148 and display_id "dancing-naked-mole-rats"
149 thumbnails: A list of dictionaries, with the following entries:
150 * "id" (optional, string) - Thumbnail format ID
152 * "preference" (optional, int) - quality of the image
153 * "width" (optional, int)
154 * "height" (optional, int)
155 * "resolution" (optional, string "{width}x{height}",
157 thumbnail: Full URL to a video thumbnail image.
158 description: Full video description.
159 uploader: Full name of the video uploader.
160 license: License name the video is licensed under.
161 creator: The main artist who created the video.
162 release_date: The date (YYYYMMDD) when the video was released.
163 timestamp: UNIX timestamp of the moment the video became available.
164 upload_date: Video upload date (YYYYMMDD).
165 If not explicitly set, calculated from timestamp.
166 uploader_id: Nickname or id of the video uploader.
167 uploader_url: Full URL to a personal webpage of the video uploader.
168 location: Physical location where the video was filmed.
169 subtitles: The available subtitles as a dictionary in the format
170 {language: subformats}. "subformats" is a list sorted from
171 lower to higher preference, each element is a dictionary
172 with the "ext" entry and one of:
173 * "data": The subtitles file contents
174 * "url": A URL pointing to the subtitles file
175 "ext" will be calculated from URL if missing
176 automatic_captions: Like 'subtitles', used by the YoutubeIE for
177 automatically generated captions
178 duration: Length of the video in seconds, as an integer or float.
179 view_count: How many users have watched the video on the platform.
180 like_count: Number of positive ratings of the video
181 dislike_count: Number of negative ratings of the video
182 repost_count: Number of reposts of the video
183 average_rating: Average rating given by users, the scale used depends on the webpage
184 comment_count: Number of comments on the video
185 comments: A list of comments, each with one or more of the following
186 properties (all but one of text or html optional):
187 * "author" - human-readable name of the comment author
188 * "author_id" - user ID of the comment author
190 * "html" - Comment as HTML
191 * "text" - Plain text of the comment
192 * "timestamp" - UNIX timestamp of comment
193 * "parent" - ID of the comment this one is replying to.
194 Set to "root" to indicate that this is a
195 comment to the original video.
196 age_limit: Age restriction for the video, as an integer (years)
197 webpage_url: The URL to the video webpage, if given to youtube-dl it
198 should allow to get the same result again. (It will be set
199 by YoutubeDL if it's missing)
200 categories: A list of categories that the video falls in, for example
202 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
203 is_live: True, False, or None (=unknown). Whether this video is a
204 live stream that goes on instead of a fixed-length video.
205 start_time: Time in seconds where the reproduction should start, as
206 specified in the URL.
207 end_time: Time in seconds where the reproduction should end, as
208 specified in the URL.
210 The following fields should only be used when the video belongs to some logical
213 chapter: Name or title of the chapter the video belongs to.
214 chapter_number: Number of the chapter the video belongs to, as an integer.
215 chapter_id: Id of the chapter the video belongs to, as a unicode string.
217 The following fields should only be used when the video is an episode of some
220 series: Title of the series or programme the video episode belongs to.
221 season: Title of the season the video episode belongs to.
222 season_number: Number of the season the video episode belongs to, as an integer.
223 season_id: Id of the season the video episode belongs to, as a unicode string.
224 episode: Title of the video episode. Unlike mandatory video title field,
225 this field should denote the exact title of the video episode
226 without any kind of decoration.
227 episode_number: Number of the video episode within a season, as an integer.
228 episode_id: Id of the video episode, as a unicode string.
230 Unless mentioned otherwise, the fields should be Unicode strings.
232 Unless mentioned otherwise, None is equivalent to absence of information.
235 _type "playlist" indicates multiple videos.
236 There must be a key "entries", which is a list, an iterable, or a PagedList
237 object, each element of which is a valid dictionary by this specification.
239 Additionally, playlists can have "title", "description" and "id" attributes
240 with the same semantics as videos (see above).
243 _type "multi_video" indicates that there are multiple videos that
244 form a single show, for example, multiple acts of an opera or TV episode.
245 It must have an entries key like a playlist and contain all the keys
246 required for a video at the same time.
249 _type "url" indicates that the video must be extracted from another
250 location, possibly by a different extractor. Its only required key is:
251 "url" - the next URL to extract.
252 The key "ie_key" can be set to the class name (minus the trailing "IE",
253 e.g. "Youtube") if the extractor class is known in advance.
254 Additionally, the dictionary may have any properties of the resolved entity
255 known in advance, for example "title" if the title of the referred video is
259 _type "url_transparent" entities have the same specification as "url", but
260 indicate that the given additional information is more precise than the one
261 associated with the resolved URL.
262 This is useful when a site employs a video service that hosts the video and
263 its technical metadata, but that video service does not embed a useful
264 title, description etc.
267 Subclasses of this one should re-define the _real_initialize() and
268 _real_extract() methods and define a _VALID_URL regexp.
269 Probably, they should also be added to the list of extractors.
271 Finally, the _WORKING attribute should be set to False for broken IEs
272 in order to warn the users and skip the tests.
# The downloader (a YoutubeDL instance) may also be attached later via
# set_downloader(); None is an accepted placeholder.
279 def __init__(self, downloader=None):
280 """Constructor. Receives an optional downloader."""
282 self.set_downloader(downloader)
# NOTE(review): the @classmethod decorator is not visible in this chunk --
# presumably present, since the first parameter is `cls`; confirm in full file.
285 def suitable(cls, url):
286 """Receives a URL and returns True if suitable for this IE."""
288 # This does not use has/getattr intentionally - we want to know whether
289 # we have cached the regexp for *this* class, whereas getattr would also
290 # match the superclass
291 if '_VALID_URL_RE' not in cls.__dict__:
292 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
293 return cls._VALID_URL_RE.match(url) is not None
# Match *url* against _VALID_URL, caching the compiled regex on this exact
# class (same per-class caching idiom as suitable()).
296 def _match_id(cls, url):
297 if '_VALID_URL_RE' not in cls.__dict__:
298 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
299 m = cls._VALID_URL_RE.match(url)
# NOTE(review): the return statement (presumably extracting the id group
# from `m`) is elided from this chunk.
305 """Getter method for _WORKING."""
308 def initialize(self):
309 """Initializes an instance (authentication, etc)."""
# Delegates to _real_initialize(), which subclasses override.
311 self._real_initialize()
314 def extract(self, url):
315 """Extracts URL information and returns it in list of dicts."""
# Wraps _real_extract(), converting low-level failures into ExtractorError.
318 return self._real_extract(url)
# ExtractorError is re-raised as-is (handler body elided in this chunk).
319 except ExtractorError:
320 raise
321 except compat_http_client.IncompleteRead as e:
# Truncated reads are an "expected" network failure: no bug-report message.
322 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
323 except (KeyError, StopIteration) as e:
324 raise ExtractorError('An extractor error has occurred.', cause=e)
def set_downloader(self, downloader):
    """Attach *downloader* (the YoutubeDL instance) to this IE."""
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    pass
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    pass
# Fragments of ie_key() (classmethod) and the IE_NAME property; the def
# lines and decorators are elided from this chunk.
340 """A string for getting the InfoExtractor with get_info_extractor"""
# Class name minus the trailing "IE", e.g. YoutubeIE -> "Youtube".
341 return compat_str(cls.__name__[:-2])
# Instance-level counterpart: same "strip trailing IE" convention.
345 return compat_str(type(self).__name__[:-2])
347 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
348 """ Returns the response handle """
# note=None -> default "Downloading webpage" message; note=False is silent.
350 self.report_download_webpage(video_id)
351 elif note is not False:
# With no video_id only the note is printed, otherwise "id: note".
353 self.to_screen('%s' % (note,))
355 self.to_screen('%s: %s' % (video_id, note))
357 return self._downloader.urlopen(url_or_request)
358 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
362 errnote = 'Unable to download webpage'
364 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
# Fatal path: re-raise with the original traceback attached...
366 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
# ...non-fatal path: warn only (falls through, returning None).
368 self._downloader.report_warning(errmsg)
371 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
372 """ Returns a tuple (page content as string, URL handle) """
373 # Strip hashes from the URL (#1038)
374 if isinstance(url_or_request, (compat_str, str)):
375 url_or_request = url_or_request.partition('#')[0]
377 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
# NOTE(review): handling of a failed (falsy) urlh is elided in this chunk.
381 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
382 return (content, urlh)
# Best-effort charset detection: Content-Type header first, then a <meta
# charset> declaration in the first KiB of the body, then a BOM sniff.
385 def _guess_encoding_from_content(content_type, webpage_bytes):
386 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
388 encoding = m.group(1)
390 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
391 webpage_bytes[:1024])
393 encoding = m.group(1).decode('ascii')
# b'\xff\xfe' is the UTF-16 little-endian byte order mark.
394 elif webpage_bytes.startswith(b'\xff\xfe'):
# Read the response body, decode it (guessing the charset if *encoding* is
# None), optionally dump/save it for debugging, and detect known censorship
# or filtering block pages, which are reported as "expected" errors.
401 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
402 content_type = urlh.headers.get('Content-Type', '')
403 webpage_bytes = urlh.read()
404 if prefix is not None:
405 webpage_bytes = prefix + webpage_bytes
407 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
# --dump-pages: emit the page base64-encoded to the screen.
408 if self._downloader.params.get('dump_intermediate_pages', False):
410 url = url_or_request.get_full_url()
411 except AttributeError:
413 self.to_screen('Dumping request to ' + url)
414 dump = base64.b64encode(webpage_bytes).decode('ascii')
415 self._downloader.to_screen(dump)
# --write-pages: save the raw bytes to a sanitized dump file.
416 if self._downloader.params.get('write_pages', False):
418 url = url_or_request.get_full_url()
419 except AttributeError:
421 basen = '%s_%s' % (video_id, url)
# Keep the filename under ~240 chars, appending an md5 of the full name.
423 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
424 basen = basen[:240 - len(h)] + h
425 raw_filename = basen + '.dump'
426 filename = sanitize_filename(raw_filename, restricted=True)
427 self.to_screen('Saving request to ' + filename)
428 # Working around MAX_PATH limitation on Windows (see
429 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
431 absfilepath = os.path.abspath(filename)
432 if len(absfilepath) > 259:
433 filename = '\\\\?\\' + absfilepath
434 with open(filename, 'wb') as outf:
435 outf.write(webpage_bytes)
438 content = webpage_bytes.decode(encoding, 'replace')
# Fallback decode path (branch structure partly elided in this chunk).
440 content = webpage_bytes.decode('utf-8', 'replace')
442 if ('<title>Access to this site is blocked</title>' in content and
443 'Websense' in content[:512]):
444 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
445 blocked_iframe = self._html_search_regex(
446 r'<iframe src="([^"]+)"', content,
447 'Websense information URL', default=None)
449 msg += ' Visit %s for more details' % blocked_iframe
450 raise ExtractorError(msg, expected=True)
451 if '<title>The URL you requested has been blocked</title>' in content[:512]:
453 'Access to this webpage has been blocked by Indian censorship. '
454 'Use a VPN or proxy server (with --proxy) to route around it.')
455 block_msg = self._html_search_regex(
456 r'</h1><p>(.*?)</p>',
457 content, 'block message', default=None)
459 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
460 raise ExtractorError(msg, expected=True)
464 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
465 """ Returns the data of the page as a string """
# Retry loop: IncompleteRead is retried up to *tries* attempts, sleeping
# *timeout* seconds between them.
468 while success is False:
470 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
472 except compat_http_client.IncompleteRead as e:
474 if try_count >= tries:
476 self._sleep(timeout, video_id)
483 def _download_xml(self, url_or_request, video_id,
484 note='Downloading XML', errnote='Unable to download XML',
485 transform_source=None, fatal=True, encoding=None):
486 """Return the xml as an xml.etree.ElementTree.Element"""
487 xml_string = self._download_webpage(
488 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
# False signals a failed non-fatal download.
489 if xml_string is False:
# transform_source, if given, fixes up the raw text before parsing.
492 xml_string = transform_source(xml_string)
493 return compat_etree_fromstring(xml_string.encode('utf-8'))
# Download a page and parse it as JSON via _parse_json().
495 def _download_json(self, url_or_request, video_id,
496 note='Downloading JSON metadata',
497 errnote='Unable to download JSON metadata',
498 transform_source=None,
499 fatal=True, encoding=None):
500 json_string = self._download_webpage(
501 url_or_request, video_id, note, errnote, fatal=fatal,
# False signals a failed non-fatal download; propagate instead of parsing.
503 if (not fatal) and json_string is False:
505 return self._parse_json(
506 json_string, video_id, transform_source=transform_source, fatal=fatal)
# Parse *json_string*, optionally pre-processed by *transform_source*.
# On invalid JSON: raise ExtractorError if fatal, otherwise warn only.
508 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
510 json_string = transform_source(json_string)
512 return json.loads(json_string)
513 except ValueError as ve:
514 errmsg = '%s: Failed to parse JSON ' % video_id
516 raise ExtractorError(errmsg, cause=ve)
518 self.report_warning(errmsg + str(ve))
# Forward a warning to the downloader, prefixed with "[IE_NAME] " and,
# when given, the video id.
520 def report_warning(self, msg, video_id=None):
521 idstr = '' if video_id is None else '%s: ' % video_id
522 self._downloader.report_warning(
523 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
525 def to_screen(self, msg):
526 """Print msg to screen, prefixing it with '[ie_name]'"""
527 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
    """Announce that information extraction has started for *id_or_name*."""
    self.to_screen('%s: Extracting information' % id_or_name)
533 def report_download_webpage(self, video_id):
534 """Report webpage download."""
535 self.to_screen('%s: Downloading webpage' % video_id)
537 def report_age_confirmation(self):
538 """Report attempt to confirm age."""
539 self.to_screen('Confirming age')
541 def report_login(self):
542 """Report attempt to log in."""
543 self.to_screen('Logging in')
# Abort with a login-required error (decorator, presumably @staticmethod,
# is elided from this chunk). The trailing expected=True argument is also
# elided; the error carries no bug-report message.
546 def raise_login_required(msg='This video is only available for registered users'):
547 raise ExtractorError(
548 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
# Abort with a geo-restriction error, suggesting --proxy as a workaround
# (decorator and the trailing argument are elided from this chunk).
552 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
553 raise ExtractorError(
554 '%s. You might want to use --proxy to workaround.' % msg,
557 # Methods for following #608
# Build a '_type': 'url' result dict for deferred extraction (decorator,
# presumably @staticmethod, is elided from this chunk).
559 def url_result(url, ie=None, video_id=None, video_title=None):
560 """Returns a URL that points to a page that should be processed"""
561 # TODO: ie should be the class used for getting the info
562 video_info = {'_type': 'url',
# id/title are only set when known, so downstream merging stays clean.
565 if video_id is not None:
566 video_info['id'] = video_id
567 if video_title is not None:
568 video_info['title'] = video_title
# Build a '_type': 'playlist' result dict (decorator, presumably
# @staticmethod, is elided from this chunk).
572 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
573 """Returns a playlist"""
574 video_info = {'_type': 'playlist',
# Optional metadata is attached only when truthy/given.
577 video_info['id'] = playlist_id
579 video_info['title'] = playlist_title
580 if playlist_description:
581 video_info['description'] = playlist_description
584 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
586 Perform a regex search on the given string, using a single or a list of
587 patterns returning the first matching group.
588 In case of failure return a default value or raise a WARNING or a
589 RegexNotFoundError, depending on fatal, specifying the field name.
# Single pattern vs. list of patterns: lists are tried in order (loop
# structure partly elided in this chunk).
591 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
592 mobj = re.search(pattern, string, flags)
595 mobj = re.search(p, string, flags)
# Colorize the field name in error output, except on Windows / non-ttys.
599 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
600 _name = '\033[0;34m%s\033[0m' % name
606 # return the first matching group
607 return next(g for g in mobj.groups() if g is not None)
# When *group* is given, return that specific group instead.
609 return mobj.group(group)
610 elif default is not NO_DEFAULT:
613 raise RegexNotFoundError('Unable to extract %s' % _name)
615 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
618 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
620 Like _search_regex, but strips HTML tags and unescapes entities.
622 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
# Only reached when res is not None (guard elided in this chunk).
624 return clean_html(res).strip()
628 def _get_login_info(self):
630 Get the login info as (username, password)
631 It will look in the netrc file using the _NETRC_MACHINE value
632 If there's no info available, return (None, None)
633
634 if self._downloader is None:
639 downloader_params = self._downloader.params
640
641 # Attempt to use provided username and password or .netrc data
642 if downloader_params.get('username') is not None:
643 username = downloader_params['username']
644 password = downloader_params['password']
645 elif downloader_params.get('usenetrc', False):
# netrc lookup keyed on the per-extractor _NETRC_MACHINE constant.
647 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
652 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# netrc problems are non-fatal: warn and fall back to (None, None).
653 except (IOError, netrc.NetrcParseError) as err:
654 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
656 return (username, password)
658 def _get_tfa_info(self, note='two-factor verification code'):
660 Get the two-factor authentication info
661 TODO - asking the user will be required for sms/phone verify
662 currently just uses the command line option
663 If there's no info available, return None
664
665 if self._downloader is None:
667 downloader_params = self._downloader.params
668
# --twofactor takes precedence; otherwise prompt interactively.
669 if downloader_params.get('twofactor') is not None:
670 return downloader_params['twofactor']
672 return compat_getpass('Type %s and press [Return]: ' % note)
674 # Helper functions for extracting OpenGraph info
# Build the two regexes matching an OpenGraph <meta property="og:PROP">
# tag, in either attribute order (decorator, presumably @staticmethod, and
# the `return [` line are elided from this chunk).
676 def _og_regexes(prop):
677 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
678 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
679 % {'prop': re.escape(prop)})
680 template = r'<meta[^>]+?%s[^>]+?%s'
682 template % (property_re, content_re),
683 template % (content_re, property_re),
687 def _meta_regex(prop):
688 return r'''(?isx)<meta
689 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
690 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
# Search *html* for the og:PROP meta value and return it HTML-unescaped.
692 def _og_search_property(self, prop, html, name=None, **kargs):
# Default human-readable field name used in error messages.
694 name = 'OpenGraph %s' % prop
695 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
698 return unescapeHTML(escaped)
# Convenience wrapper: og:image, non-fatal.
700 def _og_search_thumbnail(self, html, **kargs):
701 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
# Convenience wrapper: og:description, non-fatal.
703 def _og_search_description(self, html, **kargs):
704 return self._og_search_property('description', html, fatal=False, **kargs)
# Convenience wrapper: og:title (fatal by default).
706 def _og_search_title(self, html, **kargs):
707 return self._og_search_property('title', html, **kargs)
709 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
710 regexes = self._og_regexes('video') + self._og_regexes('video:url')
# When secure, og:video:secure_url patterns are tried first.
712 regexes = self._og_regexes('video:secure_url') + regexes
713 return self._html_search_regex(regexes, html, name, **kargs)
# Convenience wrapper: og:url.
715 def _og_search_url(self, html, **kargs):
716 return self._og_search_property('url', html, **kargs)
# Extract the content attribute of a generic <meta> tag named *name*.
718 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
# display_name defaults to the meta name (assignment elided in this chunk).
719 if display_name is None:
721 return self._html_search_regex(
722 self._meta_regex(name),
723 html, display_name, fatal=fatal, group='content', **kwargs)
# Dublin Core creator meta tag -> uploader.
725 def _dc_search_uploader(self, html):
726 return self._html_search_meta('dc.creator', html, 'uploader')
728 def _rta_search(self, html):
729 # See http://www.rtalabel.org/index.php?content=howtofaq#single
# RTA label present -> adult content (the returned age-limit values are
# elided from this chunk).
730 if re.search(r'(?ix)<meta\s+name="rating"\s+'
731 r' content="RTA-5042-1996-1400-1577-RTA"',
# Map a "rating" meta tag value to an age limit via RATING_TABLE
# (table definition elided from this chunk).
736 def _media_rating_search(self, html):
737 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
738 rating = self._html_search_meta('rating', html)
750 return RATING_TABLE.get(rating.lower())
# Map schema.org isFamilyFriendly to an age limit via RATING_TABLE
# (table definition elided from this chunk).
752 def _family_friendly_search(self, html):
753 # See http://schema.org/VideoObject
754 family_friendly = self._html_search_meta('isFamilyFriendly', html)
755
756 if not family_friendly:
765 return RATING_TABLE.get(family_friendly.lower())
# Twitter card player meta tag -> embedded player URL.
767 def _twitter_search_player(self, html):
768 return self._html_search_meta('twitter:player', html,
769 'twitter card player')
# Locate an application/ld+json <script> block and parse it via _json_ld().
771 def _search_json_ld(self, html, video_id, **kwargs):
772 json_ld = self._search_regex(
773 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
774 html, 'JSON-LD', group='json_ld', **kwargs)
777 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
# Convert a JSON-LD object (or its string form) into youtube-dl info-dict
# fields; currently handles schema.org TVEpisode and Article types.
779 def _json_ld(self, json_ld, video_id, fatal=True):
780 if isinstance(json_ld, compat_str):
781 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
# Only schema.org-context objects are interpreted.
785 if json_ld.get('@context') == 'http://schema.org':
786 item_type = json_ld.get('@type')
787 if item_type == 'TVEpisode':
789 'episode': unescapeHTML(json_ld.get('name')),
790 'episode_number': int_or_none(json_ld.get('episodeNumber')),
791 'description': unescapeHTML(json_ld.get('description')),
793 part_of_season = json_ld.get('partOfSeason')
794 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
795 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
796 part_of_series = json_ld.get('partOfSeries')
797 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
798 info['series'] = unescapeHTML(part_of_series.get('name'))
799 elif item_type == 'Article':
801 'timestamp': parse_iso8601(json_ld.get('datePublished')),
802 'title': unescapeHTML(json_ld.get('headline')),
803 'description': unescapeHTML(json_ld.get('articleBody')),
# Drop fields that came out as None.
805 return dict((k, v) for k, v in info.items() if v is not None)
# Collect {name: value} for hidden/submit <input> elements in *html*
# (decorator, presumably @staticmethod, and the dict initialization /
# return are elided from this chunk).
808 def _hidden_inputs(html):
# Strip HTML comments first so commented-out inputs are ignored.
809 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
811 for input in re.findall(r'(?i)<input([^>]+)>', html):
812 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
814 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
817 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
820 hidden_inputs[name.group('value')] = value.group('value')
# Extract the <form id=FORM_ID> body and return its hidden inputs.
823 def _form_hidden_inputs(self, form_id, html):
824 form = self._search_regex(
825 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
826 html, '%s form' % form_id, group='form')
827 return self._hidden_inputs(form)
# Sort *formats* in place from worst to best quality. The sort key either
# follows the caller-supplied *field_preference* tuple or a built-in
# multi-criteria key (preference, language, quality, bitrate, size, ...).
829 def _sort_formats(self, formats, field_preference=None):
831 raise ExtractorError('No video formats found')
834 # Automatically determine tbr when missing based on abr and vbr (improves
835 # formats sorting in some cases)
836 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
837 f['tbr'] = f['abr'] + f['vbr']
839
840 # TODO remove the following workaround
841 from ..utils import determine_ext
842 if not f.get('ext') and 'url' in f:
843 f['ext'] = determine_ext(f['url'])
844
# Nested key function used below by formats.sort() (its def line is
# elided from this chunk).
845 if isinstance(field_preference, (list, tuple)):
846 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
847
848 preference = f.get('preference')
849 if preference is None:
851 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
853
# Plain HTTP(S) is slightly preferred over other protocols.
854 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
855
856 if f.get('vcodec') == 'none': # audio only
857 if self._downloader.params.get('prefer_free_formats'):
858 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
860 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
863 audio_ext_preference = ORDER.index(f['ext'])
865 audio_ext_preference = -1
# Video (or combined) formats use a container-extension ordering instead.
867 if self._downloader.params.get('prefer_free_formats'):
868 ORDER = ['flv', 'mp4', 'webm']
870 ORDER = ['webm', 'flv', 'mp4']
872 ext_preference = ORDER.index(f['ext'])
875 audio_ext_preference = 0
877
# Missing numeric fields sort as -1 (i.e. worse than any known value).
879 f.get('language_preference') if f.get('language_preference') is not None else -1,
880 f.get('quality') if f.get('quality') is not None else -1,
881 f.get('tbr') if f.get('tbr') is not None else -1,
882 f.get('filesize') if f.get('filesize') is not None else -1,
883 f.get('vbr') if f.get('vbr') is not None else -1,
884 f.get('height') if f.get('height') is not None else -1,
885 f.get('width') if f.get('width') is not None else -1,
888 f.get('abr') if f.get('abr') is not None else -1,
889 audio_ext_preference,
890 f.get('fps') if f.get('fps') is not None else -1,
891 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
892 f.get('source_preference') if f.get('source_preference') is not None else -1,
893 f.get('format_id') if f.get('format_id') is not None else '',
895 formats.sort(key=_formats_key)
# Filter *formats* down to those whose URL responds (via _is_valid_url);
# surrounding filter call is elided from this chunk.
897 def _check_formats(self, formats, video_id):
900 lambda f: self._is_valid_url(
902 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
# De-duplicate *formats* by URL, preserving order, mutating the list in
# place (decorator, presumably @staticmethod, and the format_urls /
# unique_formats initialization are elided from this chunk).
906 def _remove_duplicate_formats(formats):
910 if f['url'] not in format_urls:
911 format_urls.add(f['url'])
912 unique_formats.append(f)
913 formats[:] = unique_formats
# Probe *url* with a request; URL-level errors mean "invalid, skip".
915 def _is_valid_url(self, url, video_id, item='video'):
916 url = self._proto_relative_url(url, scheme='http:')
917 # For now assume non HTTP(S) URLs always valid
918 if not (url.startswith('http://') or url.startswith('https://')):
921 self._request_webpage(url, video_id, 'Checking %s URL' % item)
923 except ExtractorError as e:
924 if isinstance(e.cause, compat_urllib_error.URLError):
926 '%s: %s URL is invalid, skipping' % (video_id, item))
930 def http_scheme(self):
931 """ Either "http:" or "https:", depending on the user's preferences """
# Honors the --prefer-insecure option (full conditional elided here).
934 if self._downloader.params.get('prefer_insecure', False)
# Prefix a protocol-relative URL ("//host/...") with *scheme*, defaulting
# to the user-preferred scheme from http_scheme().
937 def _proto_relative_url(self, url, scheme=None):
940 if url.startswith('//'):
942 scheme = self.http_scheme()
# Announce and perform a wait of *timeout* seconds (the actual
# to_screen/time.sleep calls are elided from this chunk).
947 def _sleep(self, timeout, video_id, msg_template=None):
948 if msg_template is None:
949 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
950 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
# Download an Adobe HDS (f4m) manifest and build a sorted formats list
# from its <media> nodes; handles both 1.0 and 2.0 manifest namespaces.
954 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
# Default transform repairs bare ampersands, common in broken manifests.
955 transform_source=lambda s: fix_xml_ampersands(s).strip(),
957 manifest = self._download_xml(
958 manifest_url, video_id, 'Downloading f4m manifest',
959 'Unable to download f4m manifest',
960 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
961 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
962 transform_source=transform_source,
# False signals a failed non-fatal download.
965 if manifest is False:
969 manifest_version = '1.0'
970 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
# Empty result -> retry with the 2.0 namespace.
972 manifest_version = '2.0'
973 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
974 base_url = xpath_text(
975 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
976 'base URL', default=None)
978 base_url = base_url.strip()
979 for i, media_el in enumerate(media_nodes):
980 if manifest_version == '2.0':
981 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
# NOTE(review): the assignment rebinding manifest_url per media element
# is elided from this chunk; the recursion below relies on it.
985 media_url if media_url.startswith('http://') or media_url.startswith('https://')
986 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
987 # If media_url is itself a f4m manifest do the recursive extraction
988 # since bitrates in parent manifest (this one) and media_url manifest
989 # may differ leading to inability to resolve the format by requested
990 # bitrate in f4m downloader
991 if determine_ext(manifest_url) == 'f4m':
992 formats.extend(self._extract_f4m_formats(
993 manifest_url, video_id, preference, f4m_id, fatal=fatal))
995 tbr = int_or_none(media_el.attrib.get('bitrate'))
# format_id falls back to the node index when no bitrate is given.
997 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1001 'width': int_or_none(media_el.attrib.get('width')),
1002 'height': int_or_none(media_el.attrib.get('height')),
1003 'preference': preference,
1005 self._sort_formats(formats)
# Download an HLS (m3u8) playlist and build the formats list.  Master
# playlists are expanded per #EXT-X-STREAM-INF entry; media playlists are
# returned as a single format.  NOTE(review): this excerpt is elided — the
# fatal= parameter, several assignment targets, `continue` statements and
# the final `return formats` are missing from this view.
1009 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1010 entry_protocol='m3u8', preference=None,
1011 m3u8_id=None, note=None, errnote=None,
# A synthetic 'meta' entry pointing at the playlist URL itself, ranked just
# below the requested preference.
1015 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1019 'preference': preference - 1 if preference else -1,
1020 'resolution': 'multiple',
1021 'format_note': 'Quality selection URL',
# Resolve playlist-relative entry URLs against the playlist URL.
1024 format_url = lambda u: (
1026 if re.match(r'^https?://', u)
1027 else compat_urlparse.urljoin(m3u8_url, u))
1029 res = self._download_webpage_handle(
1031 note=note or 'Downloading m3u8 information',
1032 errnote=errnote or 'Failed to download m3u8 information',
1036 m3u8_doc, urlh = res
# Use the post-redirect URL as the base for relative references.
1037 m3u8_url = urlh.geturl()
1039 # We should try extracting formats only from master playlists [1], i.e.
1040 # playlists that describe available qualities. On the other hand media
1041 # playlists [2] should be returned as is since they contain just the media
1042 # without qualities renditions.
1043 # Fortunately, master playlist can be easily distinguished from media
1044 # playlist based on particular tags availability. As of [1, 2] master
1045 # playlist tags MUST NOT appear in a media playist and vice versa.
1046 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1047 # and MUST NOT appear in master playlist thus we can clearly detect media
1048 # playlist with this criterion.
1049 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1050 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1051 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1052 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1055 'format_id': m3u8_id,
1057 'protocol': entry_protocol,
1058 'preference': preference,
# Key=value attribute parser for #EXT-X-* tag lines (values may be quoted).
1062 kv_rex = re.compile(
1063 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1064 for line in m3u8_doc.splitlines():
1065 if line.startswith('#EXT-X-STREAM-INF:'):
1067 for m in kv_rex.finditer(line):
1069 if v.startswith('"'):
1071 last_info[m.group('key')] = v
1072 elif line.startswith('#EXT-X-MEDIA:'):
1074 for m in kv_rex.finditer(line):
1076 if v.startswith('"'):
1078 last_media[m.group('key')] = v
1079 elif line.startswith('#') or not line.strip():
# A non-tag, non-blank line is a variant URL belonging to the preceding
# #EXT-X-STREAM-INF (if any).
1082 if last_info is None:
1083 formats.append({'url': format_url(line)})
1085 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1088 format_id.append(m3u8_id)
1089 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1090 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1092 'format_id': '-'.join(format_id),
1093 'url': format_url(line.strip()),
1096 'protocol': entry_protocol,
1097 'preference': preference,
1099 resolution = last_info.get('RESOLUTION')
1101 width_str, height_str = resolution.split('x')
1102 f['width'] = int(width_str)
1103 f['height'] = int(height_str)
1104 codecs = last_info.get('CODECS')
1106 vcodec, acodec = [None] * 2
1107 va_codecs = codecs.split(',')
1108 if len(va_codecs) == 1:
1109 # Audio only entries usually come with single codec and
1110 # no resolution. For more robustness we also check it to
1112 if not resolution and va_codecs[0].startswith('mp4a'):
1113 vcodec, acodec = 'none', va_codecs[0]
1115 vcodec = va_codecs[0]
1117 vcodec, acodec = va_codecs[:2]
1122 if last_media is not None:
1123 f['m3u8_media'] = last_media
1127 self._sort_formats(formats)
# Qualify each component of an XPath with the given XML namespace, so that
# './head/meta' becomes './{ns}head/{ns}meta'.
# NOTE(review): the decorator line (presumably @staticmethod, given no self)
# and the no-namespace branch are elided from this view.
1131 def _xpath_ns(path, namespace=None):
1135 for c in path.split('/'):
# Keep '.' and empty components unqualified.
1136 if not c or c == '.':
1139 out.append('{%s}%s' % (namespace, c))
1140 return '/'.join(out)
# Download a SMIL document and return only its formats list.
# NOTE(review): the failed-download fallback branch is elided here.
1142 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1143 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1149 namespace = self._parse_smil_namespace(smil)
1151 return self._parse_smil_formats(
1152 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
# Download a SMIL document and return the fully parsed info dict.
# NOTE(review): the failed-download fallback branch is elided here.
1154 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1155 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1158 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1160 def _download_smil(self, smil_url, video_id, fatal=True):
1161 return self._download_xml(
1162 smil_url, video_id, 'Downloading SMIL file',
1163 'Unable to download SMIL file', fatal=fatal)
# Turn a parsed SMIL document into an info dict: formats, subtitles,
# title/description/upload_date from <head><meta> entries, and thumbnails
# from <image> nodes.  NOTE(review): elided here — the formats sanity check,
# local variable initialisation, the continue in the meta loop and parts of
# the returned dict (including its opening/closing braces).
1165 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1166 namespace = self._parse_smil_namespace(smil)
1168 formats = self._parse_smil_formats(
1169 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1170 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
# Fall back to the SMIL file's basename (sans extension) as the video id.
1172 video_id = os.path.splitext(url_basename(smil_url))[0]
1176 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1177 name = meta.attrib.get('name')
1178 content = meta.attrib.get('content')
1179 if not name or not content:
# First matching <meta> wins for each of title / description / date.
1181 if not title and name == 'title':
1183 elif not description and name in ('description', 'abstract'):
1184 description = content
1185 elif not upload_date and name == 'date':
1186 upload_date = unified_strdate(content)
1189 'id': image.get('type'),
1190 'url': image.get('src'),
1191 'width': int_or_none(image.get('width')),
1192 'height': int_or_none(image.get('height')),
1193 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1197 'title': title or video_id,
1198 'description': description,
1199 'upload_date': upload_date,
1200 'thumbnails': thumbnails,
1202 'subtitles': subtitles,
1205 def _parse_smil_namespace(self, smil):
1206 return self._search_regex(
1207 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
# Build the formats list from a SMIL document: RTMP, HLS (m3u8), HDS (f4m)
# and plain HTTP sources found in <video> nodes.  NOTE(review): heavily
# elided — initialisation of base/srcs/rtmp_count/m3u8_count/http_count,
# several branch bodies, `continue` statements and the final return are
# missing from this view.
1209 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
# <head><meta base=...> (or httpBase) supplies the base URL for sources.
1211 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1212 b = meta.get('base') or meta.get('httpBase')
1223 videos = smil.findall(self._xpath_ns('.//video', namespace))
1224 for video in videos:
1225 src = video.get('src')
# Skip empty and already-seen sources.
1226 if not src or src in srcs:
1230 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1231 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1232 width = int_or_none(video.get('width'))
1233 height = int_or_none(video.get('height'))
1234 proto = video.get('proto')
1235 ext = video.get('ext')
1236 src_ext = determine_ext(src)
1237 streamer = video.get('streamer') or base
1239 if proto == 'rtmp' or streamer.startswith('rtmp'):
1245 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1247 'filesize': filesize,
# Optional caller hook to rewrite the (streamer, src) pair for RTMP.
1251 if transform_rtmp_url:
1252 streamer, src = transform_rtmp_url(streamer, src)
1253 formats[-1].update({
1259 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1260 src_url = src_url.strip()
1262 if proto == 'm3u8' or src_ext == 'm3u8':
1263 m3u8_formats = self._extract_m3u8_formats(
1264 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
# A single-rendition m3u8 inherits the bitrate-based format id.
1265 if len(m3u8_formats) == 1:
1267 m3u8_formats[0].update({
1268 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1273 formats.extend(m3u8_formats)
1276 if src_ext == 'f4m':
1281 'plugin': 'flowplayer-3.2.0.1',
# Append caller-supplied f4m query parameters to the manifest URL.
1283 f4m_url += '&' if '?' in f4m_url else '?'
1284 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1285 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1288 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1292 'ext': ext or src_ext or 'flv',
1293 'format_id': 'http-%d' % (bitrate or http_count),
1295 'filesize': filesize,
1301 self._sort_formats(formats)
# Collect subtitle tracks from SMIL <textstream> nodes, grouped by language.
# NOTE(review): initialisation of subtitles/urls, the url bookkeeping, the
# appended dict body and the final return are elided from this view.
1305 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1308 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1309 src = textstream.get('src')
1310 if not src or src in urls:
1313 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
# Fall back to subtitles_lang when the node carries no language attribute.
1314 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1315 subtitles.setdefault(lang, []).append({
# Download an XSPF playlist and parse it into entries.
# NOTE(review): the failed-download fallback branch is elided.  Also note
# the 'xpsf' typo in the note string below — runtime text, deliberately left
# untouched in this documentation-only pass.
1321 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1322 xspf = self._download_xml(
1323 playlist_url, playlist_id, 'Downloading xpsf playlist',
1324 'Unable to download xspf manifest', fatal=fatal)
1327 return self._parse_xspf(xspf, playlist_id)
# Convert an XSPF <trackList> into a list of entry dicts with formats.
# NOTE(review): elided here — the NS_MAP dict assignment target, entries
# initialisation, parts of the per-track dict and the final return.
1329 def _parse_xspf(self, playlist, playlist_id):
1331 'xspf': 'http://xspf.org/ns/0/',
# 's1' is the StreamOne player extension namespace used for format attrs.
1332 's1': 'http://static.streamone.nl/player/ns/0',
1336 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1338 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1339 description = xpath_text(
1340 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1341 thumbnail = xpath_text(
1342 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1343 duration = float_or_none(
# XSPF duration is in milliseconds; the scale of 1000 converts to seconds.
1344 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1347 'url': location.text,
1348 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1349 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1350 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1351 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1352 self._sort_formats(formats)
1357 'description': description,
1358 'thumbnail': thumbnail,
1359 'duration': duration,
# Download a DASH MPD manifest and delegate parsing to _parse_mpd_formats.
# NOTE(review): the res-is-False guard and the (mpd, urlh) unpacking are
# elided from this view.  The mutable default `formats_dict={}` is shared
# across calls — it appears to be only read, but confirm upstream.
1364 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1365 res = self._download_webpage_handle(
1367 note=note or 'Downloading MPD manifest',
1368 errnote=errnote or 'Failed to download MPD manifest',
# Base URL is everything up to the last '/' of the final (post-redirect) URL.
1373 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1375 return self._parse_mpd_formats(
1376 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
# Parse a DASH MPD document into a formats list: walks Period ->
# AdaptationSet -> Representation, skipping DRM-protected sets, resolving
# BaseURLs and expanding SegmentTemplate/SegmentList into segment_urls.
# NOTE(review): heavily elided — live-stream handling, `continue`
# statements, the format dict braces and `return formats` are missing here.
1378 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1379 if mpd_doc.get('type') == 'dynamic':
1382 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1385 return self._xpath_ns(path, namespace)
1387 def is_drm_protected(element):
1388 return element.find(_add_ns('ContentProtection')) is not None
# Merge multisegment info down the hierarchy: child values override parent.
1390 def extract_multisegment_info(element, ms_parent_info):
1391 ms_info = ms_parent_info.copy()
1392 segment_list = element.find(_add_ns('SegmentList'))
1393 if segment_list is not None:
1394 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1396 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1397 initialization = segment_list.find(_add_ns('Initialization'))
1398 if initialization is not None:
1399 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1401 segment_template = element.find(_add_ns('SegmentTemplate'))
1402 if segment_template is not None:
1403 start_number = segment_template.get('startNumber')
1405 ms_info['start_number'] = int(start_number)
1406 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1407 if segment_timeline is not None:
1408 s_e = segment_timeline.findall(_add_ns('S'))
1410 ms_info['total_number'] = 0
# Each <S> contributes 1 segment plus its repeat count r.
1412 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1414 timescale = segment_template.get('timescale')
1416 ms_info['timescale'] = int(timescale)
1417 segment_duration = segment_template.get('duration')
1418 if segment_duration:
1419 ms_info['segment_duration'] = int(segment_duration)
1420 media_template = segment_template.get('media')
1422 ms_info['media_template'] = media_template
1423 initialization = segment_template.get('initialization')
1425 ms_info['initialization_url'] = initialization
1427 initialization = segment_template.find(_add_ns('Initialization'))
1428 if initialization is not None:
1429 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1432 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1434 for period in mpd_doc.findall(_add_ns('Period')):
1435 period_duration = parse_duration(period.get('duration')) or mpd_duration
1436 period_ms_info = extract_multisegment_info(period, {
1440 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1441 if is_drm_protected(adaptation_set):
1443 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1444 for representation in adaptation_set.findall(_add_ns('Representation')):
1445 if is_drm_protected(representation):
# Representation attributes override the AdaptationSet's.
1447 representation_attrib = adaptation_set.attrib.copy()
1448 representation_attrib.update(representation.attrib)
1449 mime_type = representation_attrib.get('mimeType')
1450 content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1451 if content_type == 'text':
1452 # TODO implement WebVTT downloading
1454 elif content_type == 'video' or content_type == 'audio':
# Accumulate BaseURLs from the innermost element outward.
1456 for element in (representation, adaptation_set, period, mpd_doc):
1457 base_url_e = element.find(_add_ns('BaseURL'))
1458 if base_url_e is not None:
1459 base_url = base_url_e.text + base_url
1460 if re.match(r'^https?://', base_url):
1462 if mpd_base_url and not re.match(r'^https?://', base_url):
1463 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1465 base_url = mpd_base_url + base_url
1466 representation_id = representation_attrib.get('id')
1467 lang = representation_attrib.get('lang')
1468 url_el = representation.find(_add_ns('BaseURL'))
1469 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1471 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1473 'width': int_or_none(representation_attrib.get('width')),
1474 'height': int_or_none(representation_attrib.get('height')),
1475 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1476 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1477 'fps': int_or_none(representation_attrib.get('frameRate')),
1478 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1479 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
# ISO 639 'mul'/'und'/'zxx'/'mis' mean no single meaningful language.
1480 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1481 'format_note': 'DASH %s' % content_type,
1482 'filesize': filesize,
1484 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1485 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
# NOTE(review): `and 'segment_duration'` is a truthy string literal — it
# almost certainly should be `and 'segment_duration' in
# representation_ms_info`; as written, a missing key raises KeyError below.
1486 if 'total_number' not in representation_ms_info and 'segment_duration':
1487 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1488 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1489 media_template = representation_ms_info['media_template']
1490 media_template = media_template.replace('$RepresentationID$', representation_id)
1491 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
# NOTE(review): the result of replace() is discarded — this line should
# reassign: media_template = media_template.replace('$$', '$').
1492 media_template.replace('$$', '$')
1493 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1494 if 'segment_urls' in representation_ms_info:
1496 'segment_urls': representation_ms_info['segment_urls'],
1497 'protocol': 'http_dash_segments',
1499 if 'initialization_url' in representation_ms_info:
1500 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1502 'initialization_url': initialization_url,
1504 if not f.get('url'):
1505 f['url'] = initialization_url
# Merge with an already-collected format of the same id, if any.
1507 existing_format = next(
1508 fo for fo in formats
1509 if fo['format_id'] == representation_id)
1510 except StopIteration:
1511 full_info = formats_dict.get(representation_id, {}).copy()
1513 formats.append(full_info)
1515 existing_format.update(f)
1517 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1518 self._sort_formats(formats)
1521 def _live_title(self, name):
1522 """ Generate the title for a live video """
1523 now = datetime.datetime.now()
1524 now_str = now.strftime('%Y-%m-%d %H:%M')
1525 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    """Parse `v` as an int via int_or_none(**kwargs).

    On parse failure, raise ExtractorError when fatal=True, otherwise emit
    a downloader warning and return None.
    """
    res = int_or_none(v, **kwargs)
    # Fix: removed a leftover debug statement that printed
    # getattr(v, kwargs['get_attr']) to stdout whenever 'get_attr' was
    # passed — int_or_none() already consumes get_attr itself.
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
# Parse `v` as a float via float_or_none(); on failure either raise
# ExtractorError (fatal=True) or emit a downloader warning.
# NOTE(review): the `if res is None:`, `if fatal:` / `else:` guards and the
# final `return res` are elided from this view.
1539 def _float(self, v, name, fatal=False, **kwargs):
1540 res = float_or_none(v, **kwargs)
1542 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1544 raise ExtractorError(msg)
1546 self._downloader.report_warning(msg)
def _set_cookie(self, domain, name, value, expire_time=None):
    """Add a cookie name=value for `domain` to the downloader's cookie jar."""
    # Positional Cookie() arguments: version, name, value, port,
    # port_specified, domain, domain_specified, domain_initial_dot, path,
    # path_specified, secure, expires, discard, comment, comment_url, rest.
    new_cookie = compat_cookiejar.Cookie(
        0, name, value, None, None, domain, None, None,
        '/', True, False, expire_time, '', None, None, None)
    self._downloader.cookiejar.set_cookie(new_cookie)
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    # Let the downloader's cookie jar attach its Cookie header to a dummy
    # request, then parse that header back into a SimpleCookie.
    request = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
# Yield this extractor's test cases (_TEST and/or _TESTS), tagging each with
# the extractor's name.  NOTE(review): elided here — the branch wrapping a
# single _TEST into a list, the loop header and the yield statement.
1561 def get_testcases(self, include_onlymatching=False):
1562 t = getattr(self, '_TEST', None)
# An extractor must not define both _TEST and _TESTS.
1564 assert not hasattr(self, '_TESTS'), \
1565 '%s has _TEST and _TESTS' % type(self).__name__
1568 tests = getattr(self, '_TESTS', [])
# only_matching test cases are skipped unless explicitly requested.
1570 if not include_onlymatching and t.get('only_matching', False):
1572 t['name'] = type(self).__name__[:-len('IE')]
1575 def is_suitable(self, age_limit):
1576 """ Test whether the extractor is generally suitable for the given
1577 age limit (i.e. pornographic sites are not, all others usually are) """
1579 any_restricted = False
1580 for tc in self.get_testcases(include_onlymatching=False):
1581 if 'playlist' in tc:
# A playlist test case carries its age limit on its first entry.
1582 tc = tc['playlist'][0]
1583 is_restricted = age_restricted(
1584 tc.get('info_dict', {}).get('age_limit'), age_limit)
1585 if not is_restricted:
# NOTE(review): the body of this branch (presumably `return False`) is
# elided from this view.
1587 any_restricted = any_restricted or is_restricted
1588 return not any_restricted
1590 def extract_subtitles(self, *args, **kwargs):
1591 if (self._downloader.params.get('writesubtitles', False) or
1592 self._downloader.params.get('listsubtitles')):
1593 return self._get_subtitles(*args, **kwargs)
1596 def _get_subtitles(self, *args, **kwargs):
1597 raise NotImplementedError('This method must be implemented by subclasses')
1600 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1601 """ Merge subtitle items for one language. Items with duplicated URLs
1602 will be dropped. """
1603 list1_urls = set([item['url'] for item in subtitle_list1])
1604 ret = list(subtitle_list1)
1605 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
@classmethod
def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
    """ Merge two subtitle dictionaries, language by language. """
    ret = dict(subtitle_dict1)
    # Per-language merge delegates duplicate-URL handling to
    # _merge_subtitle_items.
    for lang in subtitle_dict2:
        ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
    # Fix: the visible code built `ret` but never returned it (implicitly
    # returning None); restore the merged dict as the result.
    return ret
1616 def extract_automatic_captions(self, *args, **kwargs):
1617 if (self._downloader.params.get('writeautomaticsub', False) or
1618 self._downloader.params.get('listsubtitles')):
1619 return self._get_automatic_captions(*args, **kwargs)
1622 def _get_automatic_captions(self, *args, **kwargs):
1623 raise NotImplementedError('This method must be implemented by subclasses')
1625 def mark_watched(self, *args, **kwargs):
1626 if (self._downloader.params.get('mark_watched', False) and
1627 (self._get_login_info()[0] is not None or
1628 self._downloader.params.get('cookiefile') is not None)):
1629 self._mark_watched(*args, **kwargs)
1631 def _mark_watched(self, *args, **kwargs):
1632 raise NotImplementedError('This method must be implemented by subclasses')
# Base class for extractors implementing paged search queries.
# NOTE(review): the docstring's opening/closing triple quotes are elided from
# this excerpt; the lines below the class statement are the docstring body.
1635 class SearchInfoExtractor(InfoExtractor):
1637 Base class for paged search queries extractors.
1638 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1639 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Build the search-URL regex from _SEARCH_KEY: an empty prefix, a positive
# integer, or 'all', followed by ':' and the query.
# NOTE(review): the decorator line (presumably @classmethod, given cls) is
# elided above this def.
1643 def _make_valid_url(cls):
1644 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
# True when `url` matches this extractor's search-URL pattern.
# NOTE(review): the decorator line (presumably @classmethod, given cls) is
# elided above this def.
1647 def suitable(cls, url):
1648 return re.match(cls._make_valid_url(), url) is not None
# Dispatch a search query: empty prefix -> 1 result, 'all' -> _MAX_RESULTS,
# numeric prefix -> that many results, clamped to _MAX_RESULTS.
# NOTE(review): elided here — the mobj-is-None guard, the empty-prefix `if`
# header, and the `n = int(prefix)` / `if n <= 0:` lines before the raise.
1650 def _real_extract(self, query):
1651 mobj = re.match(self._make_valid_url(), query)
1653 raise ExtractorError('Invalid search query "%s"' % query)
1655 prefix = mobj.group('prefix')
1656 query = mobj.group('query')
1658 return self._get_n_results(query, 1)
1659 elif prefix == 'all':
1660 return self._get_n_results(query, self._MAX_RESULTS)
1664 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1665 elif n > self._MAX_RESULTS:
# Requests above the extractor's cap are clamped with a warning.
1666 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1667 n = self._MAX_RESULTS
1668 return self._get_n_results(query, n)
1670 def _get_n_results(self, query, n):
1671 """Get a specified number of results for a query"""
1672 raise NotImplementedError('This method must be implemented by subclasses')
# Read-only accessor exposing the class's search key.
# NOTE(review): the decorator line (presumably @property) is elided above
# this def.
1675 def SEARCH_KEY(self):
1676 return self._SEARCH_KEY