1 from __future__ import unicode_literals
15 from ..compat import (
24 compat_etree_fromstring,
52 class InfoExtractor(object):
53 """Information Extractor class.
55 Information extractors are the classes that, given a URL, extract
56 information about the video (or videos) the URL refers to. This
57 information includes the real video URL, the video title, author and
58 others. The information is stored in a dictionary which is then
59 passed to the YoutubeDL, which processes this
60 information, possibly downloading the video to the file system, among
61 other possible outcomes.
63 The type field determines the type of the result.
64 By far the most common value (and the default if _type is missing) is
65 "video", which indicates a single video.
67 For a video, the dictionaries must include the following fields:
70 title: Video title, unescaped.
72 Additionally, it must contain either a formats entry or a url one:
74 formats: A list of dictionaries for each format available, ordered
75 from worst to best quality.
78 * url Mandatory. The URL of the video file
79 * ext Will be calculated from URL if missing
80 * format A human-readable description of the format
81 ("mp4 container with h264/opus").
82 Calculated from the format_id, width, height,
83 and format_note fields if missing.
84 * format_id A short description of the format
85 ("mp4_h264_opus" or "19").
86 Technically optional, but strongly recommended.
87 * format_note Additional info about the format
88 ("3D" or "DASH video")
89 * width Width of the video, if known
90 * height Height of the video, if known
91 * resolution Textual description of width and height
92 * tbr Average bitrate of audio and video in KBit/s
93 * abr Average audio bitrate in KBit/s
94 * acodec Name of the audio codec in use
95 * asr Audio sampling rate in Hertz
96 * vbr Average video bitrate in KBit/s
98 * vcodec Name of the video codec in use
99 * container Name of the container format
100 * filesize The number of bytes, if known in advance
101 * filesize_approx An estimate for the number of bytes
102 * player_url SWF Player URL (used for rtmpdump).
103 * protocol The protocol that will be used for the actual
104 download, lower-case.
105 "http", "https", "rtsp", "rtmp", "rtmpe",
106 "m3u8", or "m3u8_native".
107 * preference Order number of this format. If this field is
108 present and not None, the formats get sorted
109 by this field, regardless of all other values.
110 -1 for default (order by other properties),
111 -2 or smaller for less than default.
112 < -1000 to hide the format (if there is
113 another one which is strictly better)
114 * language Language code, e.g. "de" or "en-US".
115 * language_preference Is this in the language mentioned in the URL?
117 10 if it's what the URL is about,
118 -1 for default (don't know),
119 -10 otherwise, other values reserved for now.
120 * quality Order number of the video quality of this
121 format, irrespective of the file format.
122 -1 for default (order by other properties),
123 -2 or smaller for less than default.
124 * source_preference Order number for this video source
125 (quality takes higher priority)
126 -1 for default (order by other properties),
127 -2 or smaller for less than default.
128 * http_headers A dictionary of additional HTTP headers
129 to add to the request.
130 * stretched_ratio If given and not 1, indicates that the
131 video's pixels are not square.
132 width : height ratio as float.
133 * no_resume The server does not support resuming the
134 (HTTP or RTMP) download. Boolean.
136 url: Final video URL.
137 ext: Video filename extension.
138 format: The video format, defaults to ext (used for --get-format)
139 player_url: SWF Player URL (used for rtmpdump).
141 The following fields are optional:
143 alt_title: A secondary title of the video.
144 display_id: An alternative identifier for the video, not necessarily
145 unique, but available before title. Typically, id is
146 something like "4234987", title "Dancing naked mole rats",
147 and display_id "dancing-naked-mole-rats"
148 thumbnails: A list of dictionaries, with the following entries:
149 * "id" (optional, string) - Thumbnail format ID
151 * "preference" (optional, int) - quality of the image
152 * "width" (optional, int)
153 * "height" (optional, int)
154 * "resolution" (optional, string "{width}x{height"},
156 thumbnail: Full URL to a video thumbnail image.
157 description: Full video description.
158 uploader: Full name of the video uploader.
159 creator: The main artist who created the video.
160 release_date: The date (YYYYMMDD) when the video was released.
161 timestamp: UNIX timestamp of the moment the video became available.
162 upload_date: Video upload date (YYYYMMDD).
163 If not explicitly set, calculated from timestamp.
164 uploader_id: Nickname or id of the video uploader.
165 location: Physical location where the video was filmed.
166 subtitles: The available subtitles as a dictionary in the format
167 {language: subformats}. "subformats" is a list sorted from
168 lower to higher preference, each element is a dictionary
169 with the "ext" entry and one of:
170 * "data": The subtitles file contents
171 * "url": A URL pointing to the subtitles file
172 "ext" will be calculated from URL if missing
173 automatic_captions: Like 'subtitles', used by the YoutubeIE for
174 automatically generated captions
175 duration: Length of the video in seconds, as an integer or float.
176 view_count: How many users have watched the video on the platform.
177 like_count: Number of positive ratings of the video
178 dislike_count: Number of negative ratings of the video
179 repost_count: Number of reposts of the video
180 average_rating: Average rating given by users; the scale used depends on the webpage
181 comment_count: Number of comments on the video
182 comments: A list of comments, each with one or more of the following
183 properties (all optional, but at least one of text or html must be present):
184 * "author" - human-readable name of the comment author
185 * "author_id" - user ID of the comment author
187 * "html" - Comment as HTML
188 * "text" - Plain text of the comment
189 * "timestamp" - UNIX timestamp of comment
190 * "parent" - ID of the comment this one is replying to.
191 Set to "root" to indicate that this is a
192 comment to the original video.
193 age_limit: Age restriction for the video, as an integer (years)
194 webpage_url: The URL to the video webpage, if given to youtube-dl it
195 should allow getting the same result again. (It will be set
196 by YoutubeDL if it's missing)
197 categories: A list of categories that the video falls in, for example
199 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
200 is_live: True, False, or None (=unknown). Whether this video is a
201 live stream that goes on instead of a fixed-length video.
202 start_time: Time in seconds where the reproduction should start, as
203 specified in the URL.
204 end_time: Time in seconds where the reproduction should end, as
205 specified in the URL.
207 The following fields should only be used when the video belongs to some logical
210 chapter: Name or title of the chapter the video belongs to.
211 chapter_number: Number of the chapter the video belongs to, as an integer.
212 chapter_id: Id of the chapter the video belongs to, as a unicode string.
214 The following fields should only be used when the video is an episode of some
217 series: Title of the series or programme the video episode belongs to.
218 season: Title of the season the video episode belongs to.
219 season_number: Number of the season the video episode belongs to, as an integer.
220 season_id: Id of the season the video episode belongs to, as a unicode string.
221 episode: Title of the video episode. Unlike mandatory video title field,
222 this field should denote the exact title of the video episode
223 without any kind of decoration.
224 episode_number: Number of the video episode within a season, as an integer.
225 episode_id: Id of the video episode, as a unicode string.
227 Unless mentioned otherwise, the fields should be Unicode strings.
229 Unless mentioned otherwise, None is equivalent to absence of information.
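For illustration only, a minimal single-video result could look roughly
like this (all values here are made up):

    {
        'id': '4234987',
        'title': 'Dancing naked mole rats',
        'formats': [{
            'url': 'https://example.com/video-360p.mp4',
            'ext': 'mp4',
            'format_id': '360p',
            'height': 360,
        }, {
            'url': 'https://example.com/video-1080p.mp4',
            'ext': 'mp4',
            'format_id': '1080p',
            'height': 1080,
        }],
        'thumbnail': 'https://example.com/thumb.jpg',
        'description': 'Naked mole rats, dancing.',
    }

Note that the formats list is ordered from worst to best quality, as
described above.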
232 _type "playlist" indicates multiple videos.
233 There must be a key "entries", which is a list, an iterable, or a PagedList
234 object, each element of which is a valid dictionary by this specification.
236 Additionally, playlists can have "title", "description" and "id" attributes
237 with the same semantics as videos (see above).
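A playlist result could therefore be sketched (hypothetical values) as:

    {
        '_type': 'playlist',
        'id': 'PL4321',
        'title': 'Mole rat compilation',
        'entries': [entry1, entry2],   # each entry a valid info dict
    }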
240 _type "multi_video" indicates that there are multiple videos that
241 form a single show, for example multiple acts of an opera or TV episode.
242 It must have an entries key like a playlist and contain all the keys
243 required for a video at the same time.
246 _type "url" indicates that the video must be extracted from another
247 location, possibly by a different extractor. Its only required key is:
248 "url" - the next URL to extract.
249 The key "ie_key" can be set to the class name (minus the trailing "IE",
250 e.g. "Youtube") if the extractor class is known in advance.
251 Additionally, the dictionary may have any properties of the resolved entity
252 known in advance, for example "title" if the title of the referred video is known.
256 _type "url_transparent" entities have the same specification as "url", but
257 indicate that the given additional information is more precise than the one
258 associated with the resolved URL.
259 This is useful when a site employs a video service that hosts the video and
260 its technical metadata, but that video service does not embed a useful
261 title, description etc.
264 Subclasses of this one should re-define the _real_initialize() and
265 _real_extract() methods and define a _VALID_URL regexp.
266 Probably, they should also be added to the list of extractors.
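A bare-bones subclass might look like this (class name, URL pattern and
regexes are purely illustrative):

    class ExampleIE(InfoExtractor):
        _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

        def _real_extract(self, url):
            video_id = self._match_id(url)
            webpage = self._download_webpage(url, video_id)
            return {
                'id': video_id,
                'title': self._og_search_title(webpage),
                'url': self._og_search_video_url(webpage),
            }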
268 Finally, the _WORKING attribute should be set to False for broken IEs
269 in order to warn the users and skip the tests.
276 def __init__(self, downloader=None):
277 """Constructor. Receives an optional downloader."""
279 self.set_downloader(downloader)
282 def suitable(cls, url):
283 """Receives a URL and returns True if suitable for this IE."""
285 # This does not use has/getattr intentionally - we want to know whether
286 # we have cached the regexp for *this* class, whereas getattr would also
287 # match the superclass
288 if '_VALID_URL_RE' not in cls.__dict__:
289 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
290 return cls._VALID_URL_RE.match(url) is not None
293 def _match_id(cls, url):
294 if '_VALID_URL_RE' not in cls.__dict__:
295 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
296 m = cls._VALID_URL_RE.match(url)
302 """Getter method for _WORKING."""
305 def initialize(self):
306 """Initializes an instance (authentication, etc)."""
308 self._real_initialize()
311 def extract(self, url):
312 """Extracts URL information and returns it in list of dicts."""
315 return self._real_extract(url)
316 except ExtractorError:
318 except compat_http_client.IncompleteRead as e:
319 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
320 except (KeyError, StopIteration) as e:
321 raise ExtractorError('An extractor error has occurred.', cause=e)
323 def set_downloader(self, downloader):
324 """Sets the downloader for this IE."""
325 self._downloader = downloader
327 def _real_initialize(self):
328 """Real initialization process. Redefine in subclasses."""
331 def _real_extract(self, url):
332 """Real extraction process. Redefine in subclasses."""
337 """A string for getting the InfoExtractor with get_info_extractor"""
338 return compat_str(cls.__name__[:-2])
342 return compat_str(type(self).__name__[:-2])
344 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
345 """ Returns the response handle """
347 self.report_download_webpage(video_id)
348 elif note is not False:
350 self.to_screen('%s' % (note,))
352 self.to_screen('%s: %s' % (video_id, note))
354 return self._downloader.urlopen(url_or_request)
355 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
359 errnote = 'Unable to download webpage'
361 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
363 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
365 self._downloader.report_warning(errmsg)
368 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
369 """ Returns a tuple (page content as string, URL handle) """
370 # Strip hashes from the URL (#1038)
371 if isinstance(url_or_request, (compat_str, str)):
372 url_or_request = url_or_request.partition('#')[0]
374 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
378 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
379 return (content, urlh)
382 def _guess_encoding_from_content(content_type, webpage_bytes):
383 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
385 encoding = m.group(1)
387 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
388 webpage_bytes[:1024])
390 encoding = m.group(1).decode('ascii')
391 elif webpage_bytes.startswith(b'\xff\xfe'):
398 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
399 content_type = urlh.headers.get('Content-Type', '')
400 webpage_bytes = urlh.read()
401 if prefix is not None:
402 webpage_bytes = prefix + webpage_bytes
404 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
405 if self._downloader.params.get('dump_intermediate_pages', False):
407 url = url_or_request.get_full_url()
408 except AttributeError:
410 self.to_screen('Dumping request to ' + url)
411 dump = base64.b64encode(webpage_bytes).decode('ascii')
412 self._downloader.to_screen(dump)
413 if self._downloader.params.get('write_pages', False):
415 url = url_or_request.get_full_url()
416 except AttributeError:
418 basen = '%s_%s' % (video_id, url)
420 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
421 basen = basen[:240 - len(h)] + h
422 raw_filename = basen + '.dump'
423 filename = sanitize_filename(raw_filename, restricted=True)
424 self.to_screen('Saving request to ' + filename)
425 # Working around MAX_PATH limitation on Windows (see
426 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
428 absfilepath = os.path.abspath(filename)
429 if len(absfilepath) > 259:
430 filename = '\\\\?\\' + absfilepath
431 with open(filename, 'wb') as outf:
432 outf.write(webpage_bytes)
435 content = webpage_bytes.decode(encoding, 'replace')
437 content = webpage_bytes.decode('utf-8', 'replace')
439 if ('<title>Access to this site is blocked</title>' in content and
440 'Websense' in content[:512]):
441 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
442 blocked_iframe = self._html_search_regex(
443 r'<iframe src="([^"]+)"', content,
444 'Websense information URL', default=None)
446 msg += ' Visit %s for more details' % blocked_iframe
447 raise ExtractorError(msg, expected=True)
448 if '<title>The URL you requested has been blocked</title>' in content[:512]:
450 'Access to this webpage has been blocked by Indian censorship. '
451 'Use a VPN or proxy server (with --proxy) to route around it.')
452 block_msg = self._html_search_regex(
453 r'</h1><p>(.*?)</p>',
454 content, 'block message', default=None)
456 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
457 raise ExtractorError(msg, expected=True)
461 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
462 """ Returns the data of the page as a string """
465 while success is False:
467 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
469 except compat_http_client.IncompleteRead as e:
471 if try_count >= tries:
473 self._sleep(timeout, video_id)
480 def _download_xml(self, url_or_request, video_id,
481 note='Downloading XML', errnote='Unable to download XML',
482 transform_source=None, fatal=True, encoding=None):
483 """Return the xml as an xml.etree.ElementTree.Element"""
484 xml_string = self._download_webpage(
485 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
486 if xml_string is False:
489 xml_string = transform_source(xml_string)
490 return compat_etree_fromstring(xml_string.encode('utf-8'))
492 def _download_json(self, url_or_request, video_id,
493 note='Downloading JSON metadata',
494 errnote='Unable to download JSON metadata',
495 transform_source=None,
496 fatal=True, encoding=None):
497 json_string = self._download_webpage(
498 url_or_request, video_id, note, errnote, fatal=fatal,
500 if (not fatal) and json_string is False:
502 return self._parse_json(
503 json_string, video_id, transform_source=transform_source, fatal=fatal)
505 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
507 json_string = transform_source(json_string)
509 return json.loads(json_string)
510 except ValueError as ve:
511 errmsg = '%s: Failed to parse JSON ' % video_id
513 raise ExtractorError(errmsg, cause=ve)
515 self.report_warning(errmsg + str(ve))
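# Typical call from an extractor subclass (URL and JSON keys below are
# hypothetical):
#
#   data = self._download_json(
#       'https://example.com/api/videos/%s' % video_id, video_id,
#       note='Downloading video metadata')
#   title = data['title']
#
# With fatal=False a failed download or a failed parse yields None instead
# of raising, so callers should check the result before indexing into it.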
517 def report_warning(self, msg, video_id=None):
518 idstr = '' if video_id is None else '%s: ' % video_id
519 self._downloader.report_warning(
520 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
522 def to_screen(self, msg):
523 """Print msg to screen, prefixing it with '[ie_name]'"""
524 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
526 def report_extraction(self, id_or_name):
527 """Report information extraction."""
528 self.to_screen('%s: Extracting information' % id_or_name)
530 def report_download_webpage(self, video_id):
531 """Report webpage download."""
532 self.to_screen('%s: Downloading webpage' % video_id)
534 def report_age_confirmation(self):
535 """Report attempt to confirm age."""
536 self.to_screen('Confirming age')
538 def report_login(self):
539 """Report attempt to log in."""
540 self.to_screen('Logging in')
543 def raise_login_required(msg='This video is only available for registered users'):
544 raise ExtractorError(
545 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
549 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
550 raise ExtractorError(
551 '%s. You might want to use --proxy to workaround.' % msg,
554 # Methods for following #608
556 def url_result(url, ie=None, video_id=None, video_title=None):
557 """Returns a URL that points to a page that should be processed"""
558 # TODO: ie should be the class used for getting the info
559 video_info = {'_type': 'url',
562 if video_id is not None:
563 video_info['id'] = video_id
564 if video_title is not None:
565 video_info['title'] = video_title
569 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
570 """Returns a playlist"""
571 video_info = {'_type': 'playlist',
574 video_info['id'] = playlist_id
576 video_info['title'] = playlist_title
577 if playlist_description:
578 video_info['description'] = playlist_description
581 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
583 Perform a regex search on the given string, using a single pattern or a
584 list of patterns, returning the first matching group.
585 In case of failure, return a default value, issue a warning, or raise a
586 RegexNotFoundError, depending on fatal, specifying the field name.
588 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
589 mobj = re.search(pattern, string, flags)
592 mobj = re.search(p, string, flags)
596 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
597 _name = '\033[0;34m%s\033[0m' % name
603 # return the first matching group
604 return next(g for g in mobj.groups() if g is not None)
606 return mobj.group(group)
607 elif default is not NO_DEFAULT:
610 raise RegexNotFoundError('Unable to extract %s' % _name)
612 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
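# Illustrative call (pattern, input and field name are made up): with
# webpage = '<span class="views">1234 views</span>',
#   self._search_regex(r'class="views">(\d+) views', webpage, 'view count')
# returns '1234'. A list of patterns is tried in order, and group= selects
# a specific capture group instead of the first matching one.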
615 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
617 Like _search_regex, but strips HTML tags and unescapes entities.
619 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
621 return clean_html(res).strip()
625 def _get_login_info(self):
627 Get the login info as (username, password)
628 It will look in the netrc file using the _NETRC_MACHINE value
629 If there's no info available, return (None, None)
631 if self._downloader is None:
636 downloader_params = self._downloader.params
638 # Attempt to use provided username and password or .netrc data
639 if downloader_params.get('username') is not None:
640 username = downloader_params['username']
641 password = downloader_params['password']
642 elif downloader_params.get('usenetrc', False):
644 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
649 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
650 except (IOError, netrc.NetrcParseError) as err:
651 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
653 return (username, password)
655 def _get_tfa_info(self, note='two-factor verification code'):
657 Get the two-factor authentication info
658 TODO - asking the user will be required for sms/phone verify
659 currently just uses the command line option
660 If there's no info available, return None
662 if self._downloader is None:
664 downloader_params = self._downloader.params
666 if downloader_params.get('twofactor') is not None:
667 return downloader_params['twofactor']
669 return compat_getpass('Type %s and press [Return]: ' % note)
671 # Helper functions for extracting OpenGraph info
673 def _og_regexes(prop):
674 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
675 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
676 % {'prop': re.escape(prop)})
677 template = r'<meta[^>]+?%s[^>]+?%s'
679 template % (property_re, content_re),
680 template % (content_re, property_re),
684 def _meta_regex(prop):
685 return r'''(?isx)<meta
686 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
687 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
689 def _og_search_property(self, prop, html, name=None, **kargs):
691 name = 'OpenGraph %s' % prop
692 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
695 return unescapeHTML(escaped)
697 def _og_search_thumbnail(self, html, **kargs):
698 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
700 def _og_search_description(self, html, **kargs):
701 return self._og_search_property('description', html, fatal=False, **kargs)
703 def _og_search_title(self, html, **kargs):
704 return self._og_search_property('title', html, **kargs)
706 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
707 regexes = self._og_regexes('video') + self._og_regexes('video:url')
709 regexes = self._og_regexes('video:secure_url') + regexes
710 return self._html_search_regex(regexes, html, name, **kargs)
712 def _og_search_url(self, html, **kargs):
713 return self._og_search_property('url', html, **kargs)
715 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
716 if display_name is None:
718 return self._html_search_regex(
719 self._meta_regex(name),
720 html, display_name, fatal=fatal, group='content', **kwargs)
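# Illustrative example: for html containing
#   <meta property="og:title" content="Dancing naked mole rats">
# self._og_search_title(html) returns 'Dancing naked mole rats'; similarly,
# self._html_search_meta('description', html) would pick up a
# <meta name="description" content="..."> tag.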
722 def _dc_search_uploader(self, html):
723 return self._html_search_meta('dc.creator', html, 'uploader')
725 def _rta_search(self, html):
726 # See http://www.rtalabel.org/index.php?content=howtofaq#single
727 if re.search(r'(?ix)<meta\s+name="rating"\s+'
728 r' content="RTA-5042-1996-1400-1577-RTA"',
733 def _media_rating_search(self, html):
734 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
735 rating = self._html_search_meta('rating', html)
747 return RATING_TABLE.get(rating.lower())
749 def _family_friendly_search(self, html):
750 # See http://schema.org/VideoObject
751 family_friendly = self._html_search_meta('isFamilyFriendly', html)
753 if not family_friendly:
762 return RATING_TABLE.get(family_friendly.lower())
764 def _twitter_search_player(self, html):
765 return self._html_search_meta('twitter:player', html,
766 'twitter card player')
768 def _search_json_ld(self, html, video_id, **kwargs):
769 json_ld = self._search_regex(
770 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
771 html, 'JSON-LD', group='json_ld', **kwargs)
774 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
776 def _json_ld(self, json_ld, video_id, fatal=True):
777 if isinstance(json_ld, compat_str):
778 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
782 if json_ld.get('@context') == 'http://schema.org':
783 item_type = json_ld.get('@type')
784 if item_type == 'TVEpisode':
786 'episode': unescapeHTML(json_ld.get('name')),
787 'episode_number': int_or_none(json_ld.get('episodeNumber')),
788 'description': unescapeHTML(json_ld.get('description')),
790 part_of_season = json_ld.get('partOfSeason')
791 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
792 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
793 part_of_series = json_ld.get('partOfSeries')
794 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
795 info['series'] = unescapeHTML(part_of_series.get('name'))
796 elif item_type == 'Article':
798 'timestamp': parse_iso8601(json_ld.get('datePublished')),
799 'title': unescapeHTML(json_ld.get('headline')),
800 'description': unescapeHTML(json_ld.get('articleBody')),
802 return dict((k, v) for k, v in info.items() if v is not None)
805 def _hidden_inputs(html):
806 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
808 for input in re.findall(r'(?i)<input([^>]+)>', html):
809 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
811 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
814 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
817 hidden_inputs[name.group('value')] = value.group('value')
820 def _form_hidden_inputs(self, form_id, html):
821 form = self._search_regex(
822 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
823 html, '%s form' % form_id, group='form')
824 return self._hidden_inputs(form)
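# Sketch of what _hidden_inputs() produces (markup is hypothetical): for
#   <input type="hidden" name="csrf_token" value="abc123">
#   <input type="hidden" name="session" value="xyz">
# it returns {'csrf_token': 'abc123', 'session': 'xyz'}, which login-style
# extractors typically merge into the data of a follow-up POST request.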
826 def _sort_formats(self, formats, field_preference=None):
828 raise ExtractorError('No video formats found')
831 # Automatically determine tbr when missing based on abr and vbr (improves
832 # formats sorting in some cases)
833 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
834 f['tbr'] = f['abr'] + f['vbr']
837 # TODO remove the following workaround
838 from ..utils import determine_ext
839 if not f.get('ext') and 'url' in f:
840 f['ext'] = determine_ext(f['url'])
842 if isinstance(field_preference, (list, tuple)):
843 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
845 preference = f.get('preference')
846 if preference is None:
848 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
851 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
853 if f.get('vcodec') == 'none': # audio only
854 if self._downloader.params.get('prefer_free_formats'):
855 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
857 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
860 audio_ext_preference = ORDER.index(f['ext'])
862 audio_ext_preference = -1
864 if self._downloader.params.get('prefer_free_formats'):
865 ORDER = ['flv', 'mp4', 'webm']
867 ORDER = ['webm', 'flv', 'mp4']
869 ext_preference = ORDER.index(f['ext'])
872 audio_ext_preference = 0
876 f.get('language_preference') if f.get('language_preference') is not None else -1,
877 f.get('quality') if f.get('quality') is not None else -1,
878 f.get('tbr') if f.get('tbr') is not None else -1,
879 f.get('filesize') if f.get('filesize') is not None else -1,
880 f.get('vbr') if f.get('vbr') is not None else -1,
881 f.get('height') if f.get('height') is not None else -1,
882 f.get('width') if f.get('width') is not None else -1,
885 f.get('abr') if f.get('abr') is not None else -1,
886 audio_ext_preference,
887 f.get('fps') if f.get('fps') is not None else -1,
888 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
889 f.get('source_preference') if f.get('source_preference') is not None else -1,
890 f.get('format_id') if f.get('format_id') is not None else '',
892 formats.sort(key=_formats_key)
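# Usage sketch: an extractor fills in the formats list and then calls
#   self._sort_formats(formats)
# so that, all else being equal, e.g. a 1080p entry sorts after a 360p one
# (the list stays worst-first, as the class docstring requires). Explicit
# 'preference' values take precedence over all the other heuristics above.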
894 def _check_formats(self, formats, video_id):
897 lambda f: self._is_valid_url(
899 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
902 def _is_valid_url(self, url, video_id, item='video'):
903 url = self._proto_relative_url(url, scheme='http:')
904 # For now assume non HTTP(S) URLs are always valid
905 if not (url.startswith('http://') or url.startswith('https://')):
908 self._request_webpage(url, video_id, 'Checking %s URL' % item)
910 except ExtractorError as e:
911 if isinstance(e.cause, compat_urllib_error.URLError):
913 '%s: %s URL is invalid, skipping' % (video_id, item))
917 def http_scheme(self):
918 """ Either "http:" or "https:", depending on the user's preferences """
921 if self._downloader.params.get('prefer_insecure', False)
924 def _proto_relative_url(self, url, scheme=None):
927 if url.startswith('//'):
929 scheme = self.http_scheme()
934 def _sleep(self, timeout, video_id, msg_template=None):
935 if msg_template is None:
936 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
937 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
941 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
942 transform_source=lambda s: fix_xml_ampersands(s).strip(),
944 manifest = self._download_xml(
945 manifest_url, video_id, 'Downloading f4m manifest',
946 'Unable to download f4m manifest',
947 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
948 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
949 transform_source=transform_source,
952 if manifest is False:
956 manifest_version = '1.0'
957 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
959 manifest_version = '2.0'
960 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
961 base_url = xpath_text(
962 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
963 'base URL', default=None)
965 base_url = base_url.strip()
966 for i, media_el in enumerate(media_nodes):
967 if manifest_version == '2.0':
968 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
972 media_url if media_url.startswith('http://') or media_url.startswith('https://')
973 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
974 # If media_url is itself an f4m manifest, do the recursive extraction,
975 # since bitrates in the parent manifest (this one) and the media_url manifest
976 # may differ, leading to an inability to resolve the format by the requested
977 # bitrate in the f4m downloader
978 if determine_ext(manifest_url) == 'f4m':
979 formats.extend(self._extract_f4m_formats(
980 manifest_url, video_id, preference, f4m_id, fatal=fatal))
982 tbr = int_or_none(media_el.attrib.get('bitrate'))
984 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
988 'width': int_or_none(media_el.attrib.get('width')),
989 'height': int_or_none(media_el.attrib.get('height')),
990 'preference': preference,
992 self._sort_formats(formats)
996 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
997 entry_protocol='m3u8', preference=None,
998 m3u8_id=None, note=None, errnote=None,
1002 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1006 'preference': preference - 1 if preference else -1,
1007 'resolution': 'multiple',
1008 'format_note': 'Quality selection URL',
1011 format_url = lambda u: (
1013 if re.match(r'^https?://', u)
1014 else compat_urlparse.urljoin(m3u8_url, u))
1016 res = self._download_webpage_handle(
1018 note=note or 'Downloading m3u8 information',
1019 errnote=errnote or 'Failed to download m3u8 information',
1023 m3u8_doc, urlh = res
1024 m3u8_url = urlh.geturl()
1025 # A Media Playlist Tag MUST NOT appear in a Master Playlist
1026 # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1027 # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
1028 # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1029 if '#EXT-X-TARGETDURATION' in m3u8_doc:
1032 'format_id': m3u8_id,
1034 'protocol': entry_protocol,
1035 'preference': preference,
1039 kv_rex = re.compile(
1040 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1041 for line in m3u8_doc.splitlines():
1042 if line.startswith('#EXT-X-STREAM-INF:'):
1044 for m in kv_rex.finditer(line):
1046 if v.startswith('"'):
1048 last_info[m.group('key')] = v
1049 elif line.startswith('#EXT-X-MEDIA:'):
1051 for m in kv_rex.finditer(line):
1053 if v.startswith('"'):
1055 last_media[m.group('key')] = v
1056 elif line.startswith('#') or not line.strip():
1059 if last_info is None:
1060 formats.append({'url': format_url(line)})
1062 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1065 format_id.append(m3u8_id)
1066 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1067 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1069 'format_id': '-'.join(format_id),
1070 'url': format_url(line.strip()),
1073 'protocol': entry_protocol,
1074 'preference': preference,
1076 codecs = last_info.get('CODECS')
1078 # TODO: it looks like the video codec does not always go first
1079 va_codecs = codecs.split(',')
1081 f['vcodec'] = va_codecs[0]
1082 if len(va_codecs) > 1 and va_codecs[1]:
1083 f['acodec'] = va_codecs[1]
1084 resolution = last_info.get('RESOLUTION')
1086 width_str, height_str = resolution.split('x')
1087 f['width'] = int(width_str)
1088 f['height'] = int(height_str)
1089 if last_media is not None:
1090 f['m3u8_media'] = last_media
1094 self._sort_formats(formats)
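# For orientation, a master playlist entry such as (example data)
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360,CODECS="avc1.42e00a,mp4a.40.2"
#   low/index.m3u8
# becomes a format with tbr=1280, width=640, height=360,
# vcodec='avc1.42e00a' and acodec='mp4a.40.2', with the relative URI
# resolved against m3u8_url.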
1098 def _xpath_ns(path, namespace=None):
1102 for c in path.split('/'):
1103 if not c or c == '.':
1106 out.append('{%s}%s' % (namespace, c))
1107 return '/'.join(out)
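# For instance, _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
# yields './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta',
# the namespaced path form that xml.etree.ElementTree expects.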
1109 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1110 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1116 namespace = self._parse_smil_namespace(smil)
1118 return self._parse_smil_formats(
1119 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1121 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1122 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1125 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1127 def _download_smil(self, smil_url, video_id, fatal=True):
1128 return self._download_xml(
1129 smil_url, video_id, 'Downloading SMIL file',
1130 'Unable to download SMIL file', fatal=fatal)
1132 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1133 namespace = self._parse_smil_namespace(smil)
1135 formats = self._parse_smil_formats(
1136 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1137 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1139 video_id = os.path.splitext(url_basename(smil_url))[0]
1143 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1144 name = meta.attrib.get('name')
1145 content = meta.attrib.get('content')
1146 if not name or not content:
1148 if not title and name == 'title':
1150 elif not description and name in ('description', 'abstract'):
1151 description = content
1152 elif not upload_date and name == 'date':
1153 upload_date = unified_strdate(content)
1156 'id': image.get('type'),
1157 'url': image.get('src'),
1158 'width': int_or_none(image.get('width')),
1159 'height': int_or_none(image.get('height')),
1160 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1164 'title': title or video_id,
1165 'description': description,
1166 'upload_date': upload_date,
1167 'thumbnails': thumbnails,
1169 'subtitles': subtitles,
1172 def _parse_smil_namespace(self, smil):
1173 return self._search_regex(
1174 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1176 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1178 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1179 b = meta.get('base') or meta.get('httpBase')
1190 videos = smil.findall(self._xpath_ns('.//video', namespace))
1191 for video in videos:
1192 src = video.get('src')
1193 if not src or src in srcs:
1197 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1198 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1199 width = int_or_none(video.get('width'))
1200 height = int_or_none(video.get('height'))
1201 proto = video.get('proto')
1202 ext = video.get('ext')
1203 src_ext = determine_ext(src)
1204 streamer = video.get('streamer') or base
1206 if proto == 'rtmp' or streamer.startswith('rtmp'):
1212 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1214 'filesize': filesize,
1218 if transform_rtmp_url:
1219 streamer, src = transform_rtmp_url(streamer, src)
1220 formats[-1].update({
1226 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1227 src_url = src_url.strip()
1229 if proto == 'm3u8' or src_ext == 'm3u8':
1230 m3u8_formats = self._extract_m3u8_formats(
1231 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1232 if len(m3u8_formats) == 1:
1234 m3u8_formats[0].update({
1235 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1240 formats.extend(m3u8_formats)
1243 if src_ext == 'f4m':
1248 'plugin': 'flowplayer-3.2.0.1',
1250 f4m_url += '&' if '?' in f4m_url else '?'
1251 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1252 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1255 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1259 'ext': ext or src_ext or 'flv',
1260 'format_id': 'http-%d' % (bitrate or http_count),
1262 'filesize': filesize,
1268 self._sort_formats(formats)
1272 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1275 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1276 src = textstream.get('src')
1277 if not src or src in urls:
1280 ext = textstream.get('ext') or determine_ext(src)
1282 type_ = textstream.get('type')
1286 'application/smptett+xml': 'tt',
1288 if type_ in SUBTITLES_TYPES:
1289 ext = SUBTITLES_TYPES[type_]
1290 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1291 subtitles.setdefault(lang, []).append({
1297 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1298 xspf = self._download_xml(
1299 playlist_url, playlist_id, 'Downloading xpsf playlist',
1300 'Unable to download xspf manifest', fatal=fatal)
1303 return self._parse_xspf(xspf, playlist_id)
1305 def _parse_xspf(self, playlist, playlist_id):
1307 'xspf': 'http://xspf.org/ns/0/',
1308 's1': 'http://static.streamone.nl/player/ns/0',
1312 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1314 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1315 description = xpath_text(
1316 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1317 thumbnail = xpath_text(
1318 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1319 duration = float_or_none(
1320 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1323 'url': location.text,
1324 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1325 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1326 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1327 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1328 self._sort_formats(formats)
1333 'description': description,
1334 'thumbnail': thumbnail,
1335 'duration': duration,
1340 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1341 res = self._download_webpage_handle(
1343 note=note or 'Downloading MPD manifest',
1344 errnote=errnote or 'Failed to download MPD manifest',
1349 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1351 return self._parse_mpd_formats(
1352 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1354 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1355 if mpd_doc.get('type') == 'dynamic':
1358 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1361 return self._xpath_ns(path, namespace)
1363 def is_drm_protected(element):
1364 return element.find(_add_ns('ContentProtection')) is not None
1366 def extract_multisegment_info(element, ms_parent_info):
1367 ms_info = ms_parent_info.copy()
1368 segment_list = element.find(_add_ns('SegmentList'))
1369 if segment_list is not None:
1370 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1372 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1373 initialization = segment_list.find(_add_ns('Initialization'))
1374 if initialization is not None:
1375 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1377 segment_template = element.find(_add_ns('SegmentTemplate'))
1378 if segment_template is not None:
1379 start_number = segment_template.get('startNumber')
1381 ms_info['start_number'] = int(start_number)
1382 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1383 if segment_timeline is not None:
1384 s_e = segment_timeline.findall(_add_ns('S'))
1386 ms_info['total_number'] = 0
1388 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1390 timescale = segment_template.get('timescale')
1392 ms_info['timescale'] = int(timescale)
1393 segment_duration = segment_template.get('duration')
1394 if segment_duration:
1395 ms_info['segment_duration'] = int(segment_duration)
1396 media_template = segment_template.get('media')
1398 ms_info['media_template'] = media_template
1399 initialization = segment_template.get('initialization')
1401 ms_info['initialization_url'] = initialization
1403 initialization = segment_template.find(_add_ns('Initialization'))
1404 if initialization is not None:
1405 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1408 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1410 for period in mpd_doc.findall(_add_ns('Period')):
1411 period_duration = parse_duration(period.get('duration')) or mpd_duration
1412 period_ms_info = extract_multisegment_info(period, {
1416 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1417 if is_drm_protected(adaptation_set):
1419 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1420 for representation in adaptation_set.findall(_add_ns('Representation')):
1421 if is_drm_protected(representation):
1423 representation_attrib = adaptation_set.attrib.copy()
1424 representation_attrib.update(representation.attrib)
1425 mime_type = representation_attrib.get('mimeType')
1426 content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1427 if content_type == 'text':
1428 # TODO implement WebVTT downloading
1430 elif content_type == 'video' or content_type == 'audio':
1432 for element in (representation, adaptation_set, period, mpd_doc):
1433 base_url_e = element.find(_add_ns('BaseURL'))
1434 if base_url_e is not None:
1435 base_url = base_url_e.text + base_url
1436 if re.match(r'^https?://', base_url):
1438 if mpd_base_url and not re.match(r'^https?://', base_url):
1439 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1441 base_url = mpd_base_url + base_url
1442 representation_id = representation_attrib.get('id')
1443 lang = representation_attrib.get('lang')
1444 url_el = representation.find(_add_ns('BaseURL'))
1445 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1447 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1449 'width': int_or_none(representation_attrib.get('width')),
1450 'height': int_or_none(representation_attrib.get('height')),
1451 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1452 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1453 'fps': int_or_none(representation_attrib.get('frameRate')),
1454 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1455 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1456 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1457 'format_note': 'DASH %s' % content_type,
1458 'filesize': filesize,
1460 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1461 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1462 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1463 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1464 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1465 media_template = representation_ms_info['media_template']
1466 media_template = media_template.replace('$RepresentationID$', representation_id)
1467 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
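# e.g. a hypothetical template 'seg-$Number%05d$.m4s' has just been turned
# into 'seg-%(Number)05d.m4s', so the per-segment URLs can be generated
# below with plain %-formatting ('seg-00001.m4s', 'seg-00002.m4s', ...).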
1468 media_template = media_template.replace('$$', '$')
1469 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1470 if 'segment_urls' in representation_ms_info:
1472 'segment_urls': representation_ms_info['segment_urls'],
1473 'protocol': 'http_dash_segments',
1475 if 'initialization_url' in representation_ms_info:
1476 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1478 'initialization_url': initialization_url,
1480 if not f.get('url'):
1481 f['url'] = initialization_url
1483 existing_format = next(
1484 fo for fo in formats
1485 if fo['format_id'] == representation_id)
1486 except StopIteration:
1487 full_info = formats_dict.get(representation_id, {}).copy()
1489 formats.append(full_info)
1491 existing_format.update(f)
1493 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1494 self._sort_formats(formats)
1497 def _live_title(self, name):
1498 """ Generate the title for a live video """
1499 now = datetime.datetime.now()
1500 now_str = now.strftime('%Y-%m-%d %H:%M')
1501 return name + ' ' + now_str
1503 def _int(self, v, name, fatal=False, **kwargs):
1504 res = int_or_none(v, **kwargs)
1505 if 'get_attr' in kwargs:
1506 print(getattr(v, kwargs['get_attr']))
1508 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1510 raise ExtractorError(msg)
1512 self._downloader.report_warning(msg)
1515 def _float(self, v, name, fatal=False, **kwargs):
1516 res = float_or_none(v, **kwargs)
1518 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1520 raise ExtractorError(msg)
1522 self._downloader.report_warning(msg)
1525 def _set_cookie(self, domain, name, value, expire_time=None):
1526 cookie = compat_cookiejar.Cookie(
1527 0, name, value, None, None, domain, None,
1528 None, '/', True, False, expire_time, '', None, None, None)
1529 self._downloader.cookiejar.set_cookie(cookie)
1531 def _get_cookies(self, url):
1532 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1533 req = sanitized_Request(url)
1534 self._downloader.cookiejar.add_cookie_header(req)
1535 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1537 def get_testcases(self, include_onlymatching=False):
1538 t = getattr(self, '_TEST', None)
1540 assert not hasattr(self, '_TESTS'), \
1541 '%s has _TEST and _TESTS' % type(self).__name__
1544 tests = getattr(self, '_TESTS', [])
1546 if not include_onlymatching and t.get('only_matching', False):
1548 t['name'] = type(self).__name__[:-len('IE')]
1551 def is_suitable(self, age_limit):
1552 """ Test whether the extractor is generally suitable for the given
1553 age limit (i.e. pornographic sites are not, all others usually are) """
1555 any_restricted = False
1556 for tc in self.get_testcases(include_onlymatching=False):
1557 if 'playlist' in tc:
1558 tc = tc['playlist'][0]
1559 is_restricted = age_restricted(
1560 tc.get('info_dict', {}).get('age_limit'), age_limit)
1561 if not is_restricted:
1563 any_restricted = any_restricted or is_restricted
1564 return not any_restricted
1566 def extract_subtitles(self, *args, **kwargs):
1567 if (self._downloader.params.get('writesubtitles', False) or
1568 self._downloader.params.get('listsubtitles')):
1569 return self._get_subtitles(*args, **kwargs)
1572 def _get_subtitles(self, *args, **kwargs):
1573 raise NotImplementedError('This method must be implemented by subclasses')
1576 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1577 """ Merge subtitle items for one language. Items with duplicated URLs
1578 will be dropped. """
1579 list1_urls = set([item['url'] for item in subtitle_list1])
1580 ret = list(subtitle_list1)
1581 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1585 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1586 """ Merge two subtitle dictionaries, language by language. """
1587 ret = dict(subtitle_dict1)
1588 for lang in subtitle_dict2:
1589 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1592 def extract_automatic_captions(self, *args, **kwargs):
1593 if (self._downloader.params.get('writeautomaticsub', False) or
1594 self._downloader.params.get('listsubtitles')):
1595 return self._get_automatic_captions(*args, **kwargs)
1598 def _get_automatic_captions(self, *args, **kwargs):
1599 raise NotImplementedError('This method must be implemented by subclasses')
1602 class SearchInfoExtractor(InfoExtractor):
1604 Base class for paged search query extractors.
1605 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1606 Instances should define _SEARCH_KEY and _MAX_RESULTS.
1610 def _make_valid_url(cls):
1611 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1614 def suitable(cls, url):
1615 return re.match(cls._make_valid_url(), url) is not None
1617 def _real_extract(self, query):
1618 mobj = re.match(self._make_valid_url(), query)
1620 raise ExtractorError('Invalid search query "%s"' % query)
1622 prefix = mobj.group('prefix')
1623 query = mobj.group('query')
1625 return self._get_n_results(query, 1)
1626 elif prefix == 'all':
1627 return self._get_n_results(query, self._MAX_RESULTS)
1631 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1632 elif n > self._MAX_RESULTS:
1633 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1634 n = self._MAX_RESULTS
1635 return self._get_n_results(query, n)
1637 def _get_n_results(self, query, n):
1638 """Get a specified number of results for a query"""
1639 raise NotImplementedError('This method must be implemented by subclasses')
1642 def SEARCH_KEY(self):
1643 return self._SEARCH_KEY
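# Usage sketch (names are illustrative): a subclass such as
#
#   class ExampleSearchIE(SearchInfoExtractor):
#       _SEARCH_KEY = 'examplesearch'
#       _MAX_RESULTS = 100
#
#       def _get_n_results(self, query, n):
#           entries = ...  # query the site's search API for up to n results
#           return self.playlist_result(entries, playlist_title=query)
#
# accepts queries like 'examplesearch5:mole rats', where the numeric prefix
# (or 'all') selects how many results to fetch.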