1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
22 compat_urllib_parse_urlparse,
23 compat_urllib_request,
47 class InfoExtractor(object):
48 """Information Extractor class.
50 Information extractors are the classes that, given a URL, extract
51 information about the video (or videos) the URL refers to. This
52 information includes the real video URL, the video title, author and
53 others. The information is stored in a dictionary which is then
54 passed to the YoutubeDL. The YoutubeDL processes this
55 information possibly downloading the video to the file system, among
56 other possible outcomes.
58 The type field determines the type of the result.
59 By far the most common value (and the default if _type is missing) is
60 "video", which indicates a single video.
62 For a video, the dictionaries must include the following fields:
65 title: Video title, unescaped.
67 Additionally, it must contain either a formats entry or a url one:
69 formats: A list of dictionaries for each format available, ordered
70 from worst to best quality.
73 * url Mandatory. The URL of the video file
74 * ext Will be calculated from URL if missing
75 * format A human-readable description of the format
76 ("mp4 container with h264/opus").
77 Calculated from the format_id, width, height.
78 and format_note fields if missing.
79 * format_id A short description of the format
80 ("mp4_h264_opus" or "19").
81 Technically optional, but strongly recommended.
82 * format_note Additional info about the format
83 ("3D" or "DASH video")
84 * width Width of the video, if known
85 * height Height of the video, if known
86 * resolution Textual description of width and height
87 * tbr Average bitrate of audio and video in KBit/s
88 * abr Average audio bitrate in KBit/s
89 * acodec Name of the audio codec in use
90 * asr Audio sampling rate in Hertz
91 * vbr Average video bitrate in KBit/s
93 * vcodec Name of the video codec in use
94 * container Name of the container format
95 * filesize The number of bytes, if known in advance
96 * filesize_approx An estimate for the number of bytes
97 * player_url SWF Player URL (used for rtmpdump).
98 * protocol The protocol that will be used for the actual
100 "http", "https", "rtsp", "rtmp", "rtmpe",
101 "m3u8", or "m3u8_native".
102 * preference Order number of this format. If this field is
103 present and not None, the formats get sorted
104 by this field, regardless of all other values.
105 -1 for default (order by other properties),
106 -2 or smaller for less than default.
107 < -1000 to hide the format (if there is
108 another one which is strictly better)
109 * language_preference Is this in the correct requested
111 10 if it's what the URL is about,
112 -1 for default (don't know),
113 -10 otherwise, other values reserved for now.
114 * quality Order number of the video quality of this
115 format, irrespective of the file format.
116 -1 for default (order by other properties),
117 -2 or smaller for less than default.
118 * source_preference Order number for this video source
119 (quality takes higher priority)
120 -1 for default (order by other properties),
121 -2 or smaller for less than default.
122 * http_headers A dictionary of additional HTTP headers
123 to add to the request.
124 * stretched_ratio If given and not 1, indicates that the
125 video's pixels are not square.
126 width : height ratio as float.
127 * no_resume The server does not support resuming the
128 (HTTP or RTMP) download. Boolean.
130 url: Final video URL.
131 ext: Video filename extension.
132 format: The video format, defaults to ext (used for --get-format)
133 player_url: SWF Player URL (used for rtmpdump).
135 The following fields are optional:
137 alt_title: A secondary title of the video.
138 display_id An alternative identifier for the video, not necessarily
139 unique, but available before title. Typically, id is
140 something like "4234987", title "Dancing naked mole rats",
141 and display_id "dancing-naked-mole-rats"
142 thumbnails: A list of dictionaries, with the following entries:
143 * "id" (optional, string) - Thumbnail format ID
145 * "preference" (optional, int) - quality of the image
146 * "width" (optional, int)
147 * "height" (optional, int)
148 * "resolution" (optional, string "{width}x{height}",
150 thumbnail: Full URL to a video thumbnail image.
151 description: Full video description.
152 uploader: Full name of the video uploader.
153 creator: The main artist who created the video.
154 timestamp: UNIX timestamp of the moment the video became available.
155 upload_date: Video upload date (YYYYMMDD).
156 If not explicitly set, calculated from timestamp.
157 uploader_id: Nickname or id of the video uploader.
158 location: Physical location where the video was filmed.
159 subtitles: The available subtitles as a dictionary in the format
160 {language: subformats}. "subformats" is a list sorted from
161 lower to higher preference, each element is a dictionary
162 with the "ext" entry and one of:
163 * "data": The subtitles file contents
164 * "url": A URL pointing to the subtitles file
165 automatic_captions: Like 'subtitles', used by the YoutubeIE for
166 automatically generated captions
167 duration: Length of the video in seconds, as an integer.
168 view_count: How many users have watched the video on the platform.
169 like_count: Number of positive ratings of the video
170 dislike_count: Number of negative ratings of the video
171 average_rating: Average rating given by users, the scale used depends on the webpage
172 comment_count: Number of comments on the video
173 comments: A list of comments, each with one or more of the following
174 properties (all but one of text or html optional):
175 * "author" - human-readable name of the comment author
176 * "author_id" - user ID of the comment author
178 * "html" - Comment as HTML
179 * "text" - Plain text of the comment
180 * "timestamp" - UNIX timestamp of comment
181 * "parent" - ID of the comment this one is replying to.
182 Set to "root" to indicate that this is a
183 comment to the original video.
184 age_limit: Age restriction for the video, as an integer (years)
185 webpage_url: The URL to the video webpage, if given to youtube-dl it
186 should allow to get the same result again. (It will be set
187 by YoutubeDL if it's missing)
188 categories: A list of categories that the video falls in, for example
190 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
191 is_live: True, False, or None (=unknown). Whether this video is a
192 live stream that goes on instead of a fixed-length video.
193 start_time: Time in seconds where the reproduction should start, as
194 specified in the URL.
195 end_time: Time in seconds where the reproduction should end, as
196 specified in the URL.
198 Unless mentioned otherwise, the fields should be Unicode strings.
200 Unless mentioned otherwise, None is equivalent to absence of information.
203 _type "playlist" indicates multiple videos.
204 There must be a key "entries", which is a list, an iterable, or a PagedList
205 object, each element of which is a valid dictionary by this specification.
207 Additionally, playlists can have "title", "description" and "id" attributes
208 with the same semantics as videos (see above).
211 _type "multi_video" indicates that there are multiple videos that
212 form a single show, for example, multiple acts of an opera or TV episode.
213 It must have an entries key like a playlist and contain all the keys
214 required for a video at the same time.
217 _type "url" indicates that the video must be extracted from another
218 location, possibly by a different extractor. Its only required key is:
219 "url" - the next URL to extract.
220 The key "ie_key" can be set to the class name (minus the trailing "IE",
221 e.g. "Youtube") if the extractor class is known in advance.
222 Additionally, the dictionary may have any properties of the resolved entity
223 known in advance, for example "title" if the title of the referred video is
227 _type "url_transparent" entities have the same specification as "url", but
228 indicate that the given additional information is more precise than the one
229 associated with the resolved URL.
230 This is useful when a site employs a video service that hosts the video and
231 its technical metadata, but that video service does not embed a useful
232 title, description etc.
235 Subclasses of this one should re-define the _real_initialize() and
236 _real_extract() methods and define a _VALID_URL regexp.
237 Probably, they should also be added to the list of extractors.
239 Finally, the _WORKING attribute should be set to False for broken IEs
240 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance)."""
        # The downloader is used for all screen output and network access.
        self.set_downloader(downloader)
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
256 # This does not use has/getattr intentionally - we want to know whether
257 # we have cached the regexp for *this* class, whereas getattr would also
258 # match the superclass
259 if '_VALID_URL_RE' not in cls.__dict__:
260 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
261 return cls._VALID_URL_RE.match(url) is not None
    def _match_id(cls, url):
        # Match *url* against the compiled _VALID_URL, caching the regexp on
        # this exact class (same cls.__dict__ trick as in suitable()).
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
273 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Delegates to _real_initialize(), which subclasses redefine.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
            return self._real_extract(url)
        except ExtractorError:
        except compat_http_client.IncompleteRead as e:
            # NOTE(review): "occured" is a typo for "occurred" in these two
            # user-facing messages; left as-is since fixing it changes output.
            raise ExtractorError('A network error has occured.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occured.', cause=e)
294 def set_downloader(self, downloader):
295 """Sets the downloader for this IE."""
296 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally does nothing in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally does nothing in the base class.
308 """A string for getting the InfoExtractor with get_info_extractor"""
309 return cls.__name__[:-2]
313 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # note=None -> default "Downloading webpage" message;
        # note=False -> suppress output entirely.
            self.report_download_webpage(video_id)
        elif note is not False:
                # One branch prints the note bare (video_id unavailable),
                # the other prefixes it with the video id.
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
                # On fatal errors re-raise, keeping the original traceback.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
                self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]
        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
    def _guess_encoding_from_content(content_type, webpage_bytes):
        """Guess the page encoding from the Content-Type header, a <meta>
        charset declaration, or a byte-order mark, in that order."""
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Fall back to a charset= attribute in the first 1 KiB of HTML.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
            # UTF-16 LE byte-order mark.
            elif webpage_bytes.startswith(b'\xff\xfe'):
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read and decode the body of *urlh*, honoring debug dump options
        and detecting censorship/filtering block pages."""
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        # --dump-pages: print the page base64-encoded.
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # --write-pages: save the raw page to a file for debugging.
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
                # Keep the name under 240 chars, disambiguating with an md5.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)
            content = webpage_bytes.decode(encoding, 'replace')
            content = webpage_bytes.decode('utf-8', 'replace')
        # Detect a Websense filtering block page.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Detect an Indian-censorship block page.
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
        """ Returns the data of the page as a string """
        # Retries up to *tries* times on truncated reads, sleeping *timeout*
        # seconds between attempts.
        while success is False:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
            except compat_http_client.IncompleteRead as e:
                if try_count >= tries:
                self._sleep(timeout, video_id)
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
        # A False download result means a non-fatal failure upstream.
        if xml_string is False:
            # Optional hook to clean up malformed XML before parsing.
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None):
        """Download a page and parse it as JSON (see _parse_json)."""
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
        # Propagate a non-fatal download failure as-is.
        if (not fatal) and json_string is False:
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        """Parse *json_string*; on failure raise (fatal) or warn (non-fatal)."""
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
                raise ExtractorError(errmsg, cause=ve)
                self.report_warning(errmsg + str(ve))
487 def report_warning(self, msg, video_id=None):
488 idstr = '' if video_id is None else '%s: ' % video_id
489 self._downloader.report_warning(
490 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
492 def to_screen(self, msg):
493 """Print msg to screen, prefixing it with '[ie_name]'"""
494 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
496 def report_extraction(self, id_or_name):
497 """Report information extraction."""
498 self.to_screen('%s: Extracting information' % id_or_name)
500 def report_download_webpage(self, video_id):
501 """Report webpage download."""
502 self.to_screen('%s: Downloading webpage' % video_id)
504 def report_age_confirmation(self):
505 """Report attempt to confirm age."""
506 self.to_screen('Confirming age')
508 def report_login(self):
509 """Report attempt to log in."""
510 self.to_screen('Logging in')
512 # Methods for following #608
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # id/title are optional hints about the target entity.
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # All three metadata fields are optional.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
                # pattern is a list: try each until one matches.
                mobj = re.search(p, string, flags)
        # Colorize the field name on capable terminals.
        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        Like _search_regex, but strips HTML tags and unescapes entities.
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
            return clean_html(res).strip()
    def _get_login_info(self):
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are warnings, not fatal errors.
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
        return (username, password)
    def _get_tfa_info(self):
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        if self._downloader is None:
        downloader_params = self._downloader.params
        # --twofactor command line option.
        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']
629 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        """Build regexes matching an OpenGraph og:<prop> meta tag, covering
        both attribute orders (property before content and vice versa)."""
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),
    def _meta_regex(prop):
        # Verbose, case-insensitive, dotall regex matching a <meta> tag whose
        # itemprop/name/property/id equals *prop*; the value is captured in
        # the named group "content".
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search *html* for the OpenGraph property *prop* and return its
        unescaped value."""
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
            return unescapeHTML(escaped)
654 def _og_search_thumbnail(self, html, **kargs):
655 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
657 def _og_search_description(self, html, **kargs):
658 return self._og_search_property('description', html, fatal=False, **kargs)
660 def _og_search_title(self, html, **kargs):
661 return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Find a video URL via og:video/og:video:url, preferring the
        og:video:secure_url variant when *secure* is true."""
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
            # Secure variant is tried first.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
669 def _og_search_url(self, html, **kargs):
670 return self._og_search_property('url', html, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """Return the content attribute of the <meta> tag named *name*."""
        if display_name is None:
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
679 def _dc_search_uploader(self, html):
680 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        """Check for the RTA (Restricted To Adults) label meta tag."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # Map the textual rating to an age limit; unknown ratings yield None.
        return RATING_TABLE.get(rating.lower(), None)
    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
        # Map the schema.org value to an age limit; unknown values yield None.
        return RATING_TABLE.get(family_friendly.lower(), None)
721 def _twitter_search_player(self, html):
722 return self._html_search_meta('twitter:player', html,
723 'twitter card player')
    def _hidden_inputs(html):
        # Collect name/value pairs from <input type="hidden" ...> elements.
            (input.group('name'), input.group('value')) for input in re.finditer(
                    type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
                    name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
                    (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
                    value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
738 def _form_hidden_inputs(self, form_id, html):
739 form = self._search_regex(
740 r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
741 html, '%s form' % form_id, group='form')
742 return self._hidden_inputs(form)
    def _sort_formats(self, formats, field_preference=None):
        """Sort *formats* in place from worst to best quality, using either
        *field_preference* (a list/tuple of format-dict keys) or a built-in
        composite key."""
            raise ExtractorError('No video formats found')

            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain HTTP(S) is slightly preferred over other protocols.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                    ORDER = ['webm', 'flv', 'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0

            # Missing numeric fields all default to -1 (worst).
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        """Drop formats whose URLs do not respond (see _is_valid_url)."""
            lambda f: self._is_valid_url(
                item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
    def _is_valid_url(self, url, video_id, item='video'):
        """Check that *url* is reachable; invalid HTTP URLs are reported and
        treated as missing."""
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError):
                    '%s: %s URL is invalid, skipping' % (video_id, item))
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # --prefer-insecure selects plain http.
            if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        """Complete a protocol-relative ('//...') URL with *scheme*, falling
        back to the user-preferred scheme."""
        if url.startswith('//'):
                scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        """Announce and perform a wait of *timeout* seconds.
        msg_template may use %(video_id)s and %(timeout)s placeholders."""
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
        """Download an Adobe HDS (f4m) manifest and build a formats list."""
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source)

        # The media nodes live in a version-specific XML namespace.
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
            tbr = int_or_none(media_el.attrib.get('bitrate'))
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
        self._sort_formats(formats)
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
        """Download an HLS master playlist and build a formats list."""
        # A "meta" entry representing the playlist URL itself, ranked just
        # below the explicit variants.
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        # Resolve variant URLs relative to the playlist URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))
        m3u8_doc = self._download_webpage(
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
        if m3u8_doc is False:
        # key=value attribute parser for #EXT-X-* tag lines.
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Non-tag, non-blank line: a variant URL.
                if last_info is None:
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                    format_id.append(m3u8_id)
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                codecs = last_info.get('CODECS')
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                if last_media is not None:
                    f['m3u8_media'] = last_media
        self._sort_formats(formats)
    def _xpath_ns(path, namespace=None):
        """Qualify each component of an XPath with *namespace* (ElementTree
        '{ns}tag' syntax)."""
        for c in path.split('/'):
            # '.' and empty components need no namespace prefix.
            if not c or c == '.':
                out.append('{%s}%s' % (namespace, c))
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
        """Download a SMIL document and return the formats parsed from it."""
        smil = self._download_smil(smil_url, video_id, fatal=fatal)

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        """Download a SMIL document and return a full info dict parsed from it."""
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1015 def _download_smil(self, smil_url, video_id, fatal=True):
1016 return self._download_xml(
1017 smil_url, video_id, 'Downloading SMIL file',
1018 'Unable to download SMIL file', fatal=fatal)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Build an info dict (formats, subtitles, title, description) from a
        parsed SMIL document."""
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

            # Fall back to the SMIL file's basename as the video id.
            video_id = os.path.splitext(url_basename(smil_url))[0]
        # Pick up title/description from <head><meta> entries.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content

            'title': title or video_id,
            'description': description,
            'subtitles': subtitles,
1048 def _parse_smil_namespace(self, smil):
1049 return self._search_regex(
1050 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1052 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
1054 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1055 b = meta.get('base') or meta.get('httpBase')
1064 videos = smil.findall(self._xpath_ns('.//video', namespace))
1065 for video in videos:
1066 src = video.get('src')
1070 bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1071 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1072 width = int_or_none(video.get('width'))
1073 height = int_or_none(video.get('height'))
1074 proto = video.get('proto')
1075 ext = video.get('ext')
1076 src_ext = determine_ext(src)
1077 streamer = video.get('streamer') or base
1079 if proto == 'rtmp' or streamer.startswith('rtmp'):
1085 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1087 'filesize': filesize,
1093 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1095 if proto == 'm3u8' or src_ext == 'm3u8':
1096 formats.extend(self._extract_m3u8_formats(
1097 src_url, video_id, ext or 'mp4', m3u8_id='hls'))
1100 if src_ext == 'f4m':
1105 'plugin': 'flowplayer-3.2.0.1',
1107 f4m_url += '&' if '?' in f4m_url else '?'
1108 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1109 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
1112 if src_url.startswith('http'):
1116 'ext': ext or src_ext or 'flv',
1117 'format_id': 'http-%d' % (bitrate or http_count),
1119 'filesize': filesize,
1125 self._sort_formats(formats)
1129 def _parse_smil_subtitles(self, smil, namespace=None):
1131 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1132 src = textstream.get('src')
1135 ext = textstream.get('ext') or determine_ext(src)
1137 type_ = textstream.get('type')
1138 if type_ == 'text/srt':
1140 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
1141 subtitles.setdefault(lang, []).append({
1147 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1148 xspf = self._download_xml(
1149 playlist_url, playlist_id, 'Downloading xpsf playlist',
1150 'Unable to download xspf manifest', fatal=fatal)
1153 return self._parse_xspf(xspf, playlist_id)
1155 def _parse_xspf(self, playlist, playlist_id):
1157 'xspf': 'http://xspf.org/ns/0/',
1158 's1': 'http://static.streamone.nl/player/ns/0',
1162 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1164 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1165 description = xpath_text(
1166 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1167 thumbnail = xpath_text(
1168 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1169 duration = float_or_none(
1170 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1173 'url': location.text,
1174 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1175 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1176 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1177 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1178 self._sort_formats(formats)
1183 'description': description,
1184 'thumbnail': thumbnail,
1185 'duration': duration,
1190 def _live_title(self, name):
1191 """ Generate the title for a live video """
1192 now = datetime.datetime.now()
1193 now_str = now.strftime("%Y-%m-%d %H:%M")
1194 return name + ' ' + now_str
1196 def _int(self, v, name, fatal=False, **kwargs):
1197 res = int_or_none(v, **kwargs)
1198 if 'get_attr' in kwargs:
1199 print(getattr(v, kwargs['get_attr']))
1201 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1203 raise ExtractorError(msg)
1205 self._downloader.report_warning(msg)
1208 def _float(self, v, name, fatal=False, **kwargs):
1209 res = float_or_none(v, **kwargs)
1211 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1213 raise ExtractorError(msg)
1215 self._downloader.report_warning(msg)
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Add a cookie to the downloader's cookiejar so it is sent on
        # subsequent requests to *domain* (expire_time=None makes it a
        # session cookie).
        # Positional arguments follow cookielib.Cookie: (version, name, value,
        # port, port_specified, domain, domain_specified, domain_initial_dot,
        # path, path_specified, secure, expires, discard, comment,
        # comment_url, rest) -- i.e. path '/', not secure-only.
        # NOTE(review): field mapping assumed from stdlib cookielib --
        # confirm against compat_cookiejar.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
1224 def _get_cookies(self, url):
1225 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1226 req = compat_urllib_request.Request(url)
1227 self._downloader.cookiejar.add_cookie_header(req)
1228 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1230 def get_testcases(self, include_onlymatching=False):
1231 t = getattr(self, '_TEST', None)
1233 assert not hasattr(self, '_TESTS'), \
1234 '%s has _TEST and _TESTS' % type(self).__name__
1237 tests = getattr(self, '_TESTS', [])
1239 if not include_onlymatching and t.get('only_matching', False):
1241 t['name'] = type(self).__name__[:-len('IE')]
1244 def is_suitable(self, age_limit):
1245 """ Test whether the extractor is generally suitable for the given
1246 age limit (i.e. pornographic sites are not, all others usually are) """
1248 any_restricted = False
1249 for tc in self.get_testcases(include_onlymatching=False):
1250 if 'playlist' in tc:
1251 tc = tc['playlist'][0]
1252 is_restricted = age_restricted(
1253 tc.get('info_dict', {}).get('age_limit'), age_limit)
1254 if not is_restricted:
1256 any_restricted = any_restricted or is_restricted
1257 return not any_restricted
1259 def extract_subtitles(self, *args, **kwargs):
1260 if (self._downloader.params.get('writesubtitles', False) or
1261 self._downloader.params.get('listsubtitles')):
1262 return self._get_subtitles(*args, **kwargs)
1265 def _get_subtitles(self, *args, **kwargs):
1266 raise NotImplementedError("This method must be implemented by subclasses")
1268 def extract_automatic_captions(self, *args, **kwargs):
1269 if (self._downloader.params.get('writeautomaticsub', False) or
1270 self._downloader.params.get('listsubtitles')):
1271 return self._get_automatic_captions(*args, **kwargs)
1274 def _get_automatic_captions(self, *args, **kwargs):
1275 raise NotImplementedError("This method must be implemented by subclasses")
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches '<key>:<query>', '<key>5:<query>' and '<key>all:<query>'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No count given: return the single best result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the per-extractor maximum with a warning.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY