1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
20 compat_urllib_parse_urlparse,
36 _NO_DEFAULT = object()
39 class InfoExtractor(object):
40 """Information Extractor class.
42 Information extractors are the classes that, given a URL, extract
43 information about the video (or videos) the URL refers to. This
44 information includes the real video URL, the video title, author and
45 others. The information is stored in a dictionary which is then
46 passed to the YoutubeDL. The YoutubeDL processes this
47 information possibly downloading the video to the file system, among
48 other possible outcomes.
50 The type field determines the type of the result.
51 By far the most common value (and the default if _type is missing) is
52 "video", which indicates a single video.
54 For a video, the dictionaries must include the following fields:
57 title: Video title, unescaped.
59 Additionally, it must contain either a formats entry or a url one:
61 formats: A list of dictionaries for each format available, ordered
62 from worst to best quality.
65 * url Mandatory. The URL of the video file
66 * ext Will be calculated from url if missing
67 * format A human-readable description of the format
68 ("mp4 container with h264/opus").
69 Calculated from the format_id, width, height.
70 and format_note fields if missing.
71 * format_id A short description of the format
72 ("mp4_h264_opus" or "19").
73 Technically optional, but strongly recommended.
74 * format_note Additional info about the format
75 ("3D" or "DASH video")
76 * width Width of the video, if known
77 * height Height of the video, if known
78 * resolution Textual description of width and height
79 * tbr Average bitrate of audio and video in KBit/s
80 * abr Average audio bitrate in KBit/s
81 * acodec Name of the audio codec in use
82 * asr Audio sampling rate in Hertz
83 * vbr Average video bitrate in KBit/s
85 * vcodec Name of the video codec in use
86 * container Name of the container format
87 * filesize The number of bytes, if known in advance
88 * filesize_approx An estimate for the number of bytes
89 * player_url SWF Player URL (used for rtmpdump).
90 * protocol The protocol that will be used for the actual
92 "http", "https", "rtsp", "rtmp", "m3u8" or so.
93 * preference Order number of this format. If this field is
94 present and not None, the formats get sorted
95 by this field, regardless of all other values.
96 -1 for default (order by other properties),
97 -2 or smaller for less than default.
98 < -1000 to hide the format (if there is
99 another one which is strictly better)
100 * language_preference Is this in the correct requested
102 10 if it's what the URL is about,
103 -1 for default (don't know),
104 -10 otherwise, other values reserved for now.
105 * quality Order number of the video quality of this
106 format, irrespective of the file format.
107 -1 for default (order by other properties),
108 -2 or smaller for less than default.
109 * source_preference Order number for this video source
110 (quality takes higher priority)
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
113 * http_method HTTP method to use for the download.
114 * http_headers A dictionary of additional HTTP headers
115 to add to the request.
116 * http_post_data Additional data to send with a POST
118 * stretched_ratio If given and not 1, indicates that the
119 video's pixels are not square.
120 width : height ratio as float.
121 * no_resume The server does not support resuming the
122 (HTTP or RTMP) download. Boolean.
124 url: Final video URL.
125 ext: Video filename extension.
126 format: The video format, defaults to ext (used for --get-format)
127 player_url: SWF Player URL (used for rtmpdump).
129 The following fields are optional:
131 alt_title: A secondary title of the video.
132 display_id An alternative identifier for the video, not necessarily
133 unique, but available before title. Typically, id is
134 something like "4234987", title "Dancing naked mole rats",
135 and display_id "dancing-naked-mole-rats"
136 thumbnails: A list of dictionaries, with the following entries:
137 * "id" (optional, string) - Thumbnail format ID
139 * "preference" (optional, int) - quality of the image
140 * "width" (optional, int)
141 * "height" (optional, int)
142 * "resolution" (optional, string "{width}x{height}"),
144 thumbnail: Full URL to a video thumbnail image.
145 description: Full video description.
146 uploader: Full name of the video uploader.
147 timestamp: UNIX timestamp of the moment the video became available.
148 upload_date: Video upload date (YYYYMMDD).
149 If not explicitly set, calculated from timestamp.
150 uploader_id: Nickname or id of the video uploader.
151 location: Physical location where the video was filmed.
152 subtitles: The subtitle file contents as a dictionary in the format
153 {language: subtitles}.
154 duration: Length of the video in seconds, as an integer.
155 view_count: How many users have watched the video on the platform.
156 like_count: Number of positive ratings of the video
157 dislike_count: Number of negative ratings of the video
158 comment_count: Number of comments on the video
159 comments: A list of comments, each with one or more of the following
160 properties (all but one of text or html optional):
161 * "author" - human-readable name of the comment author
162 * "author_id" - user ID of the comment author
164 * "html" - Comment as HTML
165 * "text" - Plain text of the comment
166 * "timestamp" - UNIX timestamp of comment
167 * "parent" - ID of the comment this one is replying to.
168 Set to "root" to indicate that this is a
169 comment to the original video.
170 age_limit: Age restriction for the video, as an integer (years)
171 webpage_url: The url to the video webpage, if given to youtube-dl it
172 should allow to get the same result again. (It will be set
173 by YoutubeDL if it's missing)
174 categories: A list of categories that the video falls in, for example
176 is_live: True, False, or None (=unknown). Whether this video is a
177 live stream that goes on instead of a fixed-length video.
179 Unless mentioned otherwise, the fields should be Unicode strings.
181 Unless mentioned otherwise, None is equivalent to absence of information.
184 _type "playlist" indicates multiple videos.
185 There must be a key "entries", which is a list, an iterable, or a PagedList
186 object, each element of which is a valid dictionary by this specification.
188 Additionally, playlists can have "title" and "id" attributes with the same
189 semantics as videos (see above).
192 _type "multi_video" indicates that there are multiple videos that
193 form a single show, for example multiple acts of an opera or TV episode.
194 It must have an entries key like a playlist and contain all the keys
195 required for a video at the same time.
198 _type "url" indicates that the video must be extracted from another
199 location, possibly by a different extractor. Its only required key is:
200 "url" - the next URL to extract.
201 The key "ie_key" can be set to the class name (minus the trailing "IE",
202 e.g. "Youtube") if the extractor class is known in advance.
203 Additionally, the dictionary may have any properties of the resolved entity
204 known in advance, for example "title" if the title of the referred video is
208 _type "url_transparent" entities have the same specification as "url", but
209 indicate that the given additional information is more precise than the one
210 associated with the resolved URL.
211 This is useful when a site employs a video service that hosts the video and
212 its technical metadata, but that video service does not embed a useful
213 title, description etc.
216 Subclasses of this one should re-define the _real_initialize() and
217 _real_extract() methods and define a _VALID_URL regexp.
218 Probably, they should also be added to the list of extractors.
220 Finally, the _WORKING attribute should be set to False for broken IEs
221 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Attach the (possibly None) downloader via the public setter.
        # NOTE(review): source appears gap-sampled; upstream may also
        # initialize a "ready" flag here — confirm.
        self.set_downloader(downloader)
234 def suitable(cls, url):
235 """Receives a URL and returns True if suitable for this IE."""
237 # This does not use has/getattr intentionally - we want to know whether
238 # we have cached the regexp for *this* class, whereas getattr would also
239 # match the superclass
240 if '_VALID_URL_RE' not in cls.__dict__:
241 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
242 return cls._VALID_URL_RE.match(url) is not None
    def _match_id(cls, url):
        # Same per-class regexp cache as suitable(): check cls.__dict__ so
        # a pattern compiled for a superclass is not reused for this class.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        # NOTE(review): source looks truncated here — the matched 'id'
        # group is presumably returned; confirm against upstream.

    # NOTE(review): orphaned docstring — the enclosing definition (a
    # _WORKING getter) is missing from this gap-sampled copy.
        """Getter method for _WORKING."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): an "only initialize once" guard appears to be
        # missing from this copy — confirm against upstream.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a preceding self.initialize() call appears to be
        # missing from this copy — confirm against upstream.
        return self._real_extract(url)
268 def set_downloader(self, downloader):
269 """Sets the downloader for this IE."""
270 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): orphaned fragments — the ie_key() classmethod and the
    # IE_NAME property these lines belong to are missing from this
    # gap-sampled copy of the file.
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # NOTE(review): gap-sampled copy — the if/try scaffolding around
        # the lines below is partly missing; confirm against upstream.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen('%s' % (note,))
            # With a video_id, prefix the note with it for readability.
            self.to_screen('%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            # Fatal errors re-raise with the traceback; otherwise warn.
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): the non-fatal "urlh is False" early-return appears
        # to be missing from this gap-sampled copy — confirm upstream.
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
        return (content, urlh)
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
        # Read the response body and decode it to text, guessing the
        # encoding from the Content-Type header, a <meta charset>
        # declaration, or a BOM; optionally dumps/saves the page for
        # debugging and detects Websense-style blocking.
        # NOTE(review): gap-sampled copy — several if/try lines are
        # missing below; confirm the control flow against upstream.
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        # Encoding from the Content-Type header ("text/html; charset=...").
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Fall back to a <meta charset=...> near the top of the page.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
        # --dump-intermediate-pages: base64 page dump to the screen.
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # --write-pages: save the raw bytes to a .dump file.
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
            # Keep the file name within FS limits by hashing the tail.
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

            content = webpage_bytes.decode(encoding, 'replace')
            # Last resort: decode as UTF-8, replacing bad bytes.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect Websense content filtering and fail with a clear error.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # NOTE(review): the final `return content` appears to be missing
        # from this gap-sampled copy.
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
        """ Returns the data of the page as a string """
        # Retry loop for flaky connections: re-attempt on IncompleteRead
        # up to `tries` times, sleeping `timeout` seconds in between.
        # NOTE(review): gap-sampled copy — the try/success bookkeeping and
        # the final return are partly missing; confirm against upstream.
        while success is False:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
            except compat_http_client.IncompleteRead as e:
                if try_count >= tries:
                self._sleep(timeout, video_id)
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
        # NOTE(review): the early return above and the
        # `if transform_source:` guard below appear to be missing from
        # this gap-sampled copy.
            # transform_source lets callers fix broken XML before parsing.
            xml_string = transform_source(xml_string)
        # Parse from UTF-8-encoded bytes.
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
        # NOTE(review): the `fatal=True` tail of this signature appears to
        # be missing from this gap-sampled copy.
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if (not fatal) and json_string is False:
        # Delegate the actual decoding (and its error reporting).
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        # Decode a JSON document, optionally pre-processing it with
        # transform_source; on invalid JSON either raise (fatal) or warn.
        # NOTE(review): gap-sampled copy — the try/if scaffolding and the
        # non-fatal return are partly missing here.
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            raise ExtractorError(errmsg, cause=ve)
            self.report_warning(errmsg + str(ve))
443 def report_warning(self, msg, video_id=None):
444 idstr = '' if video_id is None else '%s: ' % video_id
445 self._downloader.report_warning(
446 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
448 def to_screen(self, msg):
449 """Print msg to screen, prefixing it with '[ie_name]'"""
450 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
452 def report_extraction(self, id_or_name):
453 """Report information extraction."""
454 self.to_screen('%s: Extracting information' % id_or_name)
456 def report_download_webpage(self, video_id):
457 """Report webpage download."""
458 self.to_screen('%s: Downloading webpage' % video_id)
460 def report_age_confirmation(self):
461 """Report attempt to confirm age."""
462 self.to_screen('Confirming age')
464 def report_login(self):
465 """Report attempt to log in."""
466 self.to_screen('Logging in')
    # Methods for following #608
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): gap-sampled copy — the @staticmethod decorator,
        # the dict's url/ie_key entries and the final return appear to be
        # missing; confirm against upstream.
        if video_id is not None:
            video_info['id'] = video_id
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        # Build the standard "playlist" info dict; optional metadata is
        # only attached when provided.
        video_info = {'_type': 'playlist',
        # NOTE(review): gap-sampled copy — the 'entries' entry, the
        # id/title guards and the final return appear partly missing.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # NOTE(review): gap-sampled copy — the list-of-patterns loop and
        # several branch lines are missing below; confirm upstream.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
                mobj = re.search(p, string, flags)
        # Highlight the field name (blue) on capable terminals.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
            return mobj.group(group)
        elif default is not _NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s; '
                'please report this issue on http://yt-dl.org/bug' % _name)
    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        # NOTE(review): the None-guard around the line below appears to be
        # missing from this gap-sampled copy.
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
        # NOTE(review): gap-sampled copy — the early return and the
        # username/password initialisation appear to be missing here.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # NOTE(review): the surrounding try/if-info scaffolding is
                # missing from this copy — confirm against upstream.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)
    def _get_tfa_info(self):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
        # NOTE(review): the early return for a missing downloader appears
        # to be absent from this gap-sampled copy.
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']
        # NOTE(review): the final fallback return appears to be missing.
    # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        # Build regexes matching <meta property="og:<prop>" content="...">
        # in either attribute order.
        # NOTE(review): gap-sampled copy — the @staticmethod decorator and
        # the surrounding `return [` ... `]` appear to be missing.
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),
    def _og_search_property(self, prop, html, name=None, **kargs):
        # Search html for the og:<prop> meta tag and return its unescaped
        # content.
        # NOTE(review): gap-sampled copy — the default-name guard and the
        # None early-return appear to be missing here.
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)
603 def _og_search_thumbnail(self, html, **kargs):
604 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
606 def _og_search_description(self, html, **kargs):
607 return self._og_search_property('description', html, fatal=False, **kargs)
609 def _og_search_title(self, html, **kargs):
610 return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        # Look up og:video / og:video:url; when `secure`, try
        # og:video:secure_url first.
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        # NOTE(review): the `if secure:` guard appears to be missing from
        # this gap-sampled copy.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
618 def _og_search_url(self, html, **kargs):
619 return self._og_search_property('url', html, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        # Find a <meta itemprop/name/property=<name> content=...> tag and
        # return its content attribute.
        if display_name is None:
        # NOTE(review): gap-sampled copy — the display_name fallback and
        # the opening of the raw-string pattern appear to be missing.
        return self._html_search_regex(
            (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
            [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
630 def _dc_search_uploader(self, html):
631 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # Detect the RTA ("Restricted To Adults") label meta tag.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
        # NOTE(review): gap-sampled copy — the html argument and the 18/0
        # age-limit returns appear to be missing; confirm upstream.

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # NOTE(review): the RATING_TABLE mapping and its guard are missing
        # from this gap-sampled copy — confirm against upstream.
        return RATING_TABLE.get(rating.lower(), None)
657 def _twitter_search_player(self, html):
658 return self._html_search_meta('twitter:player', html,
659 'twitter card player')
    def _sort_formats(self, formats):
        # Sort the formats list in-place from worst to best using a
        # composite key (protocol preference, language/quality/source
        # preference, resolution, bitrates, codec/ext preference, size).
        # NOTE(review): this copy is gap-sampled — large parts of the key
        # function (guards, the key tuple head/tail, the else branches)
        # are missing; confirm the full ordering against upstream.
        raise ExtractorError('No video formats found')

        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                audio_ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = -1
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
                ORDER = ['webm', 'flv', 'mp4']
                ext_preference = ORDER.index(f['ext'])
            audio_ext_preference = 0

            f.get('language_preference') if f.get('language_preference') is not None else -1,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('fps') if f.get('fps') is not None else -1,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('source_preference') if f.get('source_preference') is not None else -1,

        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        # Drop formats whose URL does not actually respond (HEAD check).
        # NOTE(review): gap-sampled copy — the surrounding filter call is
        # partly missing; confirm against upstream.
            lambda f: self._is_valid_url(
                item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),

    def _is_valid_url(self, url, video_id, item='video'):
        # Issue a HEAD request to verify the URL is reachable; report and
        # skip on HTTP errors.
        # NOTE(review): gap-sampled copy — the try/return scaffolding is
        # partly missing; confirm against upstream.
            self._request_webpage(
                HEADRequest(url), video_id,
                'Checking %s URL' % item)
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError):
                '%s URL is invalid, skipping' % item, video_id)

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # NOTE(review): the conditional expression around the line below
        # is partly missing from this gap-sampled copy.
            if self._downloader.params.get('prefer_insecure', False)

    def _proto_relative_url(self, url, scheme=None):
        # Resolve protocol-relative URLs ("//host/path") against the
        # preferred scheme.
        # NOTE(review): gap-sampled copy — the None-url guard and the
        # return statements are partly missing here.
        if url.startswith('//'):
                scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        # Tell the user we are waiting, then block for `timeout` seconds.
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        # NOTE(review): the to_screen(msg) / sleep tail appears to be
        # missing from this gap-sampled copy.
    def _extract_f4m_formats(self, manifest_url, video_id):
        # Parse an Adobe HDS (f4m) manifest and build a formats list.
        # NOTE(review): gap-sampled copy — the formats list setup, the
        # version-detection branch and parts of each format dict are
        # missing; confirm against upstream.
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest')

        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                # v2 manifests carry a relative href to the media file.
                manifest_url = '/'.join(manifest_url.split('/')[:-1]) + '/' + media_el.attrib.get('href')
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            format_id = 'f4m-%d' % (i if tbr is None else tbr)
                'format_id': format_id,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
        self._sort_formats(formats)
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        # Parse an HLS master playlist into youtube-dl formats, starting
        # with a meta entry pointing at the playlist itself.
        # NOTE(review): gap-sampled copy — list initialisations, several
        # guards and parts of each format dict are missing; confirm the
        # full control flow against upstream.
            'format_id': 'm3u8-meta',
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',

        # Resolve relative variant URLs against the playlist URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            note='Downloading m3u8 information',
            errnote='Failed to download m3u8 information')
        # Attribute-list parser for #EXT-X-STREAM-INF lines.
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                if last_info is None:
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                codecs = last_info.get('CODECS')
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
        self._sort_formats(formats)
    # TODO: improve extraction
    def _extract_smil_formats(self, smil_url, video_id):
        # Parse a SMIL document and build RTMP/HTTP/m3u8 formats from its
        # <video> switch entries.
        # NOTE(review): gap-sampled copy — list setup, several guards and
        # the format dict bodies are partly missing; confirm upstream.
        smil = self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file')

        base = smil.find('./head/meta').get('base')

        for video in smil.findall('./body/switch/video'):
            src = video.get('src')
            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            # Infer the protocol from the base URL when not given.
            if base.startswith('rtmp'):
            elif base.startswith('http'):
            ext = video.get('ext')
                formats.extend(self._extract_m3u8_formats(src, video_id, ext))
            elif proto == 'rtmp':
                streamer = video.get('streamer') or base
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
        self._sort_formats(formats)
904 def _live_title(self, name):
905 """ Generate the title for a live video """
906 now = datetime.datetime.now()
907 now_str = now.strftime("%Y-%m-%d %H:%M")
908 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        # int_or_none with standard error reporting: raise when fatal,
        # otherwise warn via the downloader.
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): stray debug print — should be removed upstream.
            print(getattr(v, kwargs['get_attr']))
        # NOTE(review): gap-sampled copy — the `res is None` guard, the
        # fatal branch and the final return appear to be missing.
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)

    def _float(self, v, name, fatal=False, **kwargs):
        # float_or_none with the same error-reporting convention as _int.
        res = float_or_none(v, **kwargs)
        # NOTE(review): gap-sampled copy — guard/return lines appear to be
        # missing here as well.
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Inject a path='/' cookie for `domain` into the downloader's
        # cookiejar.  The long positional argument list follows
        # cookielib.Cookie's constructor signature — keep the order
        # exactly as-is.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
    def get_testcases(self, include_onlymatching=False):
        # Produce this extractor's test cases from _TEST/_TESTS, tagging
        # each with the extractor name (minus the 'IE' suffix).
        # NOTE(review): gap-sampled copy — the generator scaffolding
        # (loop/yield and some guards) is partly missing; confirm upstream.
        t = getattr(self, '_TEST', None)
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            if not include_onlymatching and t.get('only_matching', False):
            t['name'] = type(self).__name__[:-len('IE')]

    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        # NOTE(review): gap-sampled copy — some guards (playlist check,
        # early break) are missing from this copy.
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
968 class SearchInfoExtractor(InfoExtractor):
970 Base class for paged search queries extractors.
971 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
972 Instances should define _SEARCH_KEY and _MAX_RESULTS.
976 def _make_valid_url(cls):
977 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
980 def suitable(cls, url):
981 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse "<key><n|all>:<query>" and dispatch to _get_n_results.
        mobj = re.match(self._make_valid_url(), query)
        # NOTE(review): gap-sampled copy — the `mobj is None` guard, the
        # empty-prefix branch and the integer parse of the prefix are
        # partly missing; confirm against upstream.
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix means a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
            return self._get_n_results(query, n)
1003 def _get_n_results(self, query, n):
1004 """Get a specified number of results for a query"""
1005 raise NotImplementedError("This method must be implemented by subclasses")
1008 def SEARCH_KEY(self):
1009 return self._SEARCH_KEY