1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
19 compat_urllib_parse_urlparse,
# Sentinel distinguishing "caller supplied no default" from an explicit None
# default in _search_regex and friends.
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * language_preference  Is this in the correct requested
                                 language?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_referer  HTTP Referer header value to set.
                    * http_method   HTTP method to use for the download.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * http_post_data  Additional data to send with a POST
                                 request.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "title" and "id" attributes with the same
    semantics as videos (see above).

    _type "multi_video" indicates that there are multiple videos that
    form a single show, for examples multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.

    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.

    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
204 def __init__(self, downloader=None):
205 """Constructor. Receives an optional downloader."""
207 self.set_downloader(downloader)
210 def suitable(cls, url):
211 """Receives a URL and returns True if suitable for this IE."""
213 # This does not use has/getattr intentionally - we want to know whether
214 # we have cached the regexp for *this* class, whereas getattr would also
215 # match the superclass
216 if '_VALID_URL_RE' not in cls.__dict__:
217 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
218 return cls._VALID_URL_RE.match(url) is not None
221 def _match_id(cls, url):
222 if '_VALID_URL_RE' not in cls.__dict__:
223 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
224 m = cls._VALID_URL_RE.match(url)
230 """Getter method for _WORKING."""
233 def initialize(self):
234 """Initializes an instance (authentication, etc)."""
236 self._real_initialize()
239 def extract(self, url):
240 """Extracts URL information and returns it in list of dicts."""
242 return self._real_extract(url)
244 def set_downloader(self, downloader):
245 """Sets the downloader for this IE."""
246 self._downloader = downloader
248 def _real_initialize(self):
249 """Real initialization process. Redefine in subclasses."""
252 def _real_extract(self, url):
253 """Real extraction process. Redefine in subclasses."""
258 """A string for getting the InfoExtractor with get_info_extractor"""
259 return cls.__name__[:-2]
263 return type(self).__name__[:-2]
265 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
266 """ Returns the response handle """
268 self.report_download_webpage(video_id)
269 elif note is not False:
271 self.to_screen('%s' % (note,))
273 self.to_screen('%s: %s' % (video_id, note))
275 return self._downloader.urlopen(url_or_request)
276 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
280 errnote = 'Unable to download webpage'
281 errmsg = '%s: %s' % (errnote, compat_str(err))
283 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
285 self._downloader.report_warning(errmsg)
288 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
289 """ Returns a tuple (page content as string, URL handle) """
290 # Strip hashes from the URL (#1038)
291 if isinstance(url_or_request, (compat_str, str)):
292 url_or_request = url_or_request.partition('#')[0]
294 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
298 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
299 return (content, urlh)
301 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
302 content_type = urlh.headers.get('Content-Type', '')
303 webpage_bytes = urlh.read()
304 if prefix is not None:
305 webpage_bytes = prefix + webpage_bytes
306 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
308 encoding = m.group(1)
310 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
311 webpage_bytes[:1024])
313 encoding = m.group(1).decode('ascii')
314 elif webpage_bytes.startswith(b'\xff\xfe'):
318 if self._downloader.params.get('dump_intermediate_pages', False):
320 url = url_or_request.get_full_url()
321 except AttributeError:
323 self.to_screen('Dumping request to ' + url)
324 dump = base64.b64encode(webpage_bytes).decode('ascii')
325 self._downloader.to_screen(dump)
326 if self._downloader.params.get('write_pages', False):
328 url = url_or_request.get_full_url()
329 except AttributeError:
331 basen = '%s_%s' % (video_id, url)
333 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
334 basen = basen[:240 - len(h)] + h
335 raw_filename = basen + '.dump'
336 filename = sanitize_filename(raw_filename, restricted=True)
337 self.to_screen('Saving request to ' + filename)
338 # Working around MAX_PATH limitation on Windows (see
339 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
341 absfilepath = os.path.abspath(filename)
342 if len(absfilepath) > 259:
343 filename = '\\\\?\\' + absfilepath
344 with open(filename, 'wb') as outf:
345 outf.write(webpage_bytes)
348 content = webpage_bytes.decode(encoding, 'replace')
350 content = webpage_bytes.decode('utf-8', 'replace')
352 if ('<title>Access to this site is blocked</title>' in content and
353 'Websense' in content[:512]):
354 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
355 blocked_iframe = self._html_search_regex(
356 r'<iframe src="([^"]+)"', content,
357 'Websense information URL', default=None)
359 msg += ' Visit %s for more details' % blocked_iframe
360 raise ExtractorError(msg, expected=True)
364 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
365 """ Returns the data of the page as a string """
366 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
373 def _download_xml(self, url_or_request, video_id,
374 note='Downloading XML', errnote='Unable to download XML',
375 transform_source=None, fatal=True):
376 """Return the xml as an xml.etree.ElementTree.Element"""
377 xml_string = self._download_webpage(
378 url_or_request, video_id, note, errnote, fatal=fatal)
379 if xml_string is False:
382 xml_string = transform_source(xml_string)
383 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
385 def _download_json(self, url_or_request, video_id,
386 note='Downloading JSON metadata',
387 errnote='Unable to download JSON metadata',
388 transform_source=None,
390 json_string = self._download_webpage(
391 url_or_request, video_id, note, errnote, fatal=fatal)
392 if (not fatal) and json_string is False:
395 json_string = transform_source(json_string)
397 return json.loads(json_string)
398 except ValueError as ve:
399 errmsg = '%s: Failed to parse JSON ' % video_id
401 raise ExtractorError(errmsg, cause=ve)
403 self.report_warning(errmsg + str(ve))
405 def report_warning(self, msg, video_id=None):
406 idstr = '' if video_id is None else '%s: ' % video_id
407 self._downloader.report_warning(
408 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
410 def to_screen(self, msg):
411 """Print msg to screen, prefixing it with '[ie_name]'"""
412 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
414 def report_extraction(self, id_or_name):
415 """Report information extraction."""
416 self.to_screen('%s: Extracting information' % id_or_name)
418 def report_download_webpage(self, video_id):
419 """Report webpage download."""
420 self.to_screen('%s: Downloading webpage' % video_id)
422 def report_age_confirmation(self):
423 """Report attempt to confirm age."""
424 self.to_screen('Confirming age')
426 def report_login(self):
427 """Report attempt to log in."""
428 self.to_screen('Logging in')
430 # Methods for following #608
432 def url_result(url, ie=None, video_id=None):
433 """Returns a url that points to a page that should be processed"""
434 # TODO: ie should be the class used for getting the info
435 video_info = {'_type': 'url',
438 if video_id is not None:
439 video_info['id'] = video_id
443 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
444 """Returns a playlist"""
445 video_info = {'_type': 'playlist',
448 video_info['id'] = playlist_id
450 video_info['title'] = playlist_title
451 if playlist_description:
452 video_info['description'] = playlist_description
455 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
457 Perform a regex search on the given string, using a single or a list of
458 patterns returning the first matching group.
459 In case of failure return a default value or raise a WARNING or a
460 RegexNotFoundError, depending on fatal, specifying the field name.
462 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
463 mobj = re.search(pattern, string, flags)
466 mobj = re.search(p, string, flags)
470 if os.name != 'nt' and sys.stderr.isatty():
471 _name = '\033[0;34m%s\033[0m' % name
477 # return the first matching group
478 return next(g for g in mobj.groups() if g is not None)
480 return mobj.group(group)
481 elif default is not _NO_DEFAULT:
484 raise RegexNotFoundError('Unable to extract %s' % _name)
486 self._downloader.report_warning('unable to extract %s; '
487 'please report this issue on http://yt-dl.org/bug' % _name)
490 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
492 Like _search_regex, but strips HTML tags and unescapes entities.
494 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
496 return clean_html(res).strip()
500 def _get_login_info(self):
502 Get the the login info as (username, password)
503 It will look in the netrc file using the _NETRC_MACHINE value
504 If there's no info available, return (None, None)
506 if self._downloader is None:
511 downloader_params = self._downloader.params
513 # Attempt to use provided username and password or .netrc data
514 if downloader_params.get('username', None) is not None:
515 username = downloader_params['username']
516 password = downloader_params['password']
517 elif downloader_params.get('usenetrc', False):
519 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
524 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
525 except (IOError, netrc.NetrcParseError) as err:
526 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
528 return (username, password)
530 def _get_tfa_info(self):
532 Get the two-factor authentication info
533 TODO - asking the user will be required for sms/phone verify
534 currently just uses the command line option
535 If there's no info available, return None
537 if self._downloader is None:
539 downloader_params = self._downloader.params
541 if downloader_params.get('twofactor', None) is not None:
542 return downloader_params['twofactor']
546 # Helper functions for extracting OpenGraph info
548 def _og_regexes(prop):
549 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
550 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
551 template = r'<meta[^>]+?%s[^>]+?%s'
553 template % (property_re, content_re),
554 template % (content_re, property_re),
557 def _og_search_property(self, prop, html, name=None, **kargs):
559 name = 'OpenGraph %s' % prop
560 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
563 return unescapeHTML(escaped)
565 def _og_search_thumbnail(self, html, **kargs):
566 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
568 def _og_search_description(self, html, **kargs):
569 return self._og_search_property('description', html, fatal=False, **kargs)
571 def _og_search_title(self, html, **kargs):
572 return self._og_search_property('title', html, **kargs)
574 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
575 regexes = self._og_regexes('video') + self._og_regexes('video:url')
577 regexes = self._og_regexes('video:secure_url') + regexes
578 return self._html_search_regex(regexes, html, name, **kargs)
580 def _og_search_url(self, html, **kargs):
581 return self._og_search_property('url', html, **kargs)
583 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
584 if display_name is None:
586 return self._html_search_regex(
588 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
589 [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
590 html, display_name, fatal=fatal, group='content', **kwargs)
592 def _dc_search_uploader(self, html):
593 return self._html_search_meta('dc.creator', html, 'uploader')
595 def _rta_search(self, html):
596 # See http://www.rtalabel.org/index.php?content=howtofaq#single
597 if re.search(r'(?ix)<meta\s+name="rating"\s+'
598 r' content="RTA-5042-1996-1400-1577-RTA"',
603 def _media_rating_search(self, html):
604 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
605 rating = self._html_search_meta('rating', html)
617 return RATING_TABLE.get(rating.lower(), None)
619 def _twitter_search_player(self, html):
620 return self._html_search_meta('twitter:player', html,
621 'twitter card player')
623 def _sort_formats(self, formats):
625 raise ExtractorError('No video formats found')
628 # TODO remove the following workaround
629 from ..utils import determine_ext
630 if not f.get('ext') and 'url' in f:
631 f['ext'] = determine_ext(f['url'])
633 preference = f.get('preference')
634 if preference is None:
635 proto = f.get('protocol')
637 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
639 preference = 0 if proto in ['http', 'https'] else -0.1
640 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
643 if f.get('vcodec') == 'none': # audio only
644 if self._downloader.params.get('prefer_free_formats'):
645 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
647 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
650 audio_ext_preference = ORDER.index(f['ext'])
652 audio_ext_preference = -1
654 if self._downloader.params.get('prefer_free_formats'):
655 ORDER = ['flv', 'mp4', 'webm']
657 ORDER = ['webm', 'flv', 'mp4']
659 ext_preference = ORDER.index(f['ext'])
662 audio_ext_preference = 0
666 f.get('language_preference') if f.get('language_preference') is not None else -1,
667 f.get('quality') if f.get('quality') is not None else -1,
668 f.get('height') if f.get('height') is not None else -1,
669 f.get('width') if f.get('width') is not None else -1,
671 f.get('tbr') if f.get('tbr') is not None else -1,
672 f.get('vbr') if f.get('vbr') is not None else -1,
673 f.get('abr') if f.get('abr') is not None else -1,
674 audio_ext_preference,
675 f.get('fps') if f.get('fps') is not None else -1,
676 f.get('filesize') if f.get('filesize') is not None else -1,
677 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
678 f.get('source_preference') if f.get('source_preference') is not None else -1,
681 formats.sort(key=_formats_key)
683 def http_scheme(self):
684 """ Either "http:" or "https:", depending on the user's preferences """
687 if self._downloader.params.get('prefer_insecure', False)
690 def _proto_relative_url(self, url, scheme=None):
693 if url.startswith('//'):
695 scheme = self.http_scheme()
700 def _sleep(self, timeout, video_id, msg_template=None):
701 if msg_template is None:
702 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
703 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
707 def _extract_f4m_formats(self, manifest_url, video_id):
708 manifest = self._download_xml(
709 manifest_url, video_id, 'Downloading f4m manifest',
710 'Unable to download f4m manifest')
713 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
714 for i, media_el in enumerate(media_nodes):
715 tbr = int_or_none(media_el.attrib.get('bitrate'))
716 format_id = 'f4m-%d' % (i if tbr is None else tbr)
718 'format_id': format_id,
722 'width': int_or_none(media_el.attrib.get('width')),
723 'height': int_or_none(media_el.attrib.get('height')),
725 self._sort_formats(formats)
729 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
730 entry_protocol='m3u8', preference=None):
733 'format_id': 'm3u8-meta',
738 'resolution': 'multiple',
739 'format_note': 'Quality selection URL',
742 format_url = lambda u: (
744 if re.match(r'^https?://', u)
745 else compat_urlparse.urljoin(m3u8_url, u))
747 m3u8_doc = self._download_webpage(
749 note='Downloading m3u8 information',
750 errnote='Failed to download m3u8 information')
753 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
754 for line in m3u8_doc.splitlines():
755 if line.startswith('#EXT-X-STREAM-INF:'):
757 for m in kv_rex.finditer(line):
759 if v.startswith('"'):
761 last_info[m.group('key')] = v
762 elif line.startswith('#') or not line.strip():
765 if last_info is None:
766 formats.append({'url': format_url(line)})
768 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
771 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
772 'url': format_url(line.strip()),
775 'protocol': entry_protocol,
776 'preference': preference,
778 codecs = last_info.get('CODECS')
780 # TODO: looks like video codec is not always necessarily goes first
781 va_codecs = codecs.split(',')
783 f['vcodec'] = va_codecs[0].partition('.')[0]
784 if len(va_codecs) > 1 and va_codecs[1]:
785 f['acodec'] = va_codecs[1].partition('.')[0]
786 resolution = last_info.get('RESOLUTION')
788 width_str, height_str = resolution.split('x')
789 f['width'] = int(width_str)
790 f['height'] = int(height_str)
793 self._sort_formats(formats)
796 # TODO: improve extraction
797 def _extract_smil_formats(self, smil_url, video_id):
798 smil = self._download_xml(
799 smil_url, video_id, 'Downloading SMIL file',
800 'Unable to download SMIL file')
802 base = smil.find('./head/meta').get('base')
806 for video in smil.findall('./body/switch/video'):
807 src = video.get('src')
810 bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
811 width = int_or_none(video.get('width'))
812 height = int_or_none(video.get('height'))
813 proto = video.get('proto')
816 if base.startswith('rtmp'):
818 elif base.startswith('http'):
820 ext = video.get('ext')
822 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
823 elif proto == 'rtmp':
825 streamer = video.get('streamer') or base
830 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
835 self._sort_formats(formats)
839 def _live_title(self, name):
840 """ Generate the title for a live video """
841 now = datetime.datetime.now()
842 now_str = now.strftime("%Y-%m-%d %H:%M")
843 return name + ' ' + now_str
845 def _int(self, v, name, fatal=False, **kwargs):
846 res = int_or_none(v, **kwargs)
847 if 'get_attr' in kwargs:
848 print(getattr(v, kwargs['get_attr']))
850 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
852 raise ExtractorError(msg)
854 self._downloader.report_warning(msg)
857 def _float(self, v, name, fatal=False, **kwargs):
858 res = float_or_none(v, **kwargs)
860 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
862 raise ExtractorError(msg)
864 self._downloader.report_warning(msg)
867 def _set_cookie(self, domain, name, value, expire_time=None):
868 cookie = compat_cookiejar.Cookie(
869 0, name, value, None, None, domain, None,
870 None, '/', True, False, expire_time, '', None, None, None)
871 self._downloader.cookiejar.set_cookie(cookie)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.

    NOTE(review): decorators and branch headers restored from the surrounding
    logic (lost in the mangled paste) — confirm against upstream.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY