youtube_dl/extractor/common.py

   1 import base64
   2 import hashlib
   3 import json
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import sys
   9 import time
  10 import xml.etree.ElementTree
  11
  12 from ..utils import (
  13     compat_http_client,
  14     compat_urllib_error,
  15     compat_urllib_parse_urlparse,
  16     compat_str,
  17
  18     clean_html,
  19     compiled_regex_type,
  20     ExtractorError,
  21     int_or_none,
  22     RegexNotFoundError,
  23     sanitize_filename,
  24     unescapeHTML,
  25 )
  26 _NO_DEFAULT = object()
  27
  28
  29 class InfoExtractor(object):
  30     """Information Extractor class.
  31
  32     Information extractors are the classes that, given a URL, extract
  33     information about the video (or videos) the URL refers to. This
  34     information includes the real video URL, the video title, author and
  35     others. The information is stored in a dictionary which is then
  36     passed to the FileDownloader. The FileDownloader processes this
  37     information possibly downloading the video to the file system, among
  38     other possible outcomes.
  39
  40     The dictionaries must include the following fields:
  41
  42     id:             Video identifier.
  43     title:          Video title, unescaped.
  44
  45     Additionally, it must contain either a formats entry or a url one:
  46
  47     formats:        A list of dictionaries for each format available, ordered
  48                     from worst to best quality.
  49
  50                     Potential fields:
  51                     * url        Mandatory. The URL of the video file
  52                     * ext        Will be calculated from url if missing
  53                     * format     A human-readable description of the format
  54                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
  56                                  and format_note fields if missing.
  57                     * format_id  A short description of the format
  58                                  ("mp4_h264_opus" or "19").
  59                                 Technically optional, but strongly recommended.
  60                     * format_note Additional info about the format
  61                                  ("3D" or "DASH video")
  62                     * width      Width of the video, if known
  63                     * height     Height of the video, if known
  64                     * resolution Textual description of width and height
  65                     * tbr        Average bitrate of audio and video in KBit/s
  66                     * abr        Average audio bitrate in KBit/s
  67                     * acodec     Name of the audio codec in use
  68                     * asr        Audio sampling rate in Hertz
  69                     * vbr        Average video bitrate in KBit/s
  70                     * vcodec     Name of the video codec in use
  71                     * container  Name of the container format
  72                     * filesize   The number of bytes, if known in advance
  73                     * filesize_approx  An estimate for the number of bytes
  74                     * player_url SWF Player URL (used for rtmpdump).
  75                     * protocol   The protocol that will be used for the actual
  76                                  download, lower-case.
  77                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  78                     * preference Order number of this format. If this field is
  79                                  present and not None, the formats get sorted
  80                                  by this field, regardless of all other values.
  81                                  -1 for default (order by other properties),
  82                                  -2 or smaller for less than default.
  83                     * quality    Order number of the video quality of this
  84                                  format, irrespective of the file format.
  85                                  -1 for default (order by other properties),
  86                                  -2 or smaller for less than default.
  87                     * http_referer  HTTP Referer header value to set.
  88                     * http_method  HTTP method to use for the download.
  89                     * http_headers  A dictionary of additional HTTP headers
  90                                  to add to the request.
  91                     * http_post_data  Additional data to send with a POST
  92                                  request.
  93     url:            Final video URL.
  94     ext:            Video filename extension.
  95     format:         The video format, defaults to ext (used for --get-format)
  96     player_url:     SWF Player URL (used for rtmpdump).
  97
  98     The following fields are optional:
  99
 100     display_id      An alternative identifier for the video, not necessarily
 101                     unique, but available before title. Typically, id is
 102                     something like "4234987", title "Dancing naked mole rats",
 103                     and display_id "dancing-naked-mole-rats"
 104     thumbnails:     A list of dictionaries, with the following entries:
 105                         * "url"
 106                         * "width" (optional, int)
 107                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
 109                                         deprecated)
 110     thumbnail:      Full URL to a video thumbnail image.
 111     description:    One-line video description.
 112     uploader:       Full name of the video uploader.
 113     timestamp:      UNIX timestamp of the moment the video became available.
 114     upload_date:    Video upload date (YYYYMMDD).
 115                     If not explicitly set, calculated from timestamp.
 116     uploader_id:    Nickname or id of the video uploader.
 117     location:       Physical location of the video.
 118     subtitles:      The subtitle file contents as a dictionary in the format
 119                     {language: subtitles}.
 120     duration:       Length of the video in seconds, as an integer.
 121     view_count:     How many users have watched the video on the platform.
 122     like_count:     Number of positive ratings of the video
 123     dislike_count:  Number of negative ratings of the video
 124     comment_count:  Number of comments on the video
 125     age_limit:      Age restriction for the video, as an integer (years)
 126     webpage_url:    The url to the video webpage, if given to youtube-dl it
 127                     should allow to get the same result again. (It will be set
 128                     by YoutubeDL if it's missing)
 129     categories:     A list of categories that the video falls in, for example
 130                     ["Sports", "Berlin"]
 131
 132     Unless mentioned otherwise, the fields should be Unicode strings.
 133
 134     Subclasses of this one should re-define the _real_initialize() and
 135     _real_extract() methods and define a _VALID_URL regexp.
 136     Probably, they should also be added to the list of extractors.
 137
 138     Finally, the _WORKING attribute should be set to False for broken IEs
 139     in order to warn the users and skip the tests.
 140     """
 141
 142     _ready = False
 143     _downloader = None
 144     _WORKING = True
 145
 146     def __init__(self, downloader=None):
 147         """Constructor. Receives an optional downloader."""
 148         self._ready = False
 149         self.set_downloader(downloader)
 150
 151     @classmethod
 152     def suitable(cls, url):
 153         """Receives a URL and returns True if suitable for this IE."""
 154
 155         # This does not use has/getattr intentionally - we want to know whether
 156         # we have cached the regexp for *this* class, whereas getattr would also
 157         # match the superclass
 158         if '_VALID_URL_RE' not in cls.__dict__:
 159             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 160         return cls._VALID_URL_RE.match(url) is not None
 161
 162     @classmethod
 163     def working(cls):
 164         """Getter method for _WORKING."""
 165         return cls._WORKING
 166
 167     def initialize(self):
 168         """Initializes an instance (authentication, etc)."""
 169         if not self._ready:
 170             self._real_initialize()
 171             self._ready = True
 172
 173     def extract(self, url):
 174         """Extracts URL information and returns it in list of dicts."""
 175         self.initialize()
 176         return self._real_extract(url)
 177
 178     def set_downloader(self, downloader):
 179         """Sets the downloader for this IE."""
 180         self._downloader = downloader
 181
 182     def _real_initialize(self):
 183         """Real initialization process. Redefine in subclasses."""
 184         pass
 185
 186     def _real_extract(self, url):
 187         """Real extraction process. Redefine in subclasses."""
 188         pass
 189
 190     @classmethod
 191     def ie_key(cls):
 192         """A string for getting the InfoExtractor with get_info_extractor"""
 193         return cls.__name__[:-2]
 194
 195     @property
 196     def IE_NAME(self):
 197         return type(self).__name__[:-2]
 198
 199     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 200         """ Returns the response handle """
 201         if note is None:
 202             self.report_download_webpage(video_id)
 203         elif note is not False:
 204             if video_id is None:
 205                 self.to_screen(u'%s' % (note,))
 206             else:
 207                 self.to_screen(u'%s: %s' % (video_id, note))
 208         try:
 209             return self._downloader.urlopen(url_or_request)
 210         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 211             if errnote is False:
 212                 return False
 213             if errnote is None:
 214                 errnote = u'Unable to download webpage'
 215             errmsg = u'%s: %s' % (errnote, compat_str(err))
 216             if fatal:
 217                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 218             else:
 219                 self._downloader.report_warning(errmsg)
 220                 return False
 221
 222     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 223         """ Returns a tuple (page content as string, URL handle) """
 224
 225         # Strip hashes from the URL (#1038)
 226         if isinstance(url_or_request, (compat_str, str)):
 227             url_or_request = url_or_request.partition('#')[0]
 228
 229         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 230         if urlh is False:
 231             assert not fatal
 232             return False
 233         content_type = urlh.headers.get('Content-Type', '')
 234         webpage_bytes = urlh.read()
 235         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 236         if m:
 237             encoding = m.group(1)
 238         else:
 239             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 240                           webpage_bytes[:1024])
 241             if m:
 242                 encoding = m.group(1).decode('ascii')
 243             elif webpage_bytes.startswith(b'\xff\xfe'):
 244                 encoding = 'utf-16'
 245             else:
 246                 encoding = 'utf-8'
 247         if self._downloader.params.get('dump_intermediate_pages', False):
 248             try:
 249                 url = url_or_request.get_full_url()
 250             except AttributeError:
 251                 url = url_or_request
 252             self.to_screen(u'Dumping request to ' + url)
 253             dump = base64.b64encode(webpage_bytes).decode('ascii')
 254             self._downloader.to_screen(dump)
 255         if self._downloader.params.get('write_pages', False):
 256             try:
 257                 url = url_or_request.get_full_url()
 258             except AttributeError:
 259                 url = url_or_request
 260             basen = '%s_%s' % (video_id, url)
 261             if len(basen) > 240:
 262                 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 263                 basen = basen[:240 - len(h)] + h
 264             raw_filename = basen + '.dump'
 265             filename = sanitize_filename(raw_filename, restricted=True)
 266             self.to_screen(u'Saving request to ' + filename)
 267             with open(filename, 'wb') as outf:
 268                 outf.write(webpage_bytes)
 269
 270         try:
 271             content = webpage_bytes.decode(encoding, 'replace')
 272         except LookupError:
 273             content = webpage_bytes.decode('utf-8', 'replace')
 274
 275         if (u'<title>Access to this site is blocked</title>' in content and
 276                 u'Websense' in content[:512]):
 277             msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
 278             blocked_iframe = self._html_search_regex(
 279                 r'<iframe src="([^"]+)"', content,
 280                 u'Websense information URL', default=None)
 281             if blocked_iframe:
 282                 msg += u' Visit %s for more details' % blocked_iframe
 283             raise ExtractorError(msg, expected=True)
 284
 285         return (content, urlh)
 286
 287     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 288         """ Returns the data of the page as a string """
 289         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 290         if res is False:
 291             return res
 292         else:
 293             content, _ = res
 294             return content
 295
 296     def _download_xml(self, url_or_request, video_id,
 297                       note=u'Downloading XML', errnote=u'Unable to download XML',
 298                       transform_source=None, fatal=True):
 299         """Return the xml as an xml.etree.ElementTree.Element"""
 300         xml_string = self._download_webpage(
 301             url_or_request, video_id, note, errnote, fatal=fatal)
 302         if xml_string is False:
 303             return xml_string
 304         if transform_source:
 305             xml_string = transform_source(xml_string)
 306         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 307
 308     def _download_json(self, url_or_request, video_id,
 309                        note=u'Downloading JSON metadata',
 310                        errnote=u'Unable to download JSON metadata',
 311                        transform_source=None,
 312                        fatal=True):
 313         json_string = self._download_webpage(
 314             url_or_request, video_id, note, errnote, fatal=fatal)
 315         if (not fatal) and json_string is False:
 316             return None
 317         if transform_source:
 318             json_string = transform_source(json_string)
 319         try:
 320             return json.loads(json_string)
 321         except ValueError as ve:
 322             raise ExtractorError('Failed to download JSON', cause=ve)
 323
 324     def report_warning(self, msg, video_id=None):
 325         idstr = u'' if video_id is None else u'%s: ' % video_id
 326         self._downloader.report_warning(
 327             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 328
 329     def to_screen(self, msg):
 330         """Print msg to screen, prefixing it with '[ie_name]'"""
 331         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 332
 333     def report_extraction(self, id_or_name):
 334         """Report information extraction."""
 335         self.to_screen(u'%s: Extracting information' % id_or_name)
 336
 337     def report_download_webpage(self, video_id):
 338         """Report webpage download."""
 339         self.to_screen(u'%s: Downloading webpage' % video_id)
 340
 341     def report_age_confirmation(self):
 342         """Report attempt to confirm age."""
 343         self.to_screen(u'Confirming age')
 344
 345     def report_login(self):
 346         """Report attempt to log in."""
 347         self.to_screen(u'Logging in')
 348
 349     #Methods for following #608
 350     @staticmethod
 351     def url_result(url, ie=None, video_id=None):
 352         """Returns a url that points to a page that should be processed"""
 353         #TODO: ie should be the class used for getting the info
 354         video_info = {'_type': 'url',
 355                       'url': url,
 356                       'ie_key': ie}
 357         if video_id is not None:
 358             video_info['id'] = video_id
 359         return video_info
 360     @staticmethod
 361     def playlist_result(entries, playlist_id=None, playlist_title=None):
 362         """Returns a playlist"""
 363         video_info = {'_type': 'playlist',
 364                       'entries': entries}
 365         if playlist_id:
 366             video_info['id'] = playlist_id
 367         if playlist_title:
 368             video_info['title'] = playlist_title
 369         return video_info
 370
 371     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 372         """
 373         Perform a regex search on the given string, using a single or a list of
 374         patterns returning the first matching group.
 375         In case of failure return a default value or raise a WARNING or a
 376         RegexNotFoundError, depending on fatal, specifying the field name.
 377         """
 378         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 379             mobj = re.search(pattern, string, flags)
 380         else:
 381             for p in pattern:
 382                 mobj = re.search(p, string, flags)
 383                 if mobj:
 384                     break
 385
 386         if os.name != 'nt' and sys.stderr.isatty():
 387             _name = u'\033[0;34m%s\033[0m' % name
 388         else:
 389             _name = name
 390
 391         if mobj:
 392             # return the first matching group
 393             return next(g for g in mobj.groups() if g is not None)
 394         elif default is not _NO_DEFAULT:
 395             return default
 396         elif fatal:
 397             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 398         else:
 399             self._downloader.report_warning(u'unable to extract %s; '
 400                 u'please report this issue on http://yt-dl.org/bug' % _name)
 401             return None
 402
 403     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 404         """
 405         Like _search_regex, but strips HTML tags and unescapes entities.
 406         """
 407         res = self._search_regex(pattern, string, name, default, fatal, flags)
 408         if res:
 409             return clean_html(res).strip()
 410         else:
 411             return res
 412
 413     def _get_login_info(self):
 414         """
 415         Get the the login info as (username, password)
 416         It will look in the netrc file using the _NETRC_MACHINE value
 417         If there's no info available, return (None, None)
 418         """
 419         if self._downloader is None:
 420             return (None, None)
 421
 422         username = None
 423         password = None
 424         downloader_params = self._downloader.params
 425
 426         # Attempt to use provided username and password or .netrc data
 427         if downloader_params.get('username', None) is not None:
 428             username = downloader_params['username']
 429             password = downloader_params['password']
 430         elif downloader_params.get('usenetrc', False):
 431             try:
 432                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 433                 if info is not None:
 434                     username = info[0]
 435                     password = info[2]
 436                 else:
 437                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 438             except (IOError, netrc.NetrcParseError) as err:
 439                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 440
 441         return (username, password)
 442
 443     # Helper functions for extracting OpenGraph info
 444     @staticmethod
 445     def _og_regexes(prop):
 446         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 447         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 448         template = r'<meta[^>]+?%s[^>]+?%s'
 449         return [
 450             template % (property_re, content_re),
 451             template % (content_re, property_re),
 452         ]
 453
 454     def _og_search_property(self, prop, html, name=None, **kargs):
 455         if name is None:
 456             name = 'OpenGraph %s' % prop
 457         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 458         if escaped is None:
 459             return None
 460         return unescapeHTML(escaped)
 461
 462     def _og_search_thumbnail(self, html, **kargs):
 463         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 464
 465     def _og_search_description(self, html, **kargs):
 466         return self._og_search_property('description', html, fatal=False, **kargs)
 467
 468     def _og_search_title(self, html, **kargs):
 469         return self._og_search_property('title', html, **kargs)
 470
 471     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 472         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 473         if secure:
 474             regexes = self._og_regexes('video:secure_url') + regexes
 475         return self._html_search_regex(regexes, html, name, **kargs)
 476
 477     def _og_search_url(self, html, **kargs):
 478         return self._og_search_property('url', html, **kargs)
 479
 480     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 481         if display_name is None:
 482             display_name = name
 483         return self._html_search_regex(
 484             r'''(?ix)<meta
 485                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 486                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 487             html, display_name, fatal=fatal, **kwargs)
 488
 489     def _dc_search_uploader(self, html):
 490         return self._html_search_meta('dc.creator', html, 'uploader')
 491
 492     def _rta_search(self, html):
 493         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 494         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 495                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 496                      html):
 497             return 18
 498         return 0
 499
 500     def _media_rating_search(self, html):
 501         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 502         rating = self._html_search_meta('rating', html)
 503
 504         if not rating:
 505             return None
 506
 507         RATING_TABLE = {
 508             'safe for kids': 0,
 509             'general': 8,
 510             '14 years': 14,
 511             'mature': 17,
 512             'restricted': 19,
 513         }
 514         return RATING_TABLE.get(rating.lower(), None)
 515
 516     def _twitter_search_player(self, html):
 517         return self._html_search_meta('twitter:player', html,
 518             'twitter card player')
 519
 520     def _sort_formats(self, formats):
 521         if not formats:
 522             raise ExtractorError(u'No video formats found')
 523
 524         def _formats_key(f):
 525             # TODO remove the following workaround
 526             from ..utils import determine_ext
 527             if not f.get('ext') and 'url' in f:
 528                 f['ext'] = determine_ext(f['url'])
 529
 530             preference = f.get('preference')
 531             if preference is None:
 532                 proto = f.get('protocol')
 533                 if proto is None:
 534                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 535
 536                 preference = 0 if proto in ['http', 'https'] else -0.1
 537                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 538                     preference -= 0.5
 539
 540             if f.get('vcodec') == 'none':  # audio only
 541                 if self._downloader.params.get('prefer_free_formats'):
 542                     ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
 543                 else:
 544                     ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
 545                 ext_preference = 0
 546                 try:
 547                     audio_ext_preference = ORDER.index(f['ext'])
 548                 except ValueError:
 549                     audio_ext_preference = -1
 550             else:
 551                 if self._downloader.params.get('prefer_free_formats'):
 552                     ORDER = [u'flv', u'mp4', u'webm']
 553                 else:
 554                     ORDER = [u'webm', u'flv', u'mp4']
 555                 try:
 556                     ext_preference = ORDER.index(f['ext'])
 557                 except ValueError:
 558                     ext_preference = -1
 559                 audio_ext_preference = 0
 560
 561             return (
 562                 preference,
 563                 f.get('quality') if f.get('quality') is not None else -1,
 564                 f.get('height') if f.get('height') is not None else -1,
 565                 f.get('width') if f.get('width') is not None else -1,
 566                 ext_preference,
 567                 f.get('tbr') if f.get('tbr') is not None else -1,
 568                 f.get('vbr') if f.get('vbr') is not None else -1,
 569                 f.get('abr') if f.get('abr') is not None else -1,
 570                 audio_ext_preference,
 571                 f.get('filesize') if f.get('filesize') is not None else -1,
 572                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 573                 f.get('format_id'),
 574             )
 575         formats.sort(key=_formats_key)
 576
 577     def http_scheme(self):
 578         """ Either "https:" or "https:", depending on the user's preferences """
 579         return (
 580             'http:'
 581             if self._downloader.params.get('prefer_insecure', False)
 582             else 'https:')
 583
 584     def _proto_relative_url(self, url, scheme=None):
 585         if url is None:
 586             return url
 587         if url.startswith('//'):
 588             if scheme is None:
 589                 scheme = self.http_scheme()
 590             return scheme + url
 591         else:
 592             return url
 593
 594     def _sleep(self, timeout, video_id, msg_template=None):
 595         if msg_template is None:
 596             msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
 597         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 598         self.to_screen(msg)
 599         time.sleep(timeout)
 600
 601     def _extract_f4m_formats(self, manifest_url, video_id):
 602         manifest = self._download_xml(
 603             manifest_url, video_id, 'Downloading f4m manifest',
 604             'Unable to download f4m manifest')
 605
 606         formats = []
 607         for media_el in manifest.findall('{http://ns.adobe.com/f4m/1.0}media'):
 608             formats.append({
 609                 'url': manifest_url,
 610                 'ext': 'flv',
 611                 'tbr': int_or_none(media_el.attrib.get('bitrate')),
 612                 'width': int_or_none(media_el.attrib.get('width')),
 613                 'height': int_or_none(media_el.attrib.get('height')),
 614             })
 615         self._sort_formats(formats)
 616
 617         return formats
 618
 619
 620 class SearchInfoExtractor(InfoExtractor):
 621     """
 622     Base class for paged search queries extractors.
 623     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 624     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 625     """
 626
 627     @classmethod
 628     def _make_valid_url(cls):
 629         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 630
 631     @classmethod
 632     def suitable(cls, url):
 633         return re.match(cls._make_valid_url(), url) is not None
 634
 635     def _real_extract(self, query):
 636         mobj = re.match(self._make_valid_url(), query)
 637         if mobj is None:
 638             raise ExtractorError(u'Invalid search query "%s"' % query)
 639
 640         prefix = mobj.group('prefix')
 641         query = mobj.group('query')
 642         if prefix == '':
 643             return self._get_n_results(query, 1)
 644         elif prefix == 'all':
 645             return self._get_n_results(query, self._MAX_RESULTS)
 646         else:
 647             n = int(prefix)
 648             if n <= 0:
 649                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 650             elif n > self._MAX_RESULTS:
 651                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 652                 n = self._MAX_RESULTS
 653             return self._get_n_results(query, n)
 654
 655     def _get_n_results(self, query, n):
 656         """Get a specified number of results for a query"""
 657         raise NotImplementedError("This method must be implemented by subclasses")
 658
 659     @property
 660     def SEARCH_KEY(self):
 661         return self._SEARCH_KEY