10 import xml.etree.ElementTree
15 compat_urllib_parse_urlparse,
26 _NO_DEFAULT = object()
29 class InfoExtractor(object):
30 """Information Extractor class.
32 Information extractors are the classes that, given a URL, extract
33 information about the video (or videos) the URL refers to. This
34 information includes the real video URL, the video title, author and
35 others. The information is stored in a dictionary which is then
36 passed to the FileDownloader. The FileDownloader processes this
37 information possibly downloading the video to the file system, among
38 other possible outcomes.
40 The dictionaries must include the following fields:
43 title: Video title, unescaped.
45 Additionally, it must contain either a formats entry or a url one:
47 formats: A list of dictionaries for each format available, ordered
48 from worst to best quality.
51 * url Mandatory. The URL of the video file
52 * ext Will be calculated from url if missing
53 * format A human-readable description of the format
54 ("mp4 container with h264/opus").
55 Calculated from the format_id, width, height.
56 and format_note fields if missing.
57 * format_id A short description of the format
58 ("mp4_h264_opus" or "19").
59 Technically optional, but strongly recommended.
60 * format_note Additional info about the format
61 ("3D" or "DASH video")
62 * width Width of the video, if known
63 * height Height of the video, if known
64 * resolution Textual description of width and height
65 * tbr Average bitrate of audio and video in KBit/s
66 * abr Average audio bitrate in KBit/s
67 * acodec Name of the audio codec in use
68 * asr Audio sampling rate in Hertz
69 * vbr Average video bitrate in KBit/s
70 * vcodec Name of the video codec in use
71 * container Name of the container format
72 * filesize The number of bytes, if known in advance
73 * filesize_approx An estimate for the number of bytes
74 * player_url SWF Player URL (used for rtmpdump).
75 * protocol The protocol that will be used for the actual
77 "http", "https", "rtsp", "rtmp", "m3u8" or so.
78 * preference Order number of this format. If this field is
79 present and not None, the formats get sorted
80 by this field, regardless of all other values.
81 -1 for default (order by other properties),
82 -2 or smaller for less than default.
83 * quality Order number of the video quality of this
84 format, irrespective of the file format.
85 -1 for default (order by other properties),
86 -2 or smaller for less than default.
87 * http_referer HTTP Referer header value to set.
88 * http_method HTTP method to use for the download.
89 * http_headers A dictionary of additional HTTP headers
90 to add to the request.
91 * http_post_data Additional data to send with a POST
94 ext: Video filename extension.
95 format: The video format, defaults to ext (used for --get-format)
96 player_url: SWF Player URL (used for rtmpdump).
98 The following fields are optional:
100 display_id An alternative identifier for the video, not necessarily
101 unique, but available before title. Typically, id is
102 something like "4234987", title "Dancing naked mole rats",
103 and display_id "dancing-naked-mole-rats"
104 thumbnails: A list of dictionaries, with the following entries:
106 * "width" (optional, int)
107 * "height" (optional, int)
108                        * "resolution" (optional, string "{width}x{height}",
110 thumbnail: Full URL to a video thumbnail image.
111 description: One-line video description.
112 uploader: Full name of the video uploader.
113 timestamp: UNIX timestamp of the moment the video became available.
114 upload_date: Video upload date (YYYYMMDD).
115 If not explicitly set, calculated from timestamp.
116 uploader_id: Nickname or id of the video uploader.
117 location: Physical location of the video.
118 subtitles: The subtitle file contents as a dictionary in the format
119 {language: subtitles}.
120 duration: Length of the video in seconds, as an integer.
121 view_count: How many users have watched the video on the platform.
122 like_count: Number of positive ratings of the video
123 dislike_count: Number of negative ratings of the video
124 comment_count: Number of comments on the video
125 age_limit: Age restriction for the video, as an integer (years)
126 webpage_url: The url to the video webpage, if given to youtube-dl it
127 should allow to get the same result again. (It will be set
128 by YoutubeDL if it's missing)
129 categories: A list of categories that the video falls in, for example
132 Unless mentioned otherwise, the fields should be Unicode strings.
134 Subclasses of this one should re-define the _real_initialize() and
135 _real_extract() methods and define a _VALID_URL regexp.
136 Probably, they should also be added to the list of extractors.
138 Finally, the _WORKING attribute should be set to False for broken IEs
139 in order to warn the users and skip the tests.
146 def __init__(self, downloader=None):
147 """Constructor. Receives an optional downloader."""
149 self.set_downloader(downloader)
152 def suitable(cls, url):
153 """Receives a URL and returns True if suitable for this IE."""
155 # This does not use has/getattr intentionally - we want to know whether
156 # we have cached the regexp for *this* class, whereas getattr would also
157 # match the superclass
158 if '_VALID_URL_RE' not in cls.__dict__:
159 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
160 return cls._VALID_URL_RE.match(url) is not None
164 """Getter method for _WORKING."""
167 def initialize(self):
168 """Initializes an instance (authentication, etc)."""
170 self._real_initialize()
173 def extract(self, url):
174 """Extracts URL information and returns it in list of dicts."""
176 return self._real_extract(url)
178 def set_downloader(self, downloader):
179 """Sets the downloader for this IE."""
180 self._downloader = downloader
182 def _real_initialize(self):
183 """Real initialization process. Redefine in subclasses."""
186 def _real_extract(self, url):
187 """Real extraction process. Redefine in subclasses."""
192 """A string for getting the InfoExtractor with get_info_extractor"""
193 return cls.__name__[:-2]
197 return type(self).__name__[:-2]
199 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
200 """ Returns the response handle """
202 self.report_download_webpage(video_id)
203 elif note is not False:
205 self.to_screen(u'%s' % (note,))
207 self.to_screen(u'%s: %s' % (video_id, note))
209 return self._downloader.urlopen(url_or_request)
210 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
214 errnote = u'Unable to download webpage'
215 errmsg = u'%s: %s' % (errnote, compat_str(err))
217 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
219 self._downloader.report_warning(errmsg)
222 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
223 """ Returns a tuple (page content as string, URL handle) """
225 # Strip hashes from the URL (#1038)
226 if isinstance(url_or_request, (compat_str, str)):
227 url_or_request = url_or_request.partition('#')[0]
229 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
233 content_type = urlh.headers.get('Content-Type', '')
234 webpage_bytes = urlh.read()
235 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
237 encoding = m.group(1)
239 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
240 webpage_bytes[:1024])
242 encoding = m.group(1).decode('ascii')
243 elif webpage_bytes.startswith(b'\xff\xfe'):
247 if self._downloader.params.get('dump_intermediate_pages', False):
249 url = url_or_request.get_full_url()
250 except AttributeError:
252 self.to_screen(u'Dumping request to ' + url)
253 dump = base64.b64encode(webpage_bytes).decode('ascii')
254 self._downloader.to_screen(dump)
255 if self._downloader.params.get('write_pages', False):
257 url = url_or_request.get_full_url()
258 except AttributeError:
260 basen = '%s_%s' % (video_id, url)
262 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
263 basen = basen[:240 - len(h)] + h
264 raw_filename = basen + '.dump'
265 filename = sanitize_filename(raw_filename, restricted=True)
266 self.to_screen(u'Saving request to ' + filename)
267 with open(filename, 'wb') as outf:
268 outf.write(webpage_bytes)
271 content = webpage_bytes.decode(encoding, 'replace')
273 content = webpage_bytes.decode('utf-8', 'replace')
275 if (u'<title>Access to this site is blocked</title>' in content and
276 u'Websense' in content[:512]):
277 msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
278 blocked_iframe = self._html_search_regex(
279 r'<iframe src="([^"]+)"', content,
280 u'Websense information URL', default=None)
282 msg += u' Visit %s for more details' % blocked_iframe
283 raise ExtractorError(msg, expected=True)
285 return (content, urlh)
287 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
288 """ Returns the data of the page as a string """
289 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
296 def _download_xml(self, url_or_request, video_id,
297 note=u'Downloading XML', errnote=u'Unable to download XML',
298 transform_source=None, fatal=True):
299 """Return the xml as an xml.etree.ElementTree.Element"""
300 xml_string = self._download_webpage(
301 url_or_request, video_id, note, errnote, fatal=fatal)
302 if xml_string is False:
305 xml_string = transform_source(xml_string)
306 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
308 def _download_json(self, url_or_request, video_id,
309 note=u'Downloading JSON metadata',
310 errnote=u'Unable to download JSON metadata',
311 transform_source=None,
313 json_string = self._download_webpage(
314 url_or_request, video_id, note, errnote, fatal=fatal)
315 if (not fatal) and json_string is False:
318 json_string = transform_source(json_string)
320 return json.loads(json_string)
321 except ValueError as ve:
322 raise ExtractorError('Failed to download JSON', cause=ve)
324 def report_warning(self, msg, video_id=None):
325 idstr = u'' if video_id is None else u'%s: ' % video_id
326 self._downloader.report_warning(
327 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
329 def to_screen(self, msg):
330 """Print msg to screen, prefixing it with '[ie_name]'"""
331 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
333 def report_extraction(self, id_or_name):
334 """Report information extraction."""
335 self.to_screen(u'%s: Extracting information' % id_or_name)
337 def report_download_webpage(self, video_id):
338 """Report webpage download."""
339 self.to_screen(u'%s: Downloading webpage' % video_id)
341 def report_age_confirmation(self):
342 """Report attempt to confirm age."""
343 self.to_screen(u'Confirming age')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 #Methods for following #608
351 def url_result(url, ie=None, video_id=None):
352 """Returns a url that points to a page that should be processed"""
353 #TODO: ie should be the class used for getting the info
354 video_info = {'_type': 'url',
357 if video_id is not None:
358 video_info['id'] = video_id
361 def playlist_result(entries, playlist_id=None, playlist_title=None):
362 """Returns a playlist"""
363 video_info = {'_type': 'playlist',
366 video_info['id'] = playlist_id
368 video_info['title'] = playlist_title
371 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
373 Perform a regex search on the given string, using a single or a list of
374 patterns returning the first matching group.
375 In case of failure return a default value or raise a WARNING or a
376 RegexNotFoundError, depending on fatal, specifying the field name.
378 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
379 mobj = re.search(pattern, string, flags)
382 mobj = re.search(p, string, flags)
386 if os.name != 'nt' and sys.stderr.isatty():
387 _name = u'\033[0;34m%s\033[0m' % name
392 # return the first matching group
393 return next(g for g in mobj.groups() if g is not None)
394 elif default is not _NO_DEFAULT:
397 raise RegexNotFoundError(u'Unable to extract %s' % _name)
399 self._downloader.report_warning(u'unable to extract %s; '
400 u'please report this issue on http://yt-dl.org/bug' % _name)
403 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
405 Like _search_regex, but strips HTML tags and unescapes entities.
407 res = self._search_regex(pattern, string, name, default, fatal, flags)
409 return clean_html(res).strip()
413 def _get_login_info(self):
415         Get the login info as (username, password)
416 It will look in the netrc file using the _NETRC_MACHINE value
417 If there's no info available, return (None, None)
419 if self._downloader is None:
424 downloader_params = self._downloader.params
426 # Attempt to use provided username and password or .netrc data
427 if downloader_params.get('username', None) is not None:
428 username = downloader_params['username']
429 password = downloader_params['password']
430 elif downloader_params.get('usenetrc', False):
432 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
437 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
438 except (IOError, netrc.NetrcParseError) as err:
439 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
441 return (username, password)
443 def _get_tfa_info(self):
445 Get the two-factor authentication info
446 TODO - asking the user will be required for sms/phone verify
447 currently just uses the command line option
448 If there's no info available, return None
450 if self._downloader is None:
452 downloader_params = self._downloader.params
454 if downloader_params.get('twofactor', None) is not None:
455 return downloader_params['twofactor']
459 # Helper functions for extracting OpenGraph info
461 def _og_regexes(prop):
462 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
463 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
464 template = r'<meta[^>]+?%s[^>]+?%s'
466 template % (property_re, content_re),
467 template % (content_re, property_re),
470 def _og_search_property(self, prop, html, name=None, **kargs):
472 name = 'OpenGraph %s' % prop
473 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
476 return unescapeHTML(escaped)
478 def _og_search_thumbnail(self, html, **kargs):
479 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
481 def _og_search_description(self, html, **kargs):
482 return self._og_search_property('description', html, fatal=False, **kargs)
484 def _og_search_title(self, html, **kargs):
485 return self._og_search_property('title', html, **kargs)
487 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
488 regexes = self._og_regexes('video') + self._og_regexes('video:url')
490 regexes = self._og_regexes('video:secure_url') + regexes
491 return self._html_search_regex(regexes, html, name, **kargs)
493 def _og_search_url(self, html, **kargs):
494 return self._og_search_property('url', html, **kargs)
496 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
497 if display_name is None:
499 return self._html_search_regex(
501 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
502 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
503 html, display_name, fatal=fatal, **kwargs)
505 def _dc_search_uploader(self, html):
506 return self._html_search_meta('dc.creator', html, 'uploader')
508 def _rta_search(self, html):
509 # See http://www.rtalabel.org/index.php?content=howtofaq#single
510 if re.search(r'(?ix)<meta\s+name="rating"\s+'
511 r' content="RTA-5042-1996-1400-1577-RTA"',
516 def _media_rating_search(self, html):
517 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
518 rating = self._html_search_meta('rating', html)
530 return RATING_TABLE.get(rating.lower(), None)
532 def _twitter_search_player(self, html):
533 return self._html_search_meta('twitter:player', html,
534 'twitter card player')
536 def _sort_formats(self, formats):
538 raise ExtractorError(u'No video formats found')
541 # TODO remove the following workaround
542 from ..utils import determine_ext
543 if not f.get('ext') and 'url' in f:
544 f['ext'] = determine_ext(f['url'])
546 preference = f.get('preference')
547 if preference is None:
548 proto = f.get('protocol')
550 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
552 preference = 0 if proto in ['http', 'https'] else -0.1
553 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
556 if f.get('vcodec') == 'none': # audio only
557 if self._downloader.params.get('prefer_free_formats'):
558 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
560 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
563 audio_ext_preference = ORDER.index(f['ext'])
565 audio_ext_preference = -1
567 if self._downloader.params.get('prefer_free_formats'):
568 ORDER = [u'flv', u'mp4', u'webm']
570 ORDER = [u'webm', u'flv', u'mp4']
572 ext_preference = ORDER.index(f['ext'])
575 audio_ext_preference = 0
579 f.get('quality') if f.get('quality') is not None else -1,
580 f.get('height') if f.get('height') is not None else -1,
581 f.get('width') if f.get('width') is not None else -1,
583 f.get('tbr') if f.get('tbr') is not None else -1,
584 f.get('vbr') if f.get('vbr') is not None else -1,
585 f.get('abr') if f.get('abr') is not None else -1,
586 audio_ext_preference,
587 f.get('filesize') if f.get('filesize') is not None else -1,
588 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
591 formats.sort(key=_formats_key)
593 def http_scheme(self):
594         """ Either "http:" or "https:", depending on the user's preferences """
597 if self._downloader.params.get('prefer_insecure', False)
600 def _proto_relative_url(self, url, scheme=None):
603 if url.startswith('//'):
605 scheme = self.http_scheme()
610 def _sleep(self, timeout, video_id, msg_template=None):
611 if msg_template is None:
612 msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
613 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
617 def _extract_f4m_formats(self, manifest_url, video_id):
618 manifest = self._download_xml(
619 manifest_url, video_id, 'Downloading f4m manifest',
620 'Unable to download f4m manifest')
623 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
624 for i, media_el in enumerate(media_nodes):
625 tbr = int_or_none(media_el.attrib.get('bitrate'))
626 format_id = 'f4m-%d' % (i if tbr is None else tbr)
628 'format_id': format_id,
632 'width': int_or_none(media_el.attrib.get('width')),
633 'height': int_or_none(media_el.attrib.get('height')),
635 self._sort_formats(formats)
640 class SearchInfoExtractor(InfoExtractor):
642 Base class for paged search queries extractors.
643 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
644 Instances should define _SEARCH_KEY and _MAX_RESULTS.
648 def _make_valid_url(cls):
649 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
652 def suitable(cls, url):
653 return re.match(cls._make_valid_url(), url) is not None
655 def _real_extract(self, query):
656 mobj = re.match(self._make_valid_url(), query)
658 raise ExtractorError(u'Invalid search query "%s"' % query)
660 prefix = mobj.group('prefix')
661 query = mobj.group('query')
663 return self._get_n_results(query, 1)
664 elif prefix == 'all':
665 return self._get_n_results(query, self._MAX_RESULTS)
669 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
670 elif n > self._MAX_RESULTS:
671 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
672 n = self._MAX_RESULTS
673 return self._get_n_results(query, n)
675 def _get_n_results(self, query, n):
676 """Get a specified number of results for a query"""
677 raise NotImplementedError("This method must be implemented by subclasses")
680 def SEARCH_KEY(self):
681 return self._SEARCH_KEY