1 from __future__ import unicode_literals
12 import xml.etree.ElementTree
17 compat_urllib_parse_urlparse,
# Unique sentinel meaning "no default was supplied", so that None remains a
# legitimate default value for _search_regex and friends.
28 _NO_DEFAULT = object()
# Base class for all site-specific extractors.  The body below is the class
# docstring describing the info-dict contract; the methods follow.
# NOTE(review): this capture is an elided listing — blank lines and some
# original lines are missing between the numbered lines.
31 class InfoExtractor(object):
32 """Information Extractor class.
34 Information extractors are the classes that, given a URL, extract
35 information about the video (or videos) the URL refers to. This
36 information includes the real video URL, the video title, author and
37 others. The information is stored in a dictionary which is then
38 passed to the FileDownloader. The FileDownloader processes this
39 information possibly downloading the video to the file system, among
40 other possible outcomes.
42 The dictionaries must include the following fields:
45 title: Video title, unescaped.
47 Additionally, it must contain either a formats entry or a url one:
49 formats: A list of dictionaries for each format available, ordered
50 from worst to best quality.
53 * url Mandatory. The URL of the video file
54 * ext Will be calculated from url if missing
55 * format A human-readable description of the format
56 ("mp4 container with h264/opus").
57 Calculated from the format_id, width, height,
58 and format_note fields if missing.
59 * format_id A short description of the format
60 ("mp4_h264_opus" or "19").
61 Technically optional, but strongly recommended.
62 * format_note Additional info about the format
63 ("3D" or "DASH video")
64 * width Width of the video, if known
65 * height Height of the video, if known
66 * resolution Textual description of width and height
67 * tbr Average bitrate of audio and video in KBit/s
68 * abr Average audio bitrate in KBit/s
69 * acodec Name of the audio codec in use
70 * asr Audio sampling rate in Hertz
71 * vbr Average video bitrate in KBit/s
72 * vcodec Name of the video codec in use
73 * container Name of the container format
74 * filesize The number of bytes, if known in advance
75 * filesize_approx An estimate for the number of bytes
76 * player_url SWF Player URL (used for rtmpdump).
77 * protocol The protocol that will be used for the actual
79 "http", "https", "rtsp", "rtmp", "m3u8" or so.
80 * preference Order number of this format. If this field is
81 present and not None, the formats get sorted
82 by this field, regardless of all other values.
83 -1 for default (order by other properties),
84 -2 or smaller for less than default.
85 * quality Order number of the video quality of this
86 format, irrespective of the file format.
87 -1 for default (order by other properties),
88 -2 or smaller for less than default.
89 * http_referer HTTP Referer header value to set.
90 * http_method HTTP method to use for the download.
91 * http_headers A dictionary of additional HTTP headers
92 to add to the request.
93 * http_post_data Additional data to send with a POST
96 ext: Video filename extension.
97 format: The video format, defaults to ext (used for --get-format)
98 player_url: SWF Player URL (used for rtmpdump).
100 The following fields are optional:
102 display_id An alternative identifier for the video, not necessarily
103 unique, but available before title. Typically, id is
104 something like "4234987", title "Dancing naked mole rats",
105 and display_id "dancing-naked-mole-rats"
106 thumbnails: A list of dictionaries, with the following entries:
108 * "width" (optional, int)
109 * "height" (optional, int)
110 * "resolution" (optional, string "{width}x{height}",
112 thumbnail: Full URL to a video thumbnail image.
113 description: One-line video description.
114 uploader: Full name of the video uploader.
115 timestamp: UNIX timestamp of the moment the video became available.
116 upload_date: Video upload date (YYYYMMDD).
117 If not explicitly set, calculated from timestamp.
118 uploader_id: Nickname or id of the video uploader.
119 location: Physical location where the video was filmed.
120 subtitles: The subtitle file contents as a dictionary in the format
121 {language: subtitles}.
122 duration: Length of the video in seconds, as an integer.
123 view_count: How many users have watched the video on the platform.
124 like_count: Number of positive ratings of the video
125 dislike_count: Number of negative ratings of the video
126 comment_count: Number of comments on the video
127 age_limit: Age restriction for the video, as an integer (years)
128 webpage_url: The url to the video webpage, if given to youtube-dl it
129 should allow to get the same result again. (It will be set
130 by YoutubeDL if it's missing)
131 categories: A list of categories that the video falls in, for example
133 is_live: True, False, or None (=unknown). Whether this video is a
134 live stream that goes on instead of a fixed-length video.
136 Unless mentioned otherwise, the fields should be Unicode strings.
138 Subclasses of this one should re-define the _real_initialize() and
139 _real_extract() methods and define a _VALID_URL regexp.
140 Probably, they should also be added to the list of extractors.
142 Finally, the _WORKING attribute should be set to False for broken IEs
143 in order to warn the users and skip the tests.
# Store the (optional) downloader via set_downloader().
# NOTE(review): an original line is elided here (upstream also resets the
# lazy-initialization flag in the constructor) — confirm before editing.
150 def __init__(self, downloader=None):
151 """Constructor. Receives an optional downloader."""
153 self.set_downloader(downloader)
# Class-level URL check: compiles _VALID_URL once per concrete class and
# caches it in that class's own __dict__ (not inherited from a superclass).
# NOTE(review): takes `cls` — presumably decorated @classmethod on the elided
# preceding line; confirm against the full file.
156 def suitable(cls, url):
157 """Receives a URL and returns True if suitable for this IE."""
159 # This does not use has/getattr intentionally - we want to know whether
160 # we have cached the regexp for *this* class, whereas getattr would also
161 # match the superclass
162 if '_VALID_URL_RE' not in cls.__dict__:
163 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
164 return cls._VALID_URL_RE.match(url) is not None
# Fragments of the public lifecycle API: the _WORKING getter (its `def` line
# is elided), initialize() which delegates to _real_initialize(), and
# extract() which delegates to _real_extract().
# NOTE(review): guard lines (e.g. the one-time-initialization check) are
# elided in this capture — do not edit these bodies from here.
168 """Getter method for _WORKING."""
171 def initialize(self):
172 """Initializes an instance (authentication, etc)."""
174 self._real_initialize()
177 def extract(self, url):
178 """Extracts URL information and returns it in list of dicts."""
180 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach the downloader instance this IE reports and fetches through."""
    self._downloader = downloader
# Subclass hooks (default no-op bodies are elided) plus fragments of the
# ie_key()/IE_NAME identifier helpers, both of which strip the trailing
# "IE" from the class name.
186 def _real_initialize(self):
187 """Real initialization process. Redefine in subclasses."""
190 def _real_extract(self, url):
191 """Real extraction process. Redefine in subclasses."""
196 """A string for getting the InfoExtractor with get_info_extractor"""
197 return cls.__name__[:-2]
201 return type(self).__name__[:-2]
# Open url_or_request through the downloader and return the raw response
# handle; on network errors either raise ExtractorError (fatal) or emit a
# warning.  NOTE(review): the try/if framing lines are elided in this
# capture — the visible lines are not a complete body.
203 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
204 """ Returns the response handle """
206 self.report_download_webpage(video_id)
207 elif note is not False:
209 self.to_screen('%s' % (note,))
211 self.to_screen('%s: %s' % (video_id, note))
213 return self._downloader.urlopen(url_or_request)
214 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
218 errnote = 'Unable to download webpage'
219 errmsg = '%s: %s' % (errnote, compat_str(err))
221 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
223 self._downloader.report_warning(errmsg)
# Download a page and decode it to text: strips URL fragments (#1038),
# sniffs the charset from the Content-Type header, a <meta charset> tag in
# the first 1024 bytes, or a UTF-16 BOM; optionally dumps/saves the raw
# bytes for debugging; detects Websense block pages and raises.
# NOTE(review): several framing lines (try/except/else, some branches) are
# elided in this capture — the visible lines are not a complete body.
226 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
227 """ Returns a tuple (page content as string, URL handle) """
229 # Strip hashes from the URL (#1038)
230 if isinstance(url_or_request, (compat_str, str)):
231 url_or_request = url_or_request.partition('#')[0]
233 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
237 content_type = urlh.headers.get('Content-Type', '')
238 webpage_bytes = urlh.read()
239 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
241 encoding = m.group(1)
243 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
244 webpage_bytes[:1024])
246 encoding = m.group(1).decode('ascii')
247 elif webpage_bytes.startswith(b'\xff\xfe'):
251 if self._downloader.params.get('dump_intermediate_pages', False):
253 url = url_or_request.get_full_url()
254 except AttributeError:
256 self.to_screen('Dumping request to ' + url)
257 dump = base64.b64encode(webpage_bytes).decode('ascii')
258 self._downloader.to_screen(dump)
259 if self._downloader.params.get('write_pages', False):
261 url = url_or_request.get_full_url()
262 except AttributeError:
264 basen = '%s_%s' % (video_id, url)
266 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
267 basen = basen[:240 - len(h)] + h
268 raw_filename = basen + '.dump'
269 filename = sanitize_filename(raw_filename, restricted=True)
270 self.to_screen('Saving request to ' + filename)
271 with open(filename, 'wb') as outf:
272 outf.write(webpage_bytes)
275 content = webpage_bytes.decode(encoding, 'replace')
277 content = webpage_bytes.decode('utf-8', 'replace')
279 if ('<title>Access to this site is blocked</title>' in content and
280 'Websense' in content[:512]):
281 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
282 blocked_iframe = self._html_search_regex(
283 r'<iframe src="([^"]+)"', content,
284 'Websense information URL', default=None)
286 msg += ' Visit %s for more details' % blocked_iframe
287 raise ExtractorError(msg, expected=True)
289 return (content, urlh)
# Convenience wrapper: returns only the decoded page text from
# _download_webpage_handle.  NOTE(review): the unpacking/False-propagation
# lines after the call are elided in this capture.
291 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
292 """ Returns the data of the page as a string """
293 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
# Download a URL and parse it as XML; an optional transform_source hook
# rewrites the text before parsing.  NOTE(review): the early-return and the
# transform_source guard lines are elided in this capture.
300 def _download_xml(self, url_or_request, video_id,
301 note='Downloading XML', errnote='Unable to download XML',
302 transform_source=None, fatal=True):
303 """Return the xml as an xml.etree.ElementTree.Element"""
304 xml_string = self._download_webpage(
305 url_or_request, video_id, note, errnote, fatal=fatal)
306 if xml_string is False:
309 xml_string = transform_source(xml_string)
310 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
# Download a URL and parse it as JSON; wraps ValueError from json.loads in
# ExtractorError.  NOTE(review): the fatal parameter line, the
# transform_source guard, and the try: line are elided in this capture.
312 def _download_json(self, url_or_request, video_id,
313 note='Downloading JSON metadata',
314 errnote='Unable to download JSON metadata',
315 transform_source=None,
317 json_string = self._download_webpage(
318 url_or_request, video_id, note, errnote, fatal=fatal)
319 if (not fatal) and json_string is False:
322 json_string = transform_source(json_string)
324 return json.loads(json_string)
325 except ValueError as ve:
326 raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
    """Forward *msg* to the downloader's warning channel, tagged with the
    IE name and, when given, the video id."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Write *msg* to the screen through the downloader, prefixed with
    '[ie_name]'."""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction is starting for *id_or_name*."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the webpage for *video_id* is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm the user's age."""
    # Fixed message; no video id involved.
    self.to_screen('Confirming age')
def report_login(self):
    """Announce that a login attempt is about to be made."""
    msg = 'Logging in'
    self.to_screen(msg)
# Build a '_type': 'url' info dict pointing at another page to process.
# NOTE(review): the @staticmethod decorator, the remaining dict entries
# ('url', 'ie_key') and the final return are elided in this capture.
353 #Methods for following #608
355 def url_result(url, ie=None, video_id=None):
356 """Returns a url that points to a page that should be processed"""
357 #TODO: ie should be the class used for getting the info
358 video_info = {'_type': 'url',
361 if video_id is not None:
362 video_info['id'] = video_id
# Build a '_type': 'playlist' info dict from entries plus optional id/title.
# NOTE(review): decorator, 'entries' dict entry, the None-guards and the
# final return are elided in this capture.
365 def playlist_result(entries, playlist_id=None, playlist_title=None):
366 """Returns a playlist"""
367 video_info = {'_type': 'playlist',
370 video_info['id'] = playlist_id
372 video_info['title'] = playlist_title
# Core regex helper: accepts one pattern or a list, returns the first
# non-None group; on miss returns `default` (the _NO_DEFAULT sentinel means
# "no default given"), raises RegexNotFoundError when fatal, or warns.
# The \033[0;34m...\033[0m wrapper colors the field name blue on ANSI ttys.
# NOTE(review): the loop over multiple patterns and several branch lines
# are elided in this capture.
375 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
377 Perform a regex search on the given string, using a single or a list of
378 patterns returning the first matching group.
379 In case of failure return a default value or raise a WARNING or a
380 RegexNotFoundError, depending on fatal, specifying the field name.
382 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
383 mobj = re.search(pattern, string, flags)
386 mobj = re.search(p, string, flags)
390 if os.name != 'nt' and sys.stderr.isatty():
391 _name = '\033[0;34m%s\033[0m' % name
396 # return the first matching group
397 return next(g for g in mobj.groups() if g is not None)
398 elif default is not _NO_DEFAULT:
401 raise RegexNotFoundError('Unable to extract %s' % _name)
403 self._downloader.report_warning('unable to extract %s; '
404 'please report this issue on http://yt-dl.org/bug' % _name)
# _search_regex variant that cleans the matched HTML (strip tags, unescape
# entities) before returning.  NOTE(review): the None-guard around the
# clean_html call is elided in this capture.
407 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
409 Like _search_regex, but strips HTML tags and unescapes entities.
411 res = self._search_regex(pattern, string, name, default, fatal, flags)
413 return clean_html(res).strip()
# Resolve credentials from downloader params, falling back to the user's
# .netrc entry for _NETRC_MACHINE; .netrc parse failures are downgraded to
# a warning.  NOTE(review): early returns, the try: line and the
# netrc-success branch are elided in this capture.
417 def _get_login_info(self):
419 Get the login info as (username, password)
420 It will look in the netrc file using the _NETRC_MACHINE value
421 If there's no info available, return (None, None)
423 if self._downloader is None:
428 downloader_params = self._downloader.params
430 # Attempt to use provided username and password or .netrc data
431 if downloader_params.get('username', None) is not None:
432 username = downloader_params['username']
433 password = downloader_params['password']
434 elif downloader_params.get('usenetrc', False):
436 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
441 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
442 except (IOError, netrc.NetrcParseError) as err:
443 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
445 return (username, password)
# Return the two-factor code supplied via the 'twofactor' downloader param,
# if any.  NOTE(review): the early `return None` lines are elided in this
# capture.
447 def _get_tfa_info(self):
449 Get the two-factor authentication info
450 TODO - asking the user will be required for sms/phone verify
451 currently just uses the command line option
452 If there's no info available, return None
454 if self._downloader is None:
456 downloader_params = self._downloader.params
458 if downloader_params.get('twofactor', None) is not None:
459 return downloader_params['twofactor']
# Build the two regexes that match an OpenGraph <meta> tag for `prop`,
# accepting property/content attributes in either order and either quote
# style.  NOTE(review): the @staticmethod decorator and the surrounding
# `return [` / `]` lines are elided in this capture.
463 # Helper functions for extracting OpenGraph info
465 def _og_regexes(prop):
466 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
467 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
468 template = r'<meta[^>]+?%s[^>]+?%s'
470 template % (property_re, content_re),
471 template % (content_re, property_re),
# Generic OpenGraph lookup: search the page for og:<prop> and return the
# HTML-unescaped content value.  NOTE(review): the name-defaulting guard and
# the None early-return are elided in this capture.
474 def _og_search_property(self, prop, html, name=None, **kargs):
476 name = 'OpenGraph %s' % prop
477 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
480 return unescapeHTML(escaped)
482 def _og_search_thumbnail(self, html, **kargs):
483 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
485 def _og_search_description(self, html, **kargs):
486 return self._og_search_property('description', html, fatal=False, **kargs)
488 def _og_search_title(self, html, **kargs):
489 return self._og_search_property('title', html, **kargs)
# Find the og:video / og:video:url meta values; when `secure` is requested
# the og:video:secure_url patterns are tried first.  NOTE(review): the
# `if secure:` guard line is elided in this capture.
491 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
492 regexes = self._og_regexes('video') + self._og_regexes('video:url')
494 regexes = self._og_regexes('video:secure_url') + regexes
495 return self._html_search_regex(regexes, html, name, **kargs)
497 def _og_search_url(self, html, **kargs):
498 return self._og_search_property('url', html, **kargs)
# Look up a <meta> tag by itemprop/name/property and return its content
# attribute via _html_search_regex.  NOTE(review): the display_name
# defaulting line and the opening of the verbose regex literal are elided
# in this capture.
500 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
501 if display_name is None:
503 return self._html_search_regex(
505 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
506 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
507 html, display_name, fatal=fatal, **kwargs)
509 def _dc_search_uploader(self, html):
510 return self._html_search_meta('dc.creator', html, 'uploader')
# Detect the RTA (Restricted To Adults) label meta tag.
# NOTE(review): the closing arguments of re.search and the return lines
# (age limit on match / 0 otherwise) are elided in this capture.
512 def _rta_search(self, html):
513 # See http://www.rtalabel.org/index.php?content=howtofaq#single
514 if re.search(r'(?ix)<meta\s+name="rating"\s+'
515 r' content="RTA-5042-1996-1400-1577-RTA"',
# Map an ICRA-style "rating" meta value to an age limit via RATING_TABLE.
# NOTE(review): the RATING_TABLE definition and the rating None-guard are
# elided in this capture.
520 def _media_rating_search(self, html):
521 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
522 rating = self._html_search_meta('rating', html)
534 return RATING_TABLE.get(rating.lower(), None)
536 def _twitter_search_player(self, html):
537 return self._html_search_meta('twitter:player', html,
538 'twitter card player')
# Sort `formats` in place from worst to best using a composite key:
# explicit preference, protocol (http/https preferred over others),
# vcodec=='none' audio-only handling, free-format ext ordering (honoring
# the 'prefer_free_formats' downloader param), then quality/height/width,
# bitrates, and (approximate) filesize; missing values sort as -1.
# NOTE(review): large parts of the inner _formats_key function (its def
# line, try/except framing, several branches and tuple entries) are elided
# in this capture — the visible lines are not a complete body.
540 def _sort_formats(self, formats):
542 raise ExtractorError('No video formats found')
545 # TODO remove the following workaround
546 from ..utils import determine_ext
547 if not f.get('ext') and 'url' in f:
548 f['ext'] = determine_ext(f['url'])
550 preference = f.get('preference')
551 if preference is None:
552 proto = f.get('protocol')
554 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
556 preference = 0 if proto in ['http', 'https'] else -0.1
557 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
560 if f.get('vcodec') == 'none': # audio only
561 if self._downloader.params.get('prefer_free_formats'):
562 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
564 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
567 audio_ext_preference = ORDER.index(f['ext'])
569 audio_ext_preference = -1
571 if self._downloader.params.get('prefer_free_formats'):
572 ORDER = ['flv', 'mp4', 'webm']
574 ORDER = ['webm', 'flv', 'mp4']
576 ext_preference = ORDER.index(f['ext'])
579 audio_ext_preference = 0
583 f.get('quality') if f.get('quality') is not None else -1,
584 f.get('height') if f.get('height') is not None else -1,
585 f.get('width') if f.get('width') is not None else -1,
587 f.get('tbr') if f.get('tbr') is not None else -1,
588 f.get('vbr') if f.get('vbr') is not None else -1,
589 f.get('abr') if f.get('abr') is not None else -1,
590 audio_ext_preference,
591 f.get('filesize') if f.get('filesize') is not None else -1,
592 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
595 formats.sort(key=_formats_key)
# Pick the URL scheme based on the 'prefer_insecure' downloader param.
# NOTE(review): the surrounding return expression lines are elided in this
# capture.  (Docstring corrected: it previously read '"https:" or "https:"'.)
597 def http_scheme(self):
598 """ Either "http:" or "https:", depending on the user's preferences """
601 if self._downloader.params.get('prefer_insecure', False)
# Resolve a protocol-relative URL ('//host/path') against `scheme`,
# defaulting to http_scheme().  NOTE(review): the None-guards and return
# lines are elided in this capture.
604 def _proto_relative_url(self, url, scheme=None):
607 if url.startswith('//'):
609 scheme = self.http_scheme()
# Announce and then perform a wait of `timeout` seconds; the message
# template may reference %(video_id)s and %(timeout)s.  NOTE(review): the
# to_screen and time.sleep lines are elided in this capture.
614 def _sleep(self, timeout, video_id, msg_template=None):
615 if msg_template is None:
616 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
617 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
# Parse an Adobe HDS (f4m) manifest into format dicts: one per <media>
# node, with format id 'f4m-<bitrate>' (or index when no bitrate).
# NOTE(review): the formats list initialization, the format dict's
# url/ext/tbr entries, and the final return are elided in this capture.
621 def _extract_f4m_formats(self, manifest_url, video_id):
622 manifest = self._download_xml(
623 manifest_url, video_id, 'Downloading f4m manifest',
624 'Unable to download f4m manifest')
627 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
628 for i, media_el in enumerate(media_nodes):
629 tbr = int_or_none(media_el.attrib.get('bitrate'))
630 format_id = 'f4m-%d' % (i if tbr is None else tbr)
632 'format_id': format_id,
636 'width': int_or_none(media_el.attrib.get('width')),
637 'height': int_or_none(media_el.attrib.get('height')),
639 self._sort_formats(formats)
# Parse an HLS (m3u8) master playlist: scan #EXT-X-STREAM-INF attribute
# lists (BANDWIDTH, CODECS, RESOLUTION) with kv_rex and pair each with the
# following variant URL line, producing one format dict per variant plus a
# meta entry for the playlist itself.
# NOTE(review): the meta-format dict framing, kv_rex compilation line,
# several branch/assignment lines and the final return are elided in this
# capture — the visible lines are not a complete body.
643 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
645 'format_id': 'm3u8-meta',
650 'resolution': 'multiple',
651 'format_note': 'Quality selection URL',
654 m3u8_doc = self._download_webpage(m3u8_url, video_id)
657 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
658 for line in m3u8_doc.splitlines():
659 if line.startswith('#EXT-X-STREAM-INF:'):
661 for m in kv_rex.finditer(line):
663 if v.startswith('"'):
665 last_info[m.group('key')] = v
666 elif line.startswith('#') or not line.strip():
669 if last_info is None:
670 formats.append({'url': line})
672 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
675 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
680 codecs = last_info.get('CODECS')
682 # TODO: looks like video codec is not always necessarily goes first
683 va_codecs = codecs.split(',')
685 f['vcodec'] = va_codecs[0].partition('.')[0]
686 if len(va_codecs) > 1 and va_codecs[1]:
687 f['acodec'] = va_codecs[1].partition('.')[0]
688 resolution = last_info.get('RESOLUTION')
690 width_str, height_str = resolution.split('x')
691 f['width'] = int(width_str)
692 f['height'] = int(height_str)
695 self._sort_formats(formats)
# Base class for "ytsearch5:query"-style paged search extractors.
# NOTE(review): the docstring delimiters and the @classmethod decorators on
# _make_valid_url/suitable are elided in this capture.
699 class SearchInfoExtractor(InfoExtractor):
701 Base class for paged search queries extractors.
702 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
703 Instances should define _SEARCH_KEY and _MAX_RESULTS.
707 def _make_valid_url(cls):
708 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
711 def suitable(cls, url):
712 return re.match(cls._make_valid_url(), url) is not None
# Dispatch a search query: empty prefix means 1 result, 'all' means
# _MAX_RESULTS, a number means that many (validated and capped).
# NOTE(review): the mobj None-guard, the empty-prefix branch, and the
# numeric-prefix else/int() lines are elided in this capture.
714 def _real_extract(self, query):
715 mobj = re.match(self._make_valid_url(), query)
717 raise ExtractorError('Invalid search query "%s"' % query)
719 prefix = mobj.group('prefix')
720 query = mobj.group('query')
722 return self._get_n_results(query, 1)
723 elif prefix == 'all':
724 return self._get_n_results(query, self._MAX_RESULTS)
728 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
729 elif n > self._MAX_RESULTS:
730 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
731 n = self._MAX_RESULTS
732 return self._get_n_results(query, n)
734 def _get_n_results(self, query, n):
735 """Get a specified number of results for a query"""
736 raise NotImplementedError("This method must be implemented by subclasses")
# Expose the class's _SEARCH_KEY.  NOTE(review): the @property decorator
# line is elided in this capture.
739 def SEARCH_KEY(self):
740 return self._SEARCH_KEY