youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21 )
  22 from ..utils import (
  23     clean_html,
  24     compiled_regex_type,
  25     ExtractorError,
  26     float_or_none,
  27     int_or_none,
  28     RegexNotFoundError,
  29     sanitize_filename,
  30     unescapeHTML,
  31 )
  32 _NO_DEFAULT = object()
  33
  34
  35 class InfoExtractor(object):
  36     """Information Extractor class.
  37
  38     Information extractors are the classes that, given a URL, extract
  39     information about the video (or videos) the URL refers to. This
  40     information includes the real video URL, the video title, author and
  41     others. The information is stored in a dictionary which is then
  42     passed to the FileDownloader. The FileDownloader processes this
  43     information possibly downloading the video to the file system, among
  44     other possible outcomes.
  45
  46     The type field determines the type of the result.
  47     By far the most common value (and the default if _type is missing) is
  48     "video", which indicates a single video.
  49
  50     For a video, the dictionaries must include the following fields:
  51
  52     id:             Video identifier.
  53     title:          Video title, unescaped.
  54
  55     Additionally, it must contain either a formats entry or a url one:
  56
  57     formats:        A list of dictionaries for each format available, ordered
  58                     from worst to best quality.
  59
  60                     Potential fields:
  61                     * url        Mandatory. The URL of the video file
  62                     * ext        Will be calculated from url if missing
  63                     * format     A human-readable description of the format
  64                                  ("mp4 container with h264/opus").
  65                                  Calculated from the format_id, width, height.
  66                                  and format_note fields if missing.
  67                     * format_id  A short description of the format
  68                                  ("mp4_h264_opus" or "19").
  69                                 Technically optional, but strongly recommended.
  70                     * format_note Additional info about the format
  71                                  ("3D" or "DASH video")
  72                     * width      Width of the video, if known
  73                     * height     Height of the video, if known
  74                     * resolution Textual description of width and height
  75                     * tbr        Average bitrate of audio and video in KBit/s
  76                     * abr        Average audio bitrate in KBit/s
  77                     * acodec     Name of the audio codec in use
  78                     * asr        Audio sampling rate in Hertz
  79                     * vbr        Average video bitrate in KBit/s
  80                     * fps        Frame rate
  81                     * vcodec     Name of the video codec in use
  82                     * container  Name of the container format
  83                     * filesize   The number of bytes, if known in advance
  84                     * filesize_approx  An estimate for the number of bytes
  85                     * player_url SWF Player URL (used for rtmpdump).
  86                     * protocol   The protocol that will be used for the actual
  87                                  download, lower-case.
  88                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  89                     * preference Order number of this format. If this field is
  90                                  present and not None, the formats get sorted
  91                                  by this field, regardless of all other values.
  92                                  -1 for default (order by other properties),
  93                                  -2 or smaller for less than default.
  94                     * language_preference  Is this in the correct requested
  95                                  language?
  96                                  10 if it's what the URL is about,
  97                                  -1 for default (don't know),
  98                                  -10 otherwise, other values reserved for now.
  99                     * quality    Order number of the video quality of this
 100                                  format, irrespective of the file format.
 101                                  -1 for default (order by other properties),
 102                                  -2 or smaller for less than default.
 103                     * source_preference  Order number for this video source
 104                                   (quality takes higher priority)
 105                                  -1 for default (order by other properties),
 106                                  -2 or smaller for less than default.
 107                     * http_referer  HTTP Referer header value to set.
 108                     * http_method  HTTP method to use for the download.
 109                     * http_headers  A dictionary of additional HTTP headers
 110                                  to add to the request.
 111                     * http_post_data  Additional data to send with a POST
 112                                  request.
 113     url:            Final video URL.
 114     ext:            Video filename extension.
 115     format:         The video format, defaults to ext (used for --get-format)
 116     player_url:     SWF Player URL (used for rtmpdump).
 117
 118     The following fields are optional:
 119
 120     display_id      An alternative identifier for the video, not necessarily
 121                     unique, but available before title. Typically, id is
 122                     something like "4234987", title "Dancing naked mole rats",
 123                     and display_id "dancing-naked-mole-rats"
 124     thumbnails:     A list of dictionaries, with the following entries:
 125                         * "url"
 126                         * "width" (optional, int)
 127                         * "height" (optional, int)
 128                         * "resolution" (optional, string "{width}x{height}",
 129                                         deprecated)
 130     thumbnail:      Full URL to a video thumbnail image.
 131     description:    One-line video description.
 132     uploader:       Full name of the video uploader.
 133     timestamp:      UNIX timestamp of the moment the video became available.
 134     upload_date:    Video upload date (YYYYMMDD).
 135                     If not explicitly set, calculated from timestamp.
 136     uploader_id:    Nickname or id of the video uploader.
 137     location:       Physical location where the video was filmed.
 138     subtitles:      The subtitle file contents as a dictionary in the format
 139                     {language: subtitles}.
 140     duration:       Length of the video in seconds, as an integer.
 141     view_count:     How many users have watched the video on the platform.
 142     like_count:     Number of positive ratings of the video
 143     dislike_count:  Number of negative ratings of the video
 144     comment_count:  Number of comments on the video
 145     age_limit:      Age restriction for the video, as an integer (years)
 146     webpage_url:    The url to the video webpage, if given to youtube-dl it
 147                     should allow to get the same result again. (It will be set
 148                     by YoutubeDL if it's missing)
 149     categories:     A list of categories that the video falls in, for example
 150                     ["Sports", "Berlin"]
 151     is_live:        True, False, or None (=unknown). Whether this video is a
 152                     live stream that goes on instead of a fixed-length video.
 153
 154     Unless mentioned otherwise, the fields should be Unicode strings.
 155
 156     Unless mentioned otherwise, None is equivalent to absence of information.
 157
 158
 159     _type "playlist" indicates multiple videos.
 160     There must be a key "entries", which is a list or a PagedList object, each
 161     element of which is a valid dictionary under this specification.
 162
 163     Additionally, playlists can have "title" and "id" attributes with the same
 164     semantics as videos (see above).
 165
 166
 167     _type "multi_video" indicates that there are multiple videos that
 168     form a single show, for example multiple acts of an opera or TV episode.
 169     It must have an entries key like a playlist and contain all the keys
 170     required for a video at the same time.
 171
 172
 173     _type "url" indicates that the video must be extracted from another
 174     location, possibly by a different extractor. Its only required key is:
 175     "url" - the next URL to extract.
 176
 177     Additionally, it may have properties believed to be identical to the
 178     resolved entity, for example "title" if the title of the referred video is
 179     known ahead of time.
 180
 181
 182     _type "url_transparent" entities have the same specification as "url", but
 183     indicate that the given additional information is more precise than the one
 184     associated with the resolved URL.
 185     This is useful when a site employs a video service that hosts the video and
 186     its technical metadata, but that video service does not embed a useful
 187     title, description etc.
 188
 189
 190     Subclasses of this one should re-define the _real_initialize() and
 191     _real_extract() methods and define a _VALID_URL regexp.
 192     Probably, they should also be added to the list of extractors.
 193
 194     Finally, the _WORKING attribute should be set to False for broken IEs
 195     in order to warn the users and skip the tests.
 196     """
 197
 198     _ready = False
 199     _downloader = None
 200     _WORKING = True
 201
 202     def __init__(self, downloader=None):
 203         """Constructor. Receives an optional downloader."""
 204         self._ready = False
 205         self.set_downloader(downloader)
 206
 207     @classmethod
 208     def suitable(cls, url):
 209         """Receives a URL and returns True if suitable for this IE."""
 210
 211         # This does not use has/getattr intentionally - we want to know whether
 212         # we have cached the regexp for *this* class, whereas getattr would also
 213         # match the superclass
 214         if '_VALID_URL_RE' not in cls.__dict__:
 215             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 216         return cls._VALID_URL_RE.match(url) is not None
 217
 218     @classmethod
 219     def _match_id(cls, url):
 220         if '_VALID_URL_RE' not in cls.__dict__:
 221             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 222         m = cls._VALID_URL_RE.match(url)
 223         assert m
 224         return m.group('id')
 225
 226     @classmethod
 227     def working(cls):
 228         """Getter method for _WORKING."""
 229         return cls._WORKING
 230
 231     def initialize(self):
 232         """Initializes an instance (authentication, etc)."""
 233         if not self._ready:
 234             self._real_initialize()
 235             self._ready = True
 236
 237     def extract(self, url):
 238         """Extracts URL information and returns it in list of dicts."""
 239         self.initialize()
 240         return self._real_extract(url)
 241
 242     def set_downloader(self, downloader):
 243         """Sets the downloader for this IE."""
 244         self._downloader = downloader
 245
 246     def _real_initialize(self):
 247         """Real initialization process. Redefine in subclasses."""
 248         pass
 249
 250     def _real_extract(self, url):
 251         """Real extraction process. Redefine in subclasses."""
 252         pass
 253
 254     @classmethod
 255     def ie_key(cls):
 256         """A string for getting the InfoExtractor with get_info_extractor"""
 257         return cls.__name__[:-2]
 258
 259     @property
 260     def IE_NAME(self):
 261         return type(self).__name__[:-2]
 262
 263     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 264         """ Returns the response handle """
 265         if note is None:
 266             self.report_download_webpage(video_id)
 267         elif note is not False:
 268             if video_id is None:
 269                 self.to_screen('%s' % (note,))
 270             else:
 271                 self.to_screen('%s: %s' % (video_id, note))
 272         try:
 273             return self._downloader.urlopen(url_or_request)
 274         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 275             if errnote is False:
 276                 return False
 277             if errnote is None:
 278                 errnote = 'Unable to download webpage'
 279             errmsg = '%s: %s' % (errnote, compat_str(err))
 280             if fatal:
 281                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 282             else:
 283                 self._downloader.report_warning(errmsg)
 284                 return False
 285
 286     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 287         """ Returns a tuple (page content as string, URL handle) """
 288         # Strip hashes from the URL (#1038)
 289         if isinstance(url_or_request, (compat_str, str)):
 290             url_or_request = url_or_request.partition('#')[0]
 291
 292         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 293         if urlh is False:
 294             assert not fatal
 295             return False
 296         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 297         return (content, urlh)
 298
 299     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
 300         content_type = urlh.headers.get('Content-Type', '')
 301         webpage_bytes = urlh.read()
 302         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 303         if m:
 304             encoding = m.group(1)
 305         else:
 306             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 307                           webpage_bytes[:1024])
 308             if m:
 309                 encoding = m.group(1).decode('ascii')
 310             elif webpage_bytes.startswith(b'\xff\xfe'):
 311                 encoding = 'utf-16'
 312             else:
 313                 encoding = 'utf-8'
 314         if self._downloader.params.get('dump_intermediate_pages', False):
 315             try:
 316                 url = url_or_request.get_full_url()
 317             except AttributeError:
 318                 url = url_or_request
 319             self.to_screen('Dumping request to ' + url)
 320             dump = base64.b64encode(webpage_bytes).decode('ascii')
 321             self._downloader.to_screen(dump)
 322         if self._downloader.params.get('write_pages', False):
 323             try:
 324                 url = url_or_request.get_full_url()
 325             except AttributeError:
 326                 url = url_or_request
 327             basen = '%s_%s' % (video_id, url)
 328             if len(basen) > 240:
 329                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 330                 basen = basen[:240 - len(h)] + h
 331             raw_filename = basen + '.dump'
 332             filename = sanitize_filename(raw_filename, restricted=True)
 333             self.to_screen('Saving request to ' + filename)
 334             # Working around MAX_PATH limitation on Windows (see
 335             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 336             if os.name == 'nt':
 337                 absfilepath = os.path.abspath(filename)
 338                 if len(absfilepath) > 259:
 339                     filename = '\\\\?\\' + absfilepath
 340             with open(filename, 'wb') as outf:
 341                 outf.write(webpage_bytes)
 342
 343         try:
 344             content = webpage_bytes.decode(encoding, 'replace')
 345         except LookupError:
 346             content = webpage_bytes.decode('utf-8', 'replace')
 347
 348         if ('<title>Access to this site is blocked</title>' in content and
 349                 'Websense' in content[:512]):
 350             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 351             blocked_iframe = self._html_search_regex(
 352                 r'<iframe src="([^"]+)"', content,
 353                 'Websense information URL', default=None)
 354             if blocked_iframe:
 355                 msg += ' Visit %s for more details' % blocked_iframe
 356             raise ExtractorError(msg, expected=True)
 357
 358         return content
 359
 360     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 361         """ Returns the data of the page as a string """
 362         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 363         if res is False:
 364             return res
 365         else:
 366             content, _ = res
 367             return content
 368
 369     def _download_xml(self, url_or_request, video_id,
 370                       note='Downloading XML', errnote='Unable to download XML',
 371                       transform_source=None, fatal=True):
 372         """Return the xml as an xml.etree.ElementTree.Element"""
 373         xml_string = self._download_webpage(
 374             url_or_request, video_id, note, errnote, fatal=fatal)
 375         if xml_string is False:
 376             return xml_string
 377         if transform_source:
 378             xml_string = transform_source(xml_string)
 379         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 380
 381     def _download_json(self, url_or_request, video_id,
 382                        note='Downloading JSON metadata',
 383                        errnote='Unable to download JSON metadata',
 384                        transform_source=None,
 385                        fatal=True):
 386         json_string = self._download_webpage(
 387             url_or_request, video_id, note, errnote, fatal=fatal)
 388         if (not fatal) and json_string is False:
 389             return None
 390         if transform_source:
 391             json_string = transform_source(json_string)
 392         try:
 393             return json.loads(json_string)
 394         except ValueError as ve:
 395             errmsg = '%s: Failed to parse JSON ' % video_id
 396             if fatal:
 397                 raise ExtractorError(errmsg, cause=ve)
 398             else:
 399                 self.report_warning(errmsg + str(ve))
 400
 401     def report_warning(self, msg, video_id=None):
 402         idstr = '' if video_id is None else '%s: ' % video_id
 403         self._downloader.report_warning(
 404             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 405
 406     def to_screen(self, msg):
 407         """Print msg to screen, prefixing it with '[ie_name]'"""
 408         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 409
 410     def report_extraction(self, id_or_name):
 411         """Report information extraction."""
 412         self.to_screen('%s: Extracting information' % id_or_name)
 413
 414     def report_download_webpage(self, video_id):
 415         """Report webpage download."""
 416         self.to_screen('%s: Downloading webpage' % video_id)
 417
 418     def report_age_confirmation(self):
 419         """Report attempt to confirm age."""
 420         self.to_screen('Confirming age')
 421
 422     def report_login(self):
 423         """Report attempt to log in."""
 424         self.to_screen('Logging in')
 425
 426     # Methods for following #608
 427     @staticmethod
 428     def url_result(url, ie=None, video_id=None):
 429         """Returns a url that points to a page that should be processed"""
 430         # TODO: ie should be the class used for getting the info
 431         video_info = {'_type': 'url',
 432                       'url': url,
 433                       'ie_key': ie}
 434         if video_id is not None:
 435             video_info['id'] = video_id
 436         return video_info
 437
 438     @staticmethod
 439     def playlist_result(entries, playlist_id=None, playlist_title=None):
 440         """Returns a playlist"""
 441         video_info = {'_type': 'playlist',
 442                       'entries': entries}
 443         if playlist_id:
 444             video_info['id'] = playlist_id
 445         if playlist_title:
 446             video_info['title'] = playlist_title
 447         return video_info
 448
 449     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 450         """
 451         Perform a regex search on the given string, using a single or a list of
 452         patterns returning the first matching group.
 453         In case of failure return a default value or raise a WARNING or a
 454         RegexNotFoundError, depending on fatal, specifying the field name.
 455         """
 456         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 457             mobj = re.search(pattern, string, flags)
 458         else:
 459             for p in pattern:
 460                 mobj = re.search(p, string, flags)
 461                 if mobj:
 462                     break
 463
 464         if os.name != 'nt' and sys.stderr.isatty():
 465             _name = '\033[0;34m%s\033[0m' % name
 466         else:
 467             _name = name
 468
 469         if mobj:
 470             if group is None:
 471                 # return the first matching group
 472                 return next(g for g in mobj.groups() if g is not None)
 473             else:
 474                 return mobj.group(group)
 475         elif default is not _NO_DEFAULT:
 476             return default
 477         elif fatal:
 478             raise RegexNotFoundError('Unable to extract %s' % _name)
 479         else:
 480             self._downloader.report_warning('unable to extract %s; '
 481                                             'please report this issue on http://yt-dl.org/bug' % _name)
 482             return None
 483
 484     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 485         """
 486         Like _search_regex, but strips HTML tags and unescapes entities.
 487         """
 488         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 489         if res:
 490             return clean_html(res).strip()
 491         else:
 492             return res
 493
 494     def _get_login_info(self):
 495         """
 496         Get the the login info as (username, password)
 497         It will look in the netrc file using the _NETRC_MACHINE value
 498         If there's no info available, return (None, None)
 499         """
 500         if self._downloader is None:
 501             return (None, None)
 502
 503         username = None
 504         password = None
 505         downloader_params = self._downloader.params
 506
 507         # Attempt to use provided username and password or .netrc data
 508         if downloader_params.get('username', None) is not None:
 509             username = downloader_params['username']
 510             password = downloader_params['password']
 511         elif downloader_params.get('usenetrc', False):
 512             try:
 513                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 514                 if info is not None:
 515                     username = info[0]
 516                     password = info[2]
 517                 else:
 518                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 519             except (IOError, netrc.NetrcParseError) as err:
 520                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 521
 522         return (username, password)
 523
 524     def _get_tfa_info(self):
 525         """
 526         Get the two-factor authentication info
 527         TODO - asking the user will be required for sms/phone verify
 528         currently just uses the command line option
 529         If there's no info available, return None
 530         """
 531         if self._downloader is None:
 532             return None
 533         downloader_params = self._downloader.params
 534
 535         if downloader_params.get('twofactor', None) is not None:
 536             return downloader_params['twofactor']
 537
 538         return None
 539
 540     # Helper functions for extracting OpenGraph info
 541     @staticmethod
 542     def _og_regexes(prop):
 543         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 544         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 545         template = r'<meta[^>]+?%s[^>]+?%s'
 546         return [
 547             template % (property_re, content_re),
 548             template % (content_re, property_re),
 549         ]
 550
 551     def _og_search_property(self, prop, html, name=None, **kargs):
 552         if name is None:
 553             name = 'OpenGraph %s' % prop
 554         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 555         if escaped is None:
 556             return None
 557         return unescapeHTML(escaped)
 558
 559     def _og_search_thumbnail(self, html, **kargs):
 560         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 561
 562     def _og_search_description(self, html, **kargs):
 563         return self._og_search_property('description', html, fatal=False, **kargs)
 564
 565     def _og_search_title(self, html, **kargs):
 566         return self._og_search_property('title', html, **kargs)
 567
 568     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 569         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 570         if secure:
 571             regexes = self._og_regexes('video:secure_url') + regexes
 572         return self._html_search_regex(regexes, html, name, **kargs)
 573
 574     def _og_search_url(self, html, **kargs):
 575         return self._og_search_property('url', html, **kargs)
 576
 577     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 578         if display_name is None:
 579             display_name = name
 580         return self._html_search_regex(
 581             r'''(?ix)<meta
 582                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 583                     [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
 584             html, display_name, fatal=fatal, group='content', **kwargs)
 585
 586     def _dc_search_uploader(self, html):
 587         return self._html_search_meta('dc.creator', html, 'uploader')
 588
 589     def _rta_search(self, html):
 590         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 591         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 592                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 593                      html):
 594             return 18
 595         return 0
 596
 597     def _media_rating_search(self, html):
 598         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 599         rating = self._html_search_meta('rating', html)
 600
 601         if not rating:
 602             return None
 603
 604         RATING_TABLE = {
 605             'safe for kids': 0,
 606             'general': 8,
 607             '14 years': 14,
 608             'mature': 17,
 609             'restricted': 19,
 610         }
 611         return RATING_TABLE.get(rating.lower(), None)
 612
 613     def _twitter_search_player(self, html):
 614         return self._html_search_meta('twitter:player', html,
 615                                       'twitter card player')
 616
 617     def _sort_formats(self, formats):
 618         if not formats:
 619             raise ExtractorError('No video formats found')
 620
 621         def _formats_key(f):
 622             # TODO remove the following workaround
 623             from ..utils import determine_ext
 624             if not f.get('ext') and 'url' in f:
 625                 f['ext'] = determine_ext(f['url'])
 626
 627             preference = f.get('preference')
 628             if preference is None:
 629                 proto = f.get('protocol')
 630                 if proto is None:
 631                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 632
 633                 preference = 0 if proto in ['http', 'https'] else -0.1
 634                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 635                     preference -= 0.5
 636
 637             if f.get('vcodec') == 'none':  # audio only
 638                 if self._downloader.params.get('prefer_free_formats'):
 639                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 640                 else:
 641                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 642                 ext_preference = 0
 643                 try:
 644                     audio_ext_preference = ORDER.index(f['ext'])
 645                 except ValueError:
 646                     audio_ext_preference = -1
 647             else:
 648                 if self._downloader.params.get('prefer_free_formats'):
 649                     ORDER = ['flv', 'mp4', 'webm']
 650                 else:
 651                     ORDER = ['webm', 'flv', 'mp4']
 652                 try:
 653                     ext_preference = ORDER.index(f['ext'])
 654                 except ValueError:
 655                     ext_preference = -1
 656                 audio_ext_preference = 0
 657
 658             return (
 659                 preference,
 660                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 661                 f.get('quality') if f.get('quality') is not None else -1,
 662                 f.get('height') if f.get('height') is not None else -1,
 663                 f.get('width') if f.get('width') is not None else -1,
 664                 ext_preference,
 665                 f.get('tbr') if f.get('tbr') is not None else -1,
 666                 f.get('vbr') if f.get('vbr') is not None else -1,
 667                 f.get('abr') if f.get('abr') is not None else -1,
 668                 audio_ext_preference,
 669                 f.get('fps') if f.get('fps') is not None else -1,
 670                 f.get('filesize') if f.get('filesize') is not None else -1,
 671                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 672                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 673                 f.get('format_id'),
 674             )
 675         formats.sort(key=_formats_key)
 676
 677     def http_scheme(self):
 678         """ Either "http:" or "https:", depending on the user's preferences """
 679         return (
 680             'http:'
 681             if self._downloader.params.get('prefer_insecure', False)
 682             else 'https:')
 683
 684     def _proto_relative_url(self, url, scheme=None):
 685         if url is None:
 686             return url
 687         if url.startswith('//'):
 688             if scheme is None:
 689                 scheme = self.http_scheme()
 690             return scheme + url
 691         else:
 692             return url
 693
 694     def _sleep(self, timeout, video_id, msg_template=None):
 695         if msg_template is None:
 696             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 697         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 698         self.to_screen(msg)
 699         time.sleep(timeout)
 700
 701     def _extract_f4m_formats(self, manifest_url, video_id):
 702         manifest = self._download_xml(
 703             manifest_url, video_id, 'Downloading f4m manifest',
 704             'Unable to download f4m manifest')
 705
 706         formats = []
 707         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 708         for i, media_el in enumerate(media_nodes):
 709             tbr = int_or_none(media_el.attrib.get('bitrate'))
 710             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 711             formats.append({
 712                 'format_id': format_id,
 713                 'url': manifest_url,
 714                 'ext': 'flv',
 715                 'tbr': tbr,
 716                 'width': int_or_none(media_el.attrib.get('width')),
 717                 'height': int_or_none(media_el.attrib.get('height')),
 718             })
 719         self._sort_formats(formats)
 720
 721         return formats
 722
 723     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 724                               entry_protocol='m3u8', preference=None):
 725
 726         formats = [{
 727             'format_id': 'm3u8-meta',
 728             'url': m3u8_url,
 729             'ext': ext,
 730             'protocol': 'm3u8',
 731             'preference': -1,
 732             'resolution': 'multiple',
 733             'format_note': 'Quality selection URL',
 734         }]
 735
 736         format_url = lambda u: (
 737             u
 738             if re.match(r'^https?://', u)
 739             else compat_urlparse.urljoin(m3u8_url, u))
 740
 741         m3u8_doc = self._download_webpage(
 742             m3u8_url, video_id,
 743             note='Downloading m3u8 information',
 744             errnote='Failed to download m3u8 information')
 745         last_info = None
 746         kv_rex = re.compile(
 747             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 748         for line in m3u8_doc.splitlines():
 749             if line.startswith('#EXT-X-STREAM-INF:'):
 750                 last_info = {}
 751                 for m in kv_rex.finditer(line):
 752                     v = m.group('val')
 753                     if v.startswith('"'):
 754                         v = v[1:-1]
 755                     last_info[m.group('key')] = v
 756             elif line.startswith('#') or not line.strip():
 757                 continue
 758             else:
 759                 if last_info is None:
 760                     formats.append({'url': format_url(line)})
 761                     continue
 762                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 763
 764                 f = {
 765                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 766                     'url': format_url(line.strip()),
 767                     'tbr': tbr,
 768                     'ext': ext,
 769                     'protocol': entry_protocol,
 770                     'preference': preference,
 771                 }
 772                 codecs = last_info.get('CODECS')
 773                 if codecs:
 774                     # TODO: looks like video codec is not always necessarily goes first
 775                     va_codecs = codecs.split(',')
 776                     if va_codecs[0]:
 777                         f['vcodec'] = va_codecs[0].partition('.')[0]
 778                     if len(va_codecs) > 1 and va_codecs[1]:
 779                         f['acodec'] = va_codecs[1].partition('.')[0]
 780                 resolution = last_info.get('RESOLUTION')
 781                 if resolution:
 782                     width_str, height_str = resolution.split('x')
 783                     f['width'] = int(width_str)
 784                     f['height'] = int(height_str)
 785                 formats.append(f)
 786                 last_info = {}
 787         self._sort_formats(formats)
 788         return formats
 789
 790     def _live_title(self, name):
 791         """ Generate the title for a live video """
 792         now = datetime.datetime.now()
 793         now_str = now.strftime("%Y-%m-%d %H:%M")
 794         return name + ' ' + now_str
 795
 796     def _int(self, v, name, fatal=False, **kwargs):
 797         res = int_or_none(v, **kwargs)
 798         if 'get_attr' in kwargs:
 799             print(getattr(v, kwargs['get_attr']))
 800         if res is None:
 801             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 802             if fatal:
 803                 raise ExtractorError(msg)
 804             else:
 805                 self._downloader.report_warning(msg)
 806         return res
 807
 808     def _float(self, v, name, fatal=False, **kwargs):
 809         res = float_or_none(v, **kwargs)
 810         if res is None:
 811             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 812             if fatal:
 813                 raise ExtractorError(msg)
 814             else:
 815                 self._downloader.report_warning(msg)
 816         return res
 817
 818
 819 class SearchInfoExtractor(InfoExtractor):
 820     """
 821     Base class for paged search queries extractors.
 822     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 823     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 824     """
 825
 826     @classmethod
 827     def _make_valid_url(cls):
 828         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 829
 830     @classmethod
 831     def suitable(cls, url):
 832         return re.match(cls._make_valid_url(), url) is not None
 833
 834     def _real_extract(self, query):
 835         mobj = re.match(self._make_valid_url(), query)
 836         if mobj is None:
 837             raise ExtractorError('Invalid search query "%s"' % query)
 838
 839         prefix = mobj.group('prefix')
 840         query = mobj.group('query')
 841         if prefix == '':
 842             return self._get_n_results(query, 1)
 843         elif prefix == 'all':
 844             return self._get_n_results(query, self._MAX_RESULTS)
 845         else:
 846             n = int(prefix)
 847             if n <= 0:
 848                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 849             elif n > self._MAX_RESULTS:
 850                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 851                 n = self._MAX_RESULTS
 852             return self._get_n_results(query, n)
 853
 854     def _get_n_results(self, query, n):
 855         """Get a specified number of results for a query"""
 856         raise NotImplementedError("This method must be implemented by subclasses")
 857
 858     @property
 859     def SEARCH_KEY(self):
 860         return self._SEARCH_KEY