youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_HTTPError,
  18     compat_http_client,
  19     compat_urllib_error,
  20     compat_urllib_parse_urlparse,
  21     compat_urlparse,
  22     compat_str,
  23 )
  24 from ..utils import (
  25     age_restricted,
  26     clean_html,
  27     compiled_regex_type,
  28     ExtractorError,
  29     float_or_none,
  30     HEADRequest,
  31     int_or_none,
  32     RegexNotFoundError,
  33     sanitize_filename,
  34     unescapeHTML,
  35 )
# Sentinel used as a default argument value so that "no default supplied"
# can be told apart from an explicit default of None.
_NO_DEFAULT = object()
  37
  38
  39 class InfoExtractor(object):
  40     """Information Extractor class.
  41
  42     Information extractors are the classes that, given a URL, extract
  43     information about the video (or videos) the URL refers to. This
  44     information includes the real video URL, the video title, author and
  45     others. The information is stored in a dictionary which is then
  46     passed to the YoutubeDL. The YoutubeDL processes this
  47     information possibly downloading the video to the file system, among
  48     other possible outcomes.
  49
  50     The type field determines the type of the result.
  51     By far the most common value (and the default if _type is missing) is
  52     "video", which indicates a single video.
  53
  54     For a video, the dictionaries must include the following fields:
  55
  56     id:             Video identifier.
  57     title:          Video title, unescaped.
  58
  59     Additionally, it must contain either a formats entry or a url one:
  60
  61     formats:        A list of dictionaries for each format available, ordered
  62                     from worst to best quality.
  63
  64                     Potential fields:
  65                     * url        Mandatory. The URL of the video file
  66                     * ext        Will be calculated from url if missing
  67                     * format     A human-readable description of the format
  68                                  ("mp4 container with h264/opus").
  69                                  Calculated from the format_id, width, height,
  70                                  and format_note fields if missing.
  71                     * format_id  A short description of the format
  72                                  ("mp4_h264_opus" or "19").
  73                                 Technically optional, but strongly recommended.
  74                     * format_note Additional info about the format
  75                                  ("3D" or "DASH video")
  76                     * width      Width of the video, if known
  77                     * height     Height of the video, if known
  78                     * resolution Textual description of width and height
  79                     * tbr        Average bitrate of audio and video in KBit/s
  80                     * abr        Average audio bitrate in KBit/s
  81                     * acodec     Name of the audio codec in use
  82                     * asr        Audio sampling rate in Hertz
  83                     * vbr        Average video bitrate in KBit/s
  84                     * fps        Frame rate
  85                     * vcodec     Name of the video codec in use
  86                     * container  Name of the container format
  87                     * filesize   The number of bytes, if known in advance
  88                     * filesize_approx  An estimate for the number of bytes
  89                     * player_url SWF Player URL (used for rtmpdump).
  90                     * protocol   The protocol that will be used for the actual
  91                                  download, lower-case.
  92                                  "http", "https", "rtsp", "rtmp", "rtmpe",
  93                                  "m3u8", or "m3u8_native".
  94                     * preference Order number of this format. If this field is
  95                                  present and not None, the formats get sorted
  96                                  by this field, regardless of all other values.
  97                                  -1 for default (order by other properties),
  98                                  -2 or smaller for less than default.
  99                                  < -1000 to hide the format (if there is
 100                                     another one which is strictly better)
 101                     * language_preference  Is this in the correct requested
 102                                  language?
 103                                  10 if it's what the URL is about,
 104                                  -1 for default (don't know),
 105                                  -10 otherwise, other values reserved for now.
 106                     * quality    Order number of the video quality of this
 107                                  format, irrespective of the file format.
 108                                  -1 for default (order by other properties),
 109                                  -2 or smaller for less than default.
 110                     * source_preference  Order number for this video source
 111                                   (quality takes higher priority)
 112                                  -1 for default (order by other properties),
 113                                  -2 or smaller for less than default.
 114                     * http_method  HTTP method to use for the download.
 115                     * http_headers  A dictionary of additional HTTP headers
 116                                  to add to the request.
 117                     * http_post_data  Additional data to send with a POST
 118                                  request.
 119                     * stretched_ratio  If given and not 1, indicates that the
 120                                  video's pixels are not square.
 121                                  width : height ratio as float.
 122                     * no_resume  The server does not support resuming the
 123                                  (HTTP or RTMP) download. Boolean.
 124
 125     url:            Final video URL.
 126     ext:            Video filename extension.
 127     format:         The video format, defaults to ext (used for --get-format)
 128     player_url:     SWF Player URL (used for rtmpdump).
 129
 130     The following fields are optional:
 131
 132     alt_title:      A secondary title of the video.
 133     display_id      An alternative identifier for the video, not necessarily
 134                     unique, but available before title. Typically, id is
 135                     something like "4234987", title "Dancing naked mole rats",
 136                     and display_id "dancing-naked-mole-rats"
 137     thumbnails:     A list of dictionaries, with the following entries:
 138                         * "id" (optional, string) - Thumbnail format ID
 139                         * "url"
 140                         * "preference" (optional, int) - quality of the image
 141                         * "width" (optional, int)
 142                         * "height" (optional, int)
  143                         * "resolution" (optional, string "{width}x{height}",
 144                                         deprecated)
 145     thumbnail:      Full URL to a video thumbnail image.
 146     description:    Full video description.
 147     uploader:       Full name of the video uploader.
 148     timestamp:      UNIX timestamp of the moment the video became available.
 149     upload_date:    Video upload date (YYYYMMDD).
 150                     If not explicitly set, calculated from timestamp.
 151     uploader_id:    Nickname or id of the video uploader.
 152     location:       Physical location where the video was filmed.
 153     subtitles:      The subtitle file contents as a dictionary in the format
 154                     {language: subtitles}.
 155     duration:       Length of the video in seconds, as an integer.
 156     view_count:     How many users have watched the video on the platform.
 157     like_count:     Number of positive ratings of the video
 158     dislike_count:  Number of negative ratings of the video
 159     comment_count:  Number of comments on the video
 160     comments:       A list of comments, each with one or more of the following
 161                     properties (all but one of text or html optional):
 162                         * "author" - human-readable name of the comment author
 163                         * "author_id" - user ID of the comment author
 164                         * "id" - Comment ID
 165                         * "html" - Comment as HTML
 166                         * "text" - Plain text of the comment
 167                         * "timestamp" - UNIX timestamp of comment
 168                         * "parent" - ID of the comment this one is replying to.
 169                                      Set to "root" to indicate that this is a
 170                                      comment to the original video.
 171     age_limit:      Age restriction for the video, as an integer (years)
 172     webpage_url:    The url to the video webpage, if given to youtube-dl it
 173                     should allow to get the same result again. (It will be set
 174                     by YoutubeDL if it's missing)
 175     categories:     A list of categories that the video falls in, for example
 176                     ["Sports", "Berlin"]
 177     is_live:        True, False, or None (=unknown). Whether this video is a
 178                     live stream that goes on instead of a fixed-length video.
 179
 180     Unless mentioned otherwise, the fields should be Unicode strings.
 181
 182     Unless mentioned otherwise, None is equivalent to absence of information.
 183
 184
 185     _type "playlist" indicates multiple videos.
 186     There must be a key "entries", which is a list, an iterable, or a PagedList
 187     object, each element of which is a valid dictionary by this specification.
 188
 189     Additionally, playlists can have "title" and "id" attributes with the same
 190     semantics as videos (see above).
 191
 192
 193     _type "multi_video" indicates that there are multiple videos that
  194     form a single show, for example multiple acts of an opera or TV episode.
 195     It must have an entries key like a playlist and contain all the keys
 196     required for a video at the same time.
 197
 198
 199     _type "url" indicates that the video must be extracted from another
 200     location, possibly by a different extractor. Its only required key is:
 201     "url" - the next URL to extract.
 202     The key "ie_key" can be set to the class name (minus the trailing "IE",
 203     e.g. "Youtube") if the extractor class is known in advance.
 204     Additionally, the dictionary may have any properties of the resolved entity
 205     known in advance, for example "title" if the title of the referred video is
 206     known ahead of time.
 207
 208
 209     _type "url_transparent" entities have the same specification as "url", but
 210     indicate that the given additional information is more precise than the one
 211     associated with the resolved URL.
 212     This is useful when a site employs a video service that hosts the video and
 213     its technical metadata, but that video service does not embed a useful
 214     title, description etc.
 215
 216
 217     Subclasses of this one should re-define the _real_initialize() and
 218     _real_extract() methods and define a _VALID_URL regexp.
 219     Probably, they should also be added to the list of extractors.
 220
 221     Finally, the _WORKING attribute should be set to False for broken IEs
 222     in order to warn the users and skip the tests.
 223     """
 224
 225     _ready = False
 226     _downloader = None
 227     _WORKING = True
 228
 229     def __init__(self, downloader=None):
 230         """Constructor. Receives an optional downloader."""
 231         self._ready = False
 232         self.set_downloader(downloader)
 233
 234     @classmethod
 235     def suitable(cls, url):
 236         """Receives a URL and returns True if suitable for this IE."""
 237
 238         # This does not use has/getattr intentionally - we want to know whether
 239         # we have cached the regexp for *this* class, whereas getattr would also
 240         # match the superclass
 241         if '_VALID_URL_RE' not in cls.__dict__:
 242             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 243         return cls._VALID_URL_RE.match(url) is not None
 244
 245     @classmethod
 246     def _match_id(cls, url):
 247         if '_VALID_URL_RE' not in cls.__dict__:
 248             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 249         m = cls._VALID_URL_RE.match(url)
 250         assert m
 251         return m.group('id')
 252
 253     @classmethod
 254     def working(cls):
 255         """Getter method for _WORKING."""
 256         return cls._WORKING
 257
 258     def initialize(self):
 259         """Initializes an instance (authentication, etc)."""
 260         if not self._ready:
 261             self._real_initialize()
 262             self._ready = True
 263
 264     def extract(self, url):
 265         """Extracts URL information and returns it in list of dicts."""
 266         self.initialize()
 267         return self._real_extract(url)
 268
 269     def set_downloader(self, downloader):
 270         """Sets the downloader for this IE."""
 271         self._downloader = downloader
 272
 273     def _real_initialize(self):
 274         """Real initialization process. Redefine in subclasses."""
 275         pass
 276
 277     def _real_extract(self, url):
 278         """Real extraction process. Redefine in subclasses."""
 279         pass
 280
 281     @classmethod
 282     def ie_key(cls):
 283         """A string for getting the InfoExtractor with get_info_extractor"""
 284         return cls.__name__[:-2]
 285
 286     @property
 287     def IE_NAME(self):
 288         return type(self).__name__[:-2]
 289
 290     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 291         """ Returns the response handle """
 292         if note is None:
 293             self.report_download_webpage(video_id)
 294         elif note is not False:
 295             if video_id is None:
 296                 self.to_screen('%s' % (note,))
 297             else:
 298                 self.to_screen('%s: %s' % (video_id, note))
 299         try:
 300             return self._downloader.urlopen(url_or_request)
 301         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 302             if errnote is False:
 303                 return False
 304             if errnote is None:
 305                 errnote = 'Unable to download webpage'
 306             errmsg = '%s: %s' % (errnote, compat_str(err))
 307             if fatal:
 308                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 309             else:
 310                 self._downloader.report_warning(errmsg)
 311                 return False
 312
 313     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 314         """ Returns a tuple (page content as string, URL handle) """
 315         # Strip hashes from the URL (#1038)
 316         if isinstance(url_or_request, (compat_str, str)):
 317             url_or_request = url_or_request.partition('#')[0]
 318
 319         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 320         if urlh is False:
 321             assert not fatal
 322             return False
 323         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 324         return (content, urlh)
 325
 326     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
 327         content_type = urlh.headers.get('Content-Type', '')
 328         webpage_bytes = urlh.read()
 329         if prefix is not None:
 330             webpage_bytes = prefix + webpage_bytes
 331         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 332         if m:
 333             encoding = m.group(1)
 334         else:
 335             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 336                           webpage_bytes[:1024])
 337             if m:
 338                 encoding = m.group(1).decode('ascii')
 339             elif webpage_bytes.startswith(b'\xff\xfe'):
 340                 encoding = 'utf-16'
 341             else:
 342                 encoding = 'utf-8'
 343         if self._downloader.params.get('dump_intermediate_pages', False):
 344             try:
 345                 url = url_or_request.get_full_url()
 346             except AttributeError:
 347                 url = url_or_request
 348             self.to_screen('Dumping request to ' + url)
 349             dump = base64.b64encode(webpage_bytes).decode('ascii')
 350             self._downloader.to_screen(dump)
 351         if self._downloader.params.get('write_pages', False):
 352             try:
 353                 url = url_or_request.get_full_url()
 354             except AttributeError:
 355                 url = url_or_request
 356             basen = '%s_%s' % (video_id, url)
 357             if len(basen) > 240:
 358                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 359                 basen = basen[:240 - len(h)] + h
 360             raw_filename = basen + '.dump'
 361             filename = sanitize_filename(raw_filename, restricted=True)
 362             self.to_screen('Saving request to ' + filename)
 363             # Working around MAX_PATH limitation on Windows (see
 364             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 365             if os.name == 'nt':
 366                 absfilepath = os.path.abspath(filename)
 367                 if len(absfilepath) > 259:
 368                     filename = '\\\\?\\' + absfilepath
 369             with open(filename, 'wb') as outf:
 370                 outf.write(webpage_bytes)
 371
 372         try:
 373             content = webpage_bytes.decode(encoding, 'replace')
 374         except LookupError:
 375             content = webpage_bytes.decode('utf-8', 'replace')
 376
 377         if ('<title>Access to this site is blocked</title>' in content and
 378                 'Websense' in content[:512]):
 379             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 380             blocked_iframe = self._html_search_regex(
 381                 r'<iframe src="([^"]+)"', content,
 382                 'Websense information URL', default=None)
 383             if blocked_iframe:
 384                 msg += ' Visit %s for more details' % blocked_iframe
 385             raise ExtractorError(msg, expected=True)
 386
 387         return content
 388
 389     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
 390         """ Returns the data of the page as a string """
 391         success = False
 392         try_count = 0
 393         while success is False:
 394             try:
 395                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 396                 success = True
 397             except compat_http_client.IncompleteRead as e:
 398                 try_count += 1
 399                 if try_count >= tries:
 400                     raise e
 401                 self._sleep(timeout, video_id)
 402         if res is False:
 403             return res
 404         else:
 405             content, _ = res
 406             return content
 407
 408     def _download_xml(self, url_or_request, video_id,
 409                       note='Downloading XML', errnote='Unable to download XML',
 410                       transform_source=None, fatal=True):
 411         """Return the xml as an xml.etree.ElementTree.Element"""
 412         xml_string = self._download_webpage(
 413             url_or_request, video_id, note, errnote, fatal=fatal)
 414         if xml_string is False:
 415             return xml_string
 416         if transform_source:
 417             xml_string = transform_source(xml_string)
 418         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 419
 420     def _download_json(self, url_or_request, video_id,
 421                        note='Downloading JSON metadata',
 422                        errnote='Unable to download JSON metadata',
 423                        transform_source=None,
 424                        fatal=True):
 425         json_string = self._download_webpage(
 426             url_or_request, video_id, note, errnote, fatal=fatal)
 427         if (not fatal) and json_string is False:
 428             return None
 429         return self._parse_json(
 430             json_string, video_id, transform_source=transform_source, fatal=fatal)
 431
 432     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 433         if transform_source:
 434             json_string = transform_source(json_string)
 435         try:
 436             return json.loads(json_string)
 437         except ValueError as ve:
 438             errmsg = '%s: Failed to parse JSON ' % video_id
 439             if fatal:
 440                 raise ExtractorError(errmsg, cause=ve)
 441             else:
 442                 self.report_warning(errmsg + str(ve))
 443
 444     def report_warning(self, msg, video_id=None):
 445         idstr = '' if video_id is None else '%s: ' % video_id
 446         self._downloader.report_warning(
 447             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 448
 449     def to_screen(self, msg):
 450         """Print msg to screen, prefixing it with '[ie_name]'"""
 451         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 452
 453     def report_extraction(self, id_or_name):
 454         """Report information extraction."""
 455         self.to_screen('%s: Extracting information' % id_or_name)
 456
 457     def report_download_webpage(self, video_id):
 458         """Report webpage download."""
 459         self.to_screen('%s: Downloading webpage' % video_id)
 460
 461     def report_age_confirmation(self):
 462         """Report attempt to confirm age."""
 463         self.to_screen('Confirming age')
 464
 465     def report_login(self):
 466         """Report attempt to log in."""
 467         self.to_screen('Logging in')
 468
 469     # Methods for following #608
 470     @staticmethod
 471     def url_result(url, ie=None, video_id=None):
 472         """Returns a url that points to a page that should be processed"""
 473         # TODO: ie should be the class used for getting the info
 474         video_info = {'_type': 'url',
 475                       'url': url,
 476                       'ie_key': ie}
 477         if video_id is not None:
 478             video_info['id'] = video_id
 479         return video_info
 480
 481     @staticmethod
 482     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 483         """Returns a playlist"""
 484         video_info = {'_type': 'playlist',
 485                       'entries': entries}
 486         if playlist_id:
 487             video_info['id'] = playlist_id
 488         if playlist_title:
 489             video_info['title'] = playlist_title
 490         if playlist_description:
 491             video_info['description'] = playlist_description
 492         return video_info
 493
 494     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 495         """
 496         Perform a regex search on the given string, using a single or a list of
 497         patterns returning the first matching group.
 498         In case of failure return a default value or raise a WARNING or a
 499         RegexNotFoundError, depending on fatal, specifying the field name.
 500         """
 501         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 502             mobj = re.search(pattern, string, flags)
 503         else:
 504             for p in pattern:
 505                 mobj = re.search(p, string, flags)
 506                 if mobj:
 507                     break
 508
 509         if os.name != 'nt' and sys.stderr.isatty():
 510             _name = '\033[0;34m%s\033[0m' % name
 511         else:
 512             _name = name
 513
 514         if mobj:
 515             if group is None:
 516                 # return the first matching group
 517                 return next(g for g in mobj.groups() if g is not None)
 518             else:
 519                 return mobj.group(group)
 520         elif default is not _NO_DEFAULT:
 521             return default
 522         elif fatal:
 523             raise RegexNotFoundError('Unable to extract %s' % _name)
 524         else:
 525             self._downloader.report_warning('unable to extract %s; '
 526                                             'please report this issue on http://yt-dl.org/bug' % _name)
 527             return None
 528
 529     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 530         """
 531         Like _search_regex, but strips HTML tags and unescapes entities.
 532         """
 533         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 534         if res:
 535             return clean_html(res).strip()
 536         else:
 537             return res
 538
 539     def _get_login_info(self):
 540         """
 541         Get the the login info as (username, password)
 542         It will look in the netrc file using the _NETRC_MACHINE value
 543         If there's no info available, return (None, None)
 544         """
 545         if self._downloader is None:
 546             return (None, None)
 547
 548         username = None
 549         password = None
 550         downloader_params = self._downloader.params
 551
 552         # Attempt to use provided username and password or .netrc data
 553         if downloader_params.get('username', None) is not None:
 554             username = downloader_params['username']
 555             password = downloader_params['password']
 556         elif downloader_params.get('usenetrc', False):
 557             try:
 558                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 559                 if info is not None:
 560                     username = info[0]
 561                     password = info[2]
 562                 else:
 563                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 564             except (IOError, netrc.NetrcParseError) as err:
 565                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 566
 567         return (username, password)
 568
 569     def _get_tfa_info(self):
 570         """
 571         Get the two-factor authentication info
 572         TODO - asking the user will be required for sms/phone verify
 573         currently just uses the command line option
 574         If there's no info available, return None
 575         """
 576         if self._downloader is None:
 577             return None
 578         downloader_params = self._downloader.params
 579
 580         if downloader_params.get('twofactor', None) is not None:
 581             return downloader_params['twofactor']
 582
 583         return None
 584
 585     # Helper functions for extracting OpenGraph info
 586     @staticmethod
 587     def _og_regexes(prop):
 588         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 589         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 590         template = r'<meta[^>]+?%s[^>]+?%s'
 591         return [
 592             template % (property_re, content_re),
 593             template % (content_re, property_re),
 594         ]
 595
 596     def _og_search_property(self, prop, html, name=None, **kargs):
 597         if name is None:
 598             name = 'OpenGraph %s' % prop
 599         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 600         if escaped is None:
 601             return None
 602         return unescapeHTML(escaped)
 603
 604     def _og_search_thumbnail(self, html, **kargs):
 605         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 606
 607     def _og_search_description(self, html, **kargs):
 608         return self._og_search_property('description', html, fatal=False, **kargs)
 609
 610     def _og_search_title(self, html, **kargs):
 611         return self._og_search_property('title', html, **kargs)
 612
 613     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 614         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 615         if secure:
 616             regexes = self._og_regexes('video:secure_url') + regexes
 617         return self._html_search_regex(regexes, html, name, **kargs)
 618
 619     def _og_search_url(self, html, **kargs):
 620         return self._og_search_property('url', html, **kargs)
 621
 622     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 623         if display_name is None:
 624             display_name = name
 625         return self._html_search_regex(
 626             r'''(?isx)<meta
 627                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 628                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
 629             html, display_name, fatal=fatal, group='content', **kwargs)
 630
 631     def _dc_search_uploader(self, html):
 632         return self._html_search_meta('dc.creator', html, 'uploader')
 633
 634     def _rta_search(self, html):
 635         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 636         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 637                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 638                      html):
 639             return 18
 640         return 0
 641
 642     def _media_rating_search(self, html):
 643         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 644         rating = self._html_search_meta('rating', html)
 645
 646         if not rating:
 647             return None
 648
 649         RATING_TABLE = {
 650             'safe for kids': 0,
 651             'general': 8,
 652             '14 years': 14,
 653             'mature': 17,
 654             'restricted': 19,
 655         }
 656         return RATING_TABLE.get(rating.lower(), None)
 657
 658     def _twitter_search_player(self, html):
 659         return self._html_search_meta('twitter:player', html,
 660                                       'twitter card player')
 661
 662     def _sort_formats(self, formats):
 663         if not formats:
 664             raise ExtractorError('No video formats found')
 665
 666         def _formats_key(f):
 667             # TODO remove the following workaround
 668             from ..utils import determine_ext
 669             if not f.get('ext') and 'url' in f:
 670                 f['ext'] = determine_ext(f['url'])
 671
 672             preference = f.get('preference')
 673             if preference is None:
 674                 proto = f.get('protocol')
 675                 if proto is None:
 676                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 677
 678                 preference = 0 if proto in ['http', 'https'] else -0.1
 679                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 680                     preference -= 0.5
 681
 682             if f.get('vcodec') == 'none':  # audio only
 683                 if self._downloader.params.get('prefer_free_formats'):
 684                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 685                 else:
 686                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 687                 ext_preference = 0
 688                 try:
 689                     audio_ext_preference = ORDER.index(f['ext'])
 690                 except ValueError:
 691                     audio_ext_preference = -1
 692             else:
 693                 if self._downloader.params.get('prefer_free_formats'):
 694                     ORDER = ['flv', 'mp4', 'webm']
 695                 else:
 696                     ORDER = ['webm', 'flv', 'mp4']
 697                 try:
 698                     ext_preference = ORDER.index(f['ext'])
 699                 except ValueError:
 700                     ext_preference = -1
 701                 audio_ext_preference = 0
 702
 703             return (
 704                 preference,
 705                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 706                 f.get('quality') if f.get('quality') is not None else -1,
 707                 f.get('height') if f.get('height') is not None else -1,
 708                 f.get('width') if f.get('width') is not None else -1,
 709                 ext_preference,
 710                 f.get('tbr') if f.get('tbr') is not None else -1,
 711                 f.get('vbr') if f.get('vbr') is not None else -1,
 712                 f.get('abr') if f.get('abr') is not None else -1,
 713                 audio_ext_preference,
 714                 f.get('fps') if f.get('fps') is not None else -1,
 715                 f.get('filesize') if f.get('filesize') is not None else -1,
 716                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 717                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 718                 f.get('format_id'),
 719             )
 720         formats.sort(key=_formats_key)
 721
 722     def _check_formats(self, formats, video_id):
 723         if formats:
 724             formats[:] = filter(
 725                 lambda f: self._is_valid_url(
 726                     f['url'], video_id,
 727                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 728                 formats)
 729
 730     def _is_valid_url(self, url, video_id, item='video'):
 731         try:
 732             self._request_webpage(
 733                 HEADRequest(url), video_id,
 734                 'Checking %s URL' % item)
 735             return True
 736         except ExtractorError as e:
 737             if isinstance(e.cause, compat_HTTPError):
 738                 self.report_warning(
 739                     '%s URL is invalid, skipping' % item, video_id)
 740                 return False
 741             raise
 742
 743     def http_scheme(self):
 744         """ Either "http:" or "https:", depending on the user's preferences """
 745         return (
 746             'http:'
 747             if self._downloader.params.get('prefer_insecure', False)
 748             else 'https:')
 749
 750     def _proto_relative_url(self, url, scheme=None):
 751         if url is None:
 752             return url
 753         if url.startswith('//'):
 754             if scheme is None:
 755                 scheme = self.http_scheme()
 756             return scheme + url
 757         else:
 758             return url
 759
 760     def _sleep(self, timeout, video_id, msg_template=None):
 761         if msg_template is None:
 762             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 763         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 764         self.to_screen(msg)
 765         time.sleep(timeout)
 766
 767     def _extract_f4m_formats(self, manifest_url, video_id):
 768         manifest = self._download_xml(
 769             manifest_url, video_id, 'Downloading f4m manifest',
 770             'Unable to download f4m manifest')
 771
 772         formats = []
 773         manifest_version = '1.0'
 774         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 775         if not media_nodes:
 776             manifest_version = '2.0'
 777             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 778         for i, media_el in enumerate(media_nodes):
 779             if manifest_version == '2.0':
 780                 manifest_url = '/'.join(manifest_url.split('/')[:-1]) + '/' + media_el.attrib.get('href')
 781             tbr = int_or_none(media_el.attrib.get('bitrate'))
 782             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 783             formats.append({
 784                 'format_id': format_id,
 785                 'url': manifest_url,
 786                 'ext': 'flv',
 787                 'tbr': tbr,
 788                 'width': int_or_none(media_el.attrib.get('width')),
 789                 'height': int_or_none(media_el.attrib.get('height')),
 790             })
 791         self._sort_formats(formats)
 792
 793         return formats
 794
 795     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 796                               entry_protocol='m3u8', preference=None):
 797
 798         formats = [{
 799             'format_id': 'm3u8-meta',
 800             'url': m3u8_url,
 801             'ext': ext,
 802             'protocol': 'm3u8',
 803             'preference': -1,
 804             'resolution': 'multiple',
 805             'format_note': 'Quality selection URL',
 806         }]
 807
 808         format_url = lambda u: (
 809             u
 810             if re.match(r'^https?://', u)
 811             else compat_urlparse.urljoin(m3u8_url, u))
 812
 813         m3u8_doc = self._download_webpage(
 814             m3u8_url, video_id,
 815             note='Downloading m3u8 information',
 816             errnote='Failed to download m3u8 information')
 817         last_info = None
 818         kv_rex = re.compile(
 819             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 820         for line in m3u8_doc.splitlines():
 821             if line.startswith('#EXT-X-STREAM-INF:'):
 822                 last_info = {}
 823                 for m in kv_rex.finditer(line):
 824                     v = m.group('val')
 825                     if v.startswith('"'):
 826                         v = v[1:-1]
 827                     last_info[m.group('key')] = v
 828             elif line.startswith('#') or not line.strip():
 829                 continue
 830             else:
 831                 if last_info is None:
 832                     formats.append({'url': format_url(line)})
 833                     continue
 834                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 835
 836                 f = {
 837                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 838                     'url': format_url(line.strip()),
 839                     'tbr': tbr,
 840                     'ext': ext,
 841                     'protocol': entry_protocol,
 842                     'preference': preference,
 843                 }
 844                 codecs = last_info.get('CODECS')
 845                 if codecs:
 846                     # TODO: looks like video codec is not always necessarily goes first
 847                     va_codecs = codecs.split(',')
 848                     if va_codecs[0]:
 849                         f['vcodec'] = va_codecs[0].partition('.')[0]
 850                     if len(va_codecs) > 1 and va_codecs[1]:
 851                         f['acodec'] = va_codecs[1].partition('.')[0]
 852                 resolution = last_info.get('RESOLUTION')
 853                 if resolution:
 854                     width_str, height_str = resolution.split('x')
 855                     f['width'] = int(width_str)
 856                     f['height'] = int(height_str)
 857                 formats.append(f)
 858                 last_info = {}
 859         self._sort_formats(formats)
 860         return formats
 861
 862     # TODO: improve extraction
 863     def _extract_smil_formats(self, smil_url, video_id):
 864         smil = self._download_xml(
 865             smil_url, video_id, 'Downloading SMIL file',
 866             'Unable to download SMIL file')
 867
 868         base = smil.find('./head/meta').get('base')
 869
 870         formats = []
 871         rtmp_count = 0
 872         for video in smil.findall('./body/switch/video'):
 873             src = video.get('src')
 874             if not src:
 875                 continue
 876             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
 877             width = int_or_none(video.get('width'))
 878             height = int_or_none(video.get('height'))
 879             proto = video.get('proto')
 880             if not proto:
 881                 if base:
 882                     if base.startswith('rtmp'):
 883                         proto = 'rtmp'
 884                     elif base.startswith('http'):
 885                         proto = 'http'
 886             ext = video.get('ext')
 887             if proto == 'm3u8':
 888                 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
 889             elif proto == 'rtmp':
 890                 rtmp_count += 1
 891                 streamer = video.get('streamer') or base
 892                 formats.append({
 893                     'url': streamer,
 894                     'play_path': src,
 895                     'ext': 'flv',
 896                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
 897                     'tbr': bitrate,
 898                     'width': width,
 899                     'height': height,
 900                 })
 901         self._sort_formats(formats)
 902
 903         return formats
 904
 905     def _live_title(self, name):
 906         """ Generate the title for a live video """
 907         now = datetime.datetime.now()
 908         now_str = now.strftime("%Y-%m-%d %H:%M")
 909         return name + ' ' + now_str
 910
 911     def _int(self, v, name, fatal=False, **kwargs):
 912         res = int_or_none(v, **kwargs)
 913         if 'get_attr' in kwargs:
 914             print(getattr(v, kwargs['get_attr']))
 915         if res is None:
 916             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 917             if fatal:
 918                 raise ExtractorError(msg)
 919             else:
 920                 self._downloader.report_warning(msg)
 921         return res
 922
 923     def _float(self, v, name, fatal=False, **kwargs):
 924         res = float_or_none(v, **kwargs)
 925         if res is None:
 926             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 927             if fatal:
 928                 raise ExtractorError(msg)
 929             else:
 930                 self._downloader.report_warning(msg)
 931         return res
 932
 933     def _set_cookie(self, domain, name, value, expire_time=None):
 934         cookie = compat_cookiejar.Cookie(
 935             0, name, value, None, None, domain, None,
 936             None, '/', True, False, expire_time, '', None, None, None)
 937         self._downloader.cookiejar.set_cookie(cookie)
 938
 939     def get_testcases(self, include_onlymatching=False):
 940         t = getattr(self, '_TEST', None)
 941         if t:
 942             assert not hasattr(self, '_TESTS'), \
 943                 '%s has _TEST and _TESTS' % type(self).__name__
 944             tests = [t]
 945         else:
 946             tests = getattr(self, '_TESTS', [])
 947         for t in tests:
 948             if not include_onlymatching and t.get('only_matching', False):
 949                 continue
 950             t['name'] = type(self).__name__[:-len('IE')]
 951             yield t
 952
 953     def is_suitable(self, age_limit):
 954         """ Test whether the extractor is generally suitable for the given
 955         age limit (i.e. pornographic sites are not, all others usually are) """
 956
 957         any_restricted = False
 958         for tc in self.get_testcases(include_onlymatching=False):
 959             if 'playlist' in tc:
 960                 tc = tc['playlist'][0]
 961             is_restricted = age_restricted(
 962                 tc.get('info_dict', {}).get('age_limit'), age_limit)
 963             if not is_restricted:
 964                 return True
 965             any_restricted = any_restricted or is_restricted
 966         return not any_restricted
 967
 968
 969 class SearchInfoExtractor(InfoExtractor):
 970     """
 971     Base class for paged search queries extractors.
 972     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 973     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 974     """
 975
 976     @classmethod
 977     def _make_valid_url(cls):
 978         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 979
 980     @classmethod
 981     def suitable(cls, url):
 982         return re.match(cls._make_valid_url(), url) is not None
 983
 984     def _real_extract(self, query):
 985         mobj = re.match(self._make_valid_url(), query)
 986         if mobj is None:
 987             raise ExtractorError('Invalid search query "%s"' % query)
 988
 989         prefix = mobj.group('prefix')
 990         query = mobj.group('query')
 991         if prefix == '':
 992             return self._get_n_results(query, 1)
 993         elif prefix == 'all':
 994             return self._get_n_results(query, self._MAX_RESULTS)
 995         else:
 996             n = int(prefix)
 997             if n <= 0:
 998                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 999             elif n > self._MAX_RESULTS:
1000                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1001                 n = self._MAX_RESULTS
1002             return self._get_n_results(query, n)
1003
1004     def _get_n_results(self, query, n):
1005         """Get a specified number of results for a query"""
1006         raise NotImplementedError("This method must be implemented by subclasses")
1007
1008     @property
1009     def SEARCH_KEY(self):
1010         return self._SEARCH_KEY