1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
15 from ..compat import (
20 compat_urllib_parse_urlparse,
36 _NO_DEFAULT = object()
39 class InfoExtractor(object):
40 """Information Extractor class.
42 Information extractors are the classes that, given a URL, extract
43 information about the video (or videos) the URL refers to. This
44 information includes the real video URL, the video title, author and
45 others. The information is stored in a dictionary which is then
46 passed to the YoutubeDL. The YoutubeDL processes this
47 information possibly downloading the video to the file system, among
48 other possible outcomes.
50 The type field determines the type of the result.
51 By far the most common value (and the default if _type is missing) is
52 "video", which indicates a single video.
54 For a video, the dictionaries must include the following fields:
57 title: Video title, unescaped.
59 Additionally, it must contain either a formats entry or a url one:
61 formats: A list of dictionaries for each format available, ordered
62 from worst to best quality.
65 * url Mandatory. The URL of the video file
66 * ext Will be calculated from url if missing
67 * format A human-readable description of the format
68 ("mp4 container with h264/opus").
69 Calculated from the format_id, width, height.
70 and format_note fields if missing.
71 * format_id A short description of the format
72 ("mp4_h264_opus" or "19").
73 Technically optional, but strongly recommended.
74 * format_note Additional info about the format
75 ("3D" or "DASH video")
76 * width Width of the video, if known
77 * height Height of the video, if known
78 * resolution Textual description of width and height
79 * tbr Average bitrate of audio and video in KBit/s
80 * abr Average audio bitrate in KBit/s
81 * acodec Name of the audio codec in use
82 * asr Audio sampling rate in Hertz
83 * vbr Average video bitrate in KBit/s
85 * vcodec Name of the video codec in use
86 * container Name of the container format
87 * filesize The number of bytes, if known in advance
88 * filesize_approx An estimate for the number of bytes
89 * player_url SWF Player URL (used for rtmpdump).
90 * protocol The protocol that will be used for the actual
92 "http", "https", "rtsp", "rtmp", "m3u8" or so.
93 * preference Order number of this format. If this field is
94 present and not None, the formats get sorted
95 by this field, regardless of all other values.
96 -1 for default (order by other properties),
97 -2 or smaller for less than default.
98 < -1000 to hide the format (if there is
99 another one which is strictly better)
100 * language_preference Is this in the correct requested
102 10 if it's what the URL is about,
103 -1 for default (don't know),
104 -10 otherwise, other values reserved for now.
105 * quality Order number of the video quality of this
106 format, irrespective of the file format.
107 -1 for default (order by other properties),
108 -2 or smaller for less than default.
109 * source_preference Order number for this video source
110 (quality takes higher priority)
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
113 * http_method HTTP method to use for the download.
114 * http_headers A dictionary of additional HTTP headers
115 to add to the request.
116 * http_post_data Additional data to send with a POST
118 * stretched_ratio If given and not 1, indicates that the
119 video's pixels are not square.
120 width : height ratio as float.
121 * no_resume The server does not support resuming the
122 (HTTP or RTMP) download. Boolean.
124 url: Final video URL.
125 ext: Video filename extension.
126 format: The video format, defaults to ext (used for --get-format)
127 player_url: SWF Player URL (used for rtmpdump).
129 The following fields are optional:
131 alt_title: A secondary title of the video.
132 display_id An alternative identifier for the video, not necessarily
133 unique, but available before title. Typically, id is
134 something like "4234987", title "Dancing naked mole rats",
135 and display_id "dancing-naked-mole-rats"
136 thumbnails: A list of dictionaries, with the following entries:
137 * "id" (optional, string) - Thumbnail format ID
139 * "preference" (optional, int) - quality of the image
140 * "width" (optional, int)
141 * "height" (optional, int)
142 * "resolution" (optional, string "{width}x{height}"),
144 thumbnail: Full URL to a video thumbnail image.
145 description: Full video description.
146 uploader: Full name of the video uploader.
147 timestamp: UNIX timestamp of the moment the video became available.
148 upload_date: Video upload date (YYYYMMDD).
149 If not explicitly set, calculated from timestamp.
150 uploader_id: Nickname or id of the video uploader.
151 location: Physical location where the video was filmed.
152 subtitles: The subtitle file contents as a dictionary in the format
153 {language: subtitles}.
154 duration: Length of the video in seconds, as an integer.
155 view_count: How many users have watched the video on the platform.
156 like_count: Number of positive ratings of the video
157 dislike_count: Number of negative ratings of the video
158 comment_count: Number of comments on the video
159 comments: A list of comments, each with one or more of the following
160 properties (all but one of text or html optional):
161 * "author" - human-readable name of the comment author
162 * "author_id" - user ID of the comment author
164 * "html" - Comment as HTML
165 * "text" - Plain text of the comment
166 * "timestamp" - UNIX timestamp of comment
167 * "parent" - ID of the comment this one is replying to.
168 Set to "root" to indicate that this is a
169 comment to the original video.
170 age_limit: Age restriction for the video, as an integer (years)
171 webpage_url: The url to the video webpage, if given to youtube-dl it
172 should allow to get the same result again. (It will be set
173 by YoutubeDL if it's missing)
174 categories: A list of categories that the video falls in, for example
176 is_live: True, False, or None (=unknown). Whether this video is a
177 live stream that goes on instead of a fixed-length video.
179 Unless mentioned otherwise, the fields should be Unicode strings.
181 Unless mentioned otherwise, None is equivalent to absence of information.
184 _type "playlist" indicates multiple videos.
185 There must be a key "entries", which is a list, an iterable, or a PagedList
186 object, each element of which is a valid dictionary by this specification.
188 Additionally, playlists can have "title" and "id" attributes with the same
189 semantics as videos (see above).
192 _type "multi_video" indicates that there are multiple videos that
193 form a single show, for example multiple acts of an opera or TV episode.
194 It must have an entries key like a playlist and contain all the keys
195 required for a video at the same time.
198 _type "url" indicates that the video must be extracted from another
199 location, possibly by a different extractor. Its only required key is:
200 "url" - the next URL to extract.
201 The key "ie_key" can be set to the class name (minus the trailing "IE",
202 e.g. "Youtube") if the extractor class is known in advance.
203 Additionally, the dictionary may have any properties of the resolved entity
204 known in advance, for example "title" if the title of the referred video is
208 _type "url_transparent" entities have the same specification as "url", but
209 indicate that the given additional information is more precise than the one
210 associated with the resolved URL.
211 This is useful when a site employs a video service that hosts the video and
212 its technical metadata, but that video service does not embed a useful
213 title, description etc.
216 Subclasses of this one should re-define the _real_initialize() and
217 _real_extract() methods and define a _VALID_URL regexp.
218 Probably, they should also be added to the list of extractors.
220 Finally, the _WORKING attribute should be set to False for broken IEs
221 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Attach the (possibly None) downloader via the public setter.
        # NOTE(review): source appears gap-sampled; upstream may also
        # initialize a "ready" flag here — confirm.
        self.set_downloader(downloader)
234 def suitable(cls, url):
235 """Receives a URL and returns True if suitable for this IE."""
237 # This does not use has/getattr intentionally - we want to know whether
238 # we have cached the regexp for *this* class, whereas getattr would also
239 # match the superclass
240 if '_VALID_URL_RE' not in cls.__dict__:
241 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
242 return cls._VALID_URL_RE.match(url) is not None
    def _match_id(cls, url):
        # Same per-class regexp cache as suitable(): check cls.__dict__ so
        # a pattern compiled for a superclass is not reused for this class.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        # NOTE(review): source looks truncated here — the matched 'id'
        # group is presumably returned; confirm against upstream.

    # NOTE(review): orphaned docstring — the enclosing definition (a
    # _WORKING getter) is missing from this gap-sampled copy.
        """Getter method for _WORKING."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): an "only initialize once" guard appears to be
        # missing from this copy — confirm against upstream.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a preceding self.initialize() call appears to be
        # missing from this copy — confirm against upstream.
        return self._real_extract(url)
268 def set_downloader(self, downloader):
269 """Sets the downloader for this IE."""
270 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): orphaned fragments — the ie_key() classmethod and the
    # IE_NAME property these lines belong to are missing from this
    # gap-sampled copy of the file.
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # NOTE(review): gap-sampled copy — the if/try scaffolding around
        # the lines below is partly missing; confirm against upstream.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen('%s' % (note,))
            # With a video_id, prefix the note with it for readability.
            self.to_screen('%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            # Fatal errors re-raise with the traceback; otherwise warn.
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): the non-fatal "urlh is False" early-return appears
        # to be missing from this gap-sampled copy — confirm upstream.
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
        return (content, urlh)
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
        # Read the response body and decode it to text, guessing the
        # encoding from the Content-Type header, a <meta charset>
        # declaration, or a BOM; optionally dumps/saves the page for
        # debugging and detects Websense-style blocking.
        # NOTE(review): gap-sampled copy — several if/try lines are
        # missing below; confirm the control flow against upstream.
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        # Encoding from the Content-Type header ("text/html; charset=...").
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Fall back to a <meta charset=...> near the top of the page.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
        # --dump-intermediate-pages: base64 page dump to the screen.
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # --write-pages: save the raw bytes to a .dump file.
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
            # Keep the file name within FS limits by hashing the tail.
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

            content = webpage_bytes.decode(encoding, 'replace')
            # Last resort: decode as UTF-8, replacing bad bytes.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect Websense content filtering and fail with a clear error.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # NOTE(review): the final `return content` appears to be missing
        # from this gap-sampled copy.
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
        """ Returns the data of the page as a string """
        # Retry loop for flaky connections: re-attempt on IncompleteRead
        # up to `tries` times, sleeping `timeout` seconds in between.
        # NOTE(review): gap-sampled copy — the try/success bookkeeping and
        # the final return are partly missing; confirm against upstream.
        while success is False:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
            except compat_http_client.IncompleteRead as e:
                if try_count >= tries:
                self._sleep(timeout, video_id)
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
        # NOTE(review): the early return above and the
        # `if transform_source:` guard below appear to be missing from
        # this gap-sampled copy.
            # transform_source lets callers fix broken XML before parsing.
            xml_string = transform_source(xml_string)
        # Parse from UTF-8-encoded bytes.
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
        # NOTE(review): the `fatal=True` tail of this signature appears to
        # be missing from this gap-sampled copy.
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if (not fatal) and json_string is False:
        # Delegate the actual decoding (and its error reporting).
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        # Decode a JSON document, optionally pre-processing it with
        # transform_source; on invalid JSON either raise (fatal) or warn.
        # NOTE(review): gap-sampled copy — the try/if scaffolding and the
        # non-fatal return are partly missing here.
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            raise ExtractorError(errmsg, cause=ve)
            self.report_warning(errmsg + str(ve))
443 def report_warning(self, msg, video_id=None):
444 idstr = '' if video_id is None else '%s: ' % video_id
445 self._downloader.report_warning(
446 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
448 def to_screen(self, msg):
449 """Print msg to screen, prefixing it with '[ie_name]'"""
450 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
452 def report_extraction(self, id_or_name):
453 """Report information extraction."""
454 self.to_screen('%s: Extracting information' % id_or_name)
456 def report_download_webpage(self, video_id):
457 """Report webpage download."""
458 self.to_screen('%s: Downloading webpage' % video_id)
460 def report_age_confirmation(self):
461 """Report attempt to confirm age."""
462 self.to_screen('Confirming age')
464 def report_login(self):
465 """Report attempt to log in."""
466 self.to_screen('Logging in')
    # Methods for following #608
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): gap-sampled copy — the @staticmethod decorator,
        # the dict's url/ie_key entries and the final return appear to be
        # missing; confirm against upstream.
        if video_id is not None:
            video_info['id'] = video_id
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        # Build the standard "playlist" info dict; optional metadata is
        # only attached when provided.
        video_info = {'_type': 'playlist',
        # NOTE(review): gap-sampled copy — the 'entries' entry, the
        # id/title guards and the final return appear partly missing.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # NOTE(review): gap-sampled copy — the list-of-patterns loop and
        # several branch lines are missing below; confirm upstream.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
                mobj = re.search(p, string, flags)
        # Highlight the field name (blue) on capable terminals.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
            return mobj.group(group)
        elif default is not _NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s; '
                'please report this issue on http://yt-dl.org/bug' % _name)
    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        # NOTE(review): the None-guard around the line below appears to be
        # missing from this gap-sampled copy.
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
        # NOTE(review): gap-sampled copy — the early return and the
        # username/password initialisation appear to be missing here.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # NOTE(review): the surrounding try/if-info scaffolding is
                # missing from this copy — confirm against upstream.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)
    def _get_tfa_info(self):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
        # NOTE(review): the early return for a missing downloader appears
        # to be absent from this gap-sampled copy.
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']
        # NOTE(review): the final fallback return appears to be missing.
    # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        # Build regexes matching <meta property="og:<prop>" content="...">
        # in either attribute order.
        # NOTE(review): gap-sampled copy — the @staticmethod decorator and
        # the surrounding `return [` ... `]` appear to be missing.
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),
    def _og_search_property(self, prop, html, name=None, **kargs):
        # Search html for the og:<prop> meta tag and return its unescaped
        # content.
        # NOTE(review): gap-sampled copy — the default-name guard and the
        # None early-return appear to be missing here.
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)
603 def _og_search_thumbnail(self, html, **kargs):
604 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
606 def _og_search_description(self, html, **kargs):
607 return self._og_search_property('description', html, fatal=False, **kargs)
609 def _og_search_title(self, html, **kargs):
610 return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        # Look up og:video / og:video:url; when `secure`, try
        # og:video:secure_url first.
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        # NOTE(review): the `if secure:` guard appears to be missing from
        # this gap-sampled copy.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
618 def _og_search_url(self, html, **kargs):
619 return self._og_search_property('url', html, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        # Find a <meta itemprop/name/property=<name> content=...> tag and
        # return its content attribute.
        if display_name is None:
        # NOTE(review): gap-sampled copy — the display_name fallback and
        # the opening of the raw-string pattern appear to be missing.
        return self._html_search_regex(
            (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
            [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
630 def _dc_search_uploader(self, html):
631 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # Detect the RTA ("Restricted To Adults") label meta tag.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
        # NOTE(review): gap-sampled copy — the html argument and the 18/0
        # age-limit returns appear to be missing; confirm upstream.

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # NOTE(review): the RATING_TABLE mapping and its guard are missing
        # from this gap-sampled copy — confirm against upstream.
        return RATING_TABLE.get(rating.lower(), None)
657 def _twitter_search_player(self, html):
658 return self._html_search_meta('twitter:player', html,
659 'twitter card player')
    def _sort_formats(self, formats):
        # Sort the formats list in-place from worst to best using a
        # composite key (protocol preference, language/quality/source
        # preference, resolution, bitrates, codec/ext preference, size).
        # NOTE(review): this copy is gap-sampled — large parts of the key
        # function (guards, the key tuple head/tail, the else branches)
        # are missing; confirm the full ordering against upstream.
        raise ExtractorError('No video formats found')

        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                audio_ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = -1
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
                ORDER = ['webm', 'flv', 'mp4']
                ext_preference = ORDER.index(f['ext'])
            audio_ext_preference = 0

            f.get('language_preference') if f.get('language_preference') is not None else -1,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('fps') if f.get('fps') is not None else -1,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('source_preference') if f.get('source_preference') is not None else -1,

        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        # Drop formats whose URL does not actually respond (HEAD check).
        # NOTE(review): gap-sampled copy — the surrounding filter call is
        # partly missing; confirm against upstream.
            lambda f: self._is_valid_url(
                item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),

    def _is_valid_url(self, url, video_id, item='video'):
        # Issue a HEAD request to verify the URL is reachable; report and
        # skip on HTTP errors.
        # NOTE(review): gap-sampled copy — the try/return scaffolding is
        # partly missing; confirm against upstream.
            self._request_webpage(
                HEADRequest(url), video_id,
                'Checking %s URL' % item)
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError):
                '%s URL is invalid, skipping' % item, video_id)

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # NOTE(review): the conditional expression around the line below
        # is partly missing from this gap-sampled copy.
            if self._downloader.params.get('prefer_insecure', False)

    def _proto_relative_url(self, url, scheme=None):
        # Resolve protocol-relative URLs ("//host/path") against the
        # preferred scheme.
        # NOTE(review): gap-sampled copy — the None-url guard and the
        # return statements are partly missing here.
        if url.startswith('//'):
                scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        # Tell the user we are waiting, then block for `timeout` seconds.
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        # NOTE(review): the to_screen(msg) / sleep tail appears to be
        # missing from this gap-sampled copy.
    def _extract_f4m_formats(self, manifest_url, video_id):
        # Parse an Adobe HDS (f4m) manifest and build a formats list.
        # NOTE(review): gap-sampled copy — the formats list setup, the
        # version-detection branch and parts of each format dict are
        # missing; confirm against upstream.
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest')

        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                # v2 manifests carry a relative href to the media file.
                manifest_url = '/'.join(manifest_url.split('/')[:-1]) + '/' + media_el.attrib.get('href')
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            format_id = 'f4m-%d' % (i if tbr is None else tbr)
                'format_id': format_id,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
        self._sort_formats(formats)
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        # Parse an HLS master playlist into youtube-dl formats, starting
        # with a meta entry pointing at the playlist itself.
        # NOTE(review): gap-sampled copy — list initialisations, several
        # guards and parts of each format dict are missing; confirm the
        # full control flow against upstream.
            'format_id': 'm3u8-meta',
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',

        # Resolve relative variant URLs against the playlist URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(
            note='Downloading m3u8 information',
            errnote='Failed to download m3u8 information')
        # Attribute-list parser for #EXT-X-STREAM-INF lines.
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                if last_info is None:
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                codecs = last_info.get('CODECS')
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
        self._sort_formats(formats)
    # TODO: improve extraction
    def _extract_smil_formats(self, smil_url, video_id):
        # Parse a SMIL document and build RTMP/HTTP/m3u8 formats from its
        # <video> switch entries.
        # NOTE(review): gap-sampled copy — list setup, several guards and
        # the format dict bodies are partly missing; confirm upstream.
        smil = self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file')

        base = smil.find('./head/meta').get('base')

        for video in smil.findall('./body/switch/video'):
            src = video.get('src')
            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            # Infer the protocol from the base URL when not given.
            if base.startswith('rtmp'):
            elif base.startswith('http'):
            ext = video.get('ext')
                formats.extend(self._extract_m3u8_formats(src, video_id, ext))
            elif proto == 'rtmp':
                streamer = video.get('streamer') or base
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
        self._sort_formats(formats)
904 def _live_title(self, name):
905 """ Generate the title for a live video """
906 now = datetime.datetime.now()
907 now_str = now.strftime("%Y-%m-%d %H:%M")
908 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        # int_or_none with standard error reporting: raise when fatal,
        # otherwise warn via the downloader.
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): stray debug print — should be removed upstream.
            print(getattr(v, kwargs['get_attr']))
        # NOTE(review): gap-sampled copy — the `res is None` guard, the
        # fatal branch and the final return appear to be missing.
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)

    def _float(self, v, name, fatal=False, **kwargs):
        # float_or_none with the same error-reporting convention as _int.
        res = float_or_none(v, **kwargs)
        # NOTE(review): gap-sampled copy — guard/return lines appear to be
        # missing here as well.
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)
    def _set_cookie(self, domain, name, value, expire_time=None):
        # Inject a path='/' cookie for `domain` into the downloader's
        # cookiejar.  The long positional argument list follows
        # cookielib.Cookie's constructor signature — keep the order
        # exactly as-is.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
    def get_testcases(self, include_onlymatching=False):
        # Produce this extractor's test cases from _TEST/_TESTS, tagging
        # each with the extractor name (minus the 'IE' suffix).
        # NOTE(review): gap-sampled copy — the generator scaffolding
        # (loop/yield and some guards) is partly missing; confirm upstream.
        t = getattr(self, '_TEST', None)
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            if not include_onlymatching and t.get('only_matching', False):
            t['name'] = type(self).__name__[:-len('IE')]

    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        # NOTE(review): gap-sampled copy — some guards (playlist check,
        # early break) are missing from this copy.
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
968 class SearchInfoExtractor(InfoExtractor):
970 Base class for paged search queries extractors.
971 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
972 Instances should define _SEARCH_KEY and _MAX_RESULTS.
976 def _make_valid_url(cls):
977 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
980 def suitable(cls, url):
981 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parse "<key><n|all>:<query>" and dispatch to _get_n_results.
        mobj = re.match(self._make_valid_url(), query)
        # NOTE(review): gap-sampled copy — the `mobj is None` guard, the
        # empty-prefix branch and the integer parse of the prefix are
        # partly missing; confirm against upstream.
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix means a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
            return self._get_n_results(query, n)
1003 def _get_n_results(self, query, n):
1004 """Get a specified number of results for a query"""
1005 raise NotImplementedError("This method must be implemented by subclasses")
1008 def SEARCH_KEY(self):
1009 return self._SEARCH_KEY