7 import xml.etree.ElementTree
# Sentinel used by _search_regex()/_html_search_regex() so a caller-supplied
# default of None can be told apart from "no default supplied at all".
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or url and ext:

    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
106 def __init__(self, downloader=None):
107 """Constructor. Receives an optional downloader."""
109 self.set_downloader(downloader)
112 def suitable(cls, url):
113 """Receives a URL and returns True if suitable for this IE."""
115 # This does not use has/getattr intentionally - we want to know whether
116 # we have cached the regexp for *this* class, whereas getattr would also
117 # match the superclass
118 if '_VALID_URL_RE' not in cls.__dict__:
119 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
120 return cls._VALID_URL_RE.match(url) is not None
124 """Getter method for _WORKING."""
127 def initialize(self):
128 """Initializes an instance (authentication, etc)."""
130 self._real_initialize()
133 def extract(self, url):
134 """Extracts URL information and returns it in list of dicts."""
136 return self._real_extract(url)
138 def set_downloader(self, downloader):
139 """Sets the downloader for this IE."""
140 self._downloader = downloader
142 def _real_initialize(self):
143 """Real initialization process. Redefine in subclasses."""
146 def _real_extract(self, url):
147 """Real extraction process. Redefine in subclasses."""
152 """A string for getting the InfoExtractor with get_info_extractor"""
153 return cls.__name__[:-2]
157 return type(self).__name__[:-2]
159 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
160 """ Returns the response handle """
162 self.report_download_webpage(video_id)
163 elif note is not False:
165 self.to_screen(u'%s' % (note,))
167 self.to_screen(u'%s: %s' % (video_id, note))
169 return self._downloader.urlopen(url_or_request)
170 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
174 errnote = u'Unable to download webpage'
175 errmsg = u'%s: %s' % (errnote, compat_str(err))
177 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
179 self._downloader.report_warning(errmsg)
182 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
183 """ Returns a tuple (page content as string, URL handle) """
185 # Strip hashes from the URL (#1038)
186 if isinstance(url_or_request, (compat_str, str)):
187 url_or_request = url_or_request.partition('#')[0]
189 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
193 content_type = urlh.headers.get('Content-Type', '')
194 webpage_bytes = urlh.read()
195 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
197 encoding = m.group(1)
199 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
200 webpage_bytes[:1024])
202 encoding = m.group(1).decode('ascii')
205 if self._downloader.params.get('dump_intermediate_pages', False):
207 url = url_or_request.get_full_url()
208 except AttributeError:
210 self.to_screen(u'Dumping request to ' + url)
211 dump = base64.b64encode(webpage_bytes).decode('ascii')
212 self._downloader.to_screen(dump)
213 if self._downloader.params.get('write_pages', False):
215 url = url_or_request.get_full_url()
216 except AttributeError:
218 raw_filename = ('%s_%s.dump' % (video_id, url))
219 filename = sanitize_filename(raw_filename, restricted=True)
220 self.to_screen(u'Saving request to ' + filename)
221 with open(filename, 'wb') as outf:
222 outf.write(webpage_bytes)
224 content = webpage_bytes.decode(encoding, 'replace')
225 return (content, urlh)
227 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
228 """ Returns the data of the page as a string """
229 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
236 def _download_xml(self, url_or_request, video_id,
237 note=u'Downloading XML', errnote=u'Unable to download XML',
238 transform_source=None):
239 """Return the xml as an xml.etree.ElementTree.Element"""
240 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
242 xml_string = transform_source(xml_string)
243 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
245 def report_warning(self, msg, video_id=None):
246 idstr = u'' if video_id is None else u'%s: ' % video_id
247 self._downloader.report_warning(
248 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
250 def to_screen(self, msg):
251 """Print msg to screen, prefixing it with '[ie_name]'"""
252 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
254 def report_extraction(self, id_or_name):
255 """Report information extraction."""
256 self.to_screen(u'%s: Extracting information' % id_or_name)
258 def report_download_webpage(self, video_id):
259 """Report webpage download."""
260 self.to_screen(u'%s: Downloading webpage' % video_id)
262 def report_age_confirmation(self):
263 """Report attempt to confirm age."""
264 self.to_screen(u'Confirming age')
266 def report_login(self):
267 """Report attempt to log in."""
268 self.to_screen(u'Logging in')
270 #Methods for following #608
272 def url_result(url, ie=None, video_id=None):
273 """Returns a url that points to a page that should be processed"""
274 #TODO: ie should be the class used for getting the info
275 video_info = {'_type': 'url',
278 if video_id is not None:
279 video_info['id'] = video_id
282 def playlist_result(entries, playlist_id=None, playlist_title=None):
283 """Returns a playlist"""
284 video_info = {'_type': 'playlist',
287 video_info['id'] = playlist_id
289 video_info['title'] = playlist_title
292 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
294 Perform a regex search on the given string, using a single or a list of
295 patterns returning the first matching group.
296 In case of failure return a default value or raise a WARNING or a
297 RegexNotFoundError, depending on fatal, specifying the field name.
299 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
300 mobj = re.search(pattern, string, flags)
303 mobj = re.search(p, string, flags)
306 if os.name != 'nt' and sys.stderr.isatty():
307 _name = u'\033[0;34m%s\033[0m' % name
312 # return the first matching group
313 return next(g for g in mobj.groups() if g is not None)
314 elif default is not _NO_DEFAULT:
317 raise RegexNotFoundError(u'Unable to extract %s' % _name)
319 self._downloader.report_warning(u'unable to extract %s; '
320 u'please report this issue on http://yt-dl.org/bug' % _name)
323 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
325 Like _search_regex, but strips HTML tags and unescapes entities.
327 res = self._search_regex(pattern, string, name, default, fatal, flags)
329 return clean_html(res).strip()
333 def _get_login_info(self):
335 Get the the login info as (username, password)
336 It will look in the netrc file using the _NETRC_MACHINE value
337 If there's no info available, return (None, None)
339 if self._downloader is None:
344 downloader_params = self._downloader.params
346 # Attempt to use provided username and password or .netrc data
347 if downloader_params.get('username', None) is not None:
348 username = downloader_params['username']
349 password = downloader_params['password']
350 elif downloader_params.get('usenetrc', False):
352 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
357 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
358 except (IOError, netrc.NetrcParseError) as err:
359 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
361 return (username, password)
363 # Helper functions for extracting OpenGraph info
365 def _og_regexes(prop):
366 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
367 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
368 template = r'<meta[^>]+?%s[^>]+?%s'
370 template % (property_re, content_re),
371 template % (content_re, property_re),
374 def _og_search_property(self, prop, html, name=None, **kargs):
376 name = 'OpenGraph %s' % prop
377 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
380 return unescapeHTML(escaped)
382 def _og_search_thumbnail(self, html, **kargs):
383 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
385 def _og_search_description(self, html, **kargs):
386 return self._og_search_property('description', html, fatal=False, **kargs)
388 def _og_search_title(self, html, **kargs):
389 return self._og_search_property('title', html, **kargs)
391 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
392 regexes = self._og_regexes('video')
393 if secure: regexes = self._og_regexes('video:secure_url') + regexes
394 return self._html_search_regex(regexes, html, name, **kargs)
396 def _html_search_meta(self, name, html, display_name=None):
397 if display_name is None:
399 return self._html_search_regex(
401 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
402 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
403 html, display_name, fatal=False)
405 def _dc_search_uploader(self, html):
406 return self._html_search_meta('dc.creator', html, 'uploader')
408 def _rta_search(self, html):
409 # See http://www.rtalabel.org/index.php?content=howtofaq#single
410 if re.search(r'(?ix)<meta\s+name="rating"\s+'
411 r' content="RTA-5042-1996-1400-1577-RTA"',
416 def _media_rating_search(self, html):
417 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
418 rating = self._html_search_meta('rating', html)
430 return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (default 1 result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum with a warning.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY