youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7 import xml.etree.ElementTree
   8
   9 from ..utils import (
  10     compat_http_client,
  11     compat_urllib_error,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     RegexNotFoundError,
  18     sanitize_filename,
  19     unescapeHTML,
  20 )
# Sentinel meaning "the caller supplied no default".  A dedicated object is
# used so that None can itself be passed as an explicit default value to
# InfoExtractor._search_regex() and InfoExtractor._html_search_regex().
_NO_DEFAULT = object()
  22
  23
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or url and ext:

    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    as returned by urllib.request.urlopen

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow getting the same result again. (It will be
                    set by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Becomes True on an instance once initialize() has run
    # _real_initialize(), so that step happens at most once per instance.
    _ready = False
    # Downloader object; the methods below use it for urlopen(), to_screen(),
    # report_warning() and its params dictionary.
    _downloader = None
    # Subclasses set this to False to mark a broken extractor (see working()).
    _WORKING = True
 107
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        The instance starts out not initialized; the downloader is stored
        through set_downloader() so that subclass overrides are honoured.
        """
        self._ready = False
        self.set_downloader(downloader)
 112
 113     @classmethod
 114     def suitable(cls, url):
 115         """Receives a URL and returns True if suitable for this IE."""
 116
 117         # This does not use has/getattr intentionally - we want to know whether
 118         # we have cached the regexp for *this* class, whereas getattr would also
 119         # match the superclass
 120         if '_VALID_URL_RE' not in cls.__dict__:
 121             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 122         return cls._VALID_URL_RE.match(url) is not None
 123
    @classmethod
    def working(cls):
        """Getter method for _WORKING.

        Returns the class' _WORKING flag, which is True unless a subclass
        overrides it.
        """
        return cls._WORKING
 128
 129     def initialize(self):
 130         """Initializes an instance (authentication, etc)."""
 131         if not self._ready:
 132             self._real_initialize()
 133             self._ready = True
 134
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts.

        Makes sure the instance is initialized first, then hands the URL to
        _real_extract() and returns whatever that returns.
        """
        self.initialize()
        return self._real_extract(url)
 139
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The object is stored as self._downloader; None is accepted.
        """
        self._downloader = downloader
 143
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        Called by initialize() the first time it runs on an instance. The
        base implementation does nothing.
        """
        pass
 147
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Called by extract() with the URL to process. The base implementation
        does nothing and therefore returns None.
        """
        pass
 151
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor.

        This is the class name with its last two characters removed
        (presumably the "IE" suffix of extractor class names).
        """
        return cls.__name__[:-2]
 156
    @property
    def IE_NAME(self):
        """Name of this extractor as used by to_screen() in message prefixes.

        This is the instance's class name with its last two characters
        removed (presumably the "IE" suffix of extractor class names).
        """
        return type(self).__name__[:-2]
 160
 161     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 162         """ Returns the response handle """
 163         if note is None:
 164             self.report_download_webpage(video_id)
 165         elif note is not False:
 166             if video_id is None:
 167                 self.to_screen(u'%s' % (note,))
 168             else:
 169                 self.to_screen(u'%s: %s' % (video_id, note))
 170         try:
 171             return self._downloader.urlopen(url_or_request)
 172         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 173             if errnote is None:
 174                 errnote = u'Unable to download webpage'
 175             errmsg = u'%s: %s' % (errnote, compat_str(err))
 176             if fatal:
 177                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 178             else:
 179                 self._downloader.report_warning(errmsg)
 180                 return False
 181
 182     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 183         """ Returns a tuple (page content as string, URL handle) """
 184
 185         # Strip hashes from the URL (#1038)
 186         if isinstance(url_or_request, (compat_str, str)):
 187             url_or_request = url_or_request.partition('#')[0]
 188
 189         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 190         if urlh is False:
 191             assert not fatal
 192             return False
 193         content_type = urlh.headers.get('Content-Type', '')
 194         webpage_bytes = urlh.read()
 195         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 196         if m:
 197             encoding = m.group(1)
 198         else:
 199             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 200                           webpage_bytes[:1024])
 201             if m:
 202                 encoding = m.group(1).decode('ascii')
 203             else:
 204                 encoding = 'utf-8'
 205         if self._downloader.params.get('dump_intermediate_pages', False):
 206             try:
 207                 url = url_or_request.get_full_url()
 208             except AttributeError:
 209                 url = url_or_request
 210             self.to_screen(u'Dumping request to ' + url)
 211             dump = base64.b64encode(webpage_bytes).decode('ascii')
 212             self._downloader.to_screen(dump)
 213         if self._downloader.params.get('write_pages', False):
 214             try:
 215                 url = url_or_request.get_full_url()
 216             except AttributeError:
 217                 url = url_or_request
 218             raw_filename = ('%s_%s.dump' % (video_id, url))
 219             filename = sanitize_filename(raw_filename, restricted=True)
 220             self.to_screen(u'Saving request to ' + filename)
 221             with open(filename, 'wb') as outf:
 222                 outf.write(webpage_bytes)
 223
 224         content = webpage_bytes.decode(encoding, 'replace')
 225         return (content, urlh)
 226
 227     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 228         """ Returns the data of the page as a string """
 229         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 230         if res is False:
 231             return res
 232         else:
 233             content, _ = res
 234             return content
 235
 236     def _download_xml(self, url_or_request, video_id,
 237                       note=u'Downloading XML', errnote=u'Unable to download XML',
 238                       transform_source=None):
 239         """Return the xml as an xml.etree.ElementTree.Element"""
 240         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 241         if transform_source:
 242             xml_string = transform_source(xml_string)
 243         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 244
 245     def to_screen(self, msg):
 246         """Print msg to screen, prefixing it with '[ie_name]'"""
 247         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 248
 249     def report_extraction(self, id_or_name):
 250         """Report information extraction."""
 251         self.to_screen(u'%s: Extracting information' % id_or_name)
 252
 253     def report_download_webpage(self, video_id):
 254         """Report webpage download."""
 255         self.to_screen(u'%s: Downloading webpage' % video_id)
 256
    def report_age_confirmation(self):
        """Report attempt to confirm age.

        Prints a fixed status line through to_screen().
        """
        self.to_screen(u'Confirming age')
 260
    def report_login(self):
        """Report attempt to log in.

        Prints a fixed status line through to_screen().
        """
        self.to_screen(u'Logging in')
 264
 265     #Methods for following #608
 266     def url_result(self, url, ie=None, video_id=None):
 267         """Returns a url that points to a page that should be processed"""
 268         #TODO: ie should be the class used for getting the info
 269         video_info = {'_type': 'url',
 270                       'url': url,
 271                       'ie_key': ie}
 272         if video_id is not None:
 273             video_info['id'] = video_id
 274         return video_info
 275     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 276         """Returns a playlist"""
 277         video_info = {'_type': 'playlist',
 278                       'entries': entries}
 279         if playlist_id:
 280             video_info['id'] = playlist_id
 281         if playlist_title:
 282             video_info['title'] = playlist_title
 283         return video_info
 284
 285     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 286         """
 287         Perform a regex search on the given string, using a single or a list of
 288         patterns returning the first matching group.
 289         In case of failure return a default value or raise a WARNING or a
 290         RegexNotFoundError, depending on fatal, specifying the field name.
 291         """
 292         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 293             mobj = re.search(pattern, string, flags)
 294         else:
 295             for p in pattern:
 296                 mobj = re.search(p, string, flags)
 297                 if mobj: break
 298
 299         if os.name != 'nt' and sys.stderr.isatty():
 300             _name = u'\033[0;34m%s\033[0m' % name
 301         else:
 302             _name = name
 303
 304         if mobj:
 305             # return the first matching group
 306             return next(g for g in mobj.groups() if g is not None)
 307         elif default is not _NO_DEFAULT:
 308             return default
 309         elif fatal:
 310             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 311         else:
 312             self._downloader.report_warning(u'unable to extract %s; '
 313                 u'please report this issue on http://yt-dl.org/bug' % _name)
 314             return None
 315
 316     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 317         """
 318         Like _search_regex, but strips HTML tags and unescapes entities.
 319         """
 320         res = self._search_regex(pattern, string, name, default, fatal, flags)
 321         if res:
 322             return clean_html(res).strip()
 323         else:
 324             return res
 325
 326     def _get_login_info(self):
 327         """
 328         Get the the login info as (username, password)
 329         It will look in the netrc file using the _NETRC_MACHINE value
 330         If there's no info available, return (None, None)
 331         """
 332         if self._downloader is None:
 333             return (None, None)
 334
 335         username = None
 336         password = None
 337         downloader_params = self._downloader.params
 338
 339         # Attempt to use provided username and password or .netrc data
 340         if downloader_params.get('username', None) is not None:
 341             username = downloader_params['username']
 342             password = downloader_params['password']
 343         elif downloader_params.get('usenetrc', False):
 344             try:
 345                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 346                 if info is not None:
 347                     username = info[0]
 348                     password = info[2]
 349                 else:
 350                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 351             except (IOError, netrc.NetrcParseError) as err:
 352                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 353
 354         return (username, password)
 355
 356     # Helper functions for extracting OpenGraph info
 357     @staticmethod
 358     def _og_regexes(prop):
 359         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 360         property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
 361         template = r'<meta[^>]+?%s[^>]+?%s'
 362         return [
 363             template % (property_re, content_re),
 364             template % (content_re, property_re),
 365         ]
 366
 367     def _og_search_property(self, prop, html, name=None, **kargs):
 368         if name is None:
 369             name = 'OpenGraph %s' % prop
 370         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 371         if escaped is None:
 372             return None
 373         return unescapeHTML(escaped)
 374
    def _og_search_thumbnail(self, html, **kargs):
        """Return the OpenGraph 'image' property of html.

        The search is non-fatal; extra keyword arguments are passed on to
        _og_search_property().
        """
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 377
    def _og_search_description(self, html, **kargs):
        """Return the OpenGraph 'description' property of html.

        The search is non-fatal; extra keyword arguments are passed on to
        _og_search_property().
        """
        return self._og_search_property('description', html, fatal=False, **kargs)
 380
    def _og_search_title(self, html, **kargs):
        """Return the OpenGraph 'title' property of html.

        Unlike the thumbnail and description helpers this one does not pass
        fatal=False; extra keyword arguments are passed on to
        _og_search_property().
        """
        return self._og_search_property('title', html, **kargs)
 383
 384     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 385         regexes = self._og_regexes('video')
 386         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 387         return self._html_search_regex(regexes, html, name, **kargs)
 388
 389     def _html_search_meta(self, name, html, display_name=None):
 390         if display_name is None:
 391             display_name = name
 392         return self._html_search_regex(
 393             r'''(?ix)<meta
 394                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 395                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 396             html, display_name, fatal=False)
 397
    def _dc_search_uploader(self, html):
        """Return the content of the 'dc.creator' <meta> tag of html.

        Looked up with _html_search_meta(), using 'uploader' as the name shown
        in messages.
        """
        return self._html_search_meta('dc.creator', html, 'uploader')
 400
 401     def _rta_search(self, html):
 402         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 403         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 404                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 405                      html):
 406             return 18
 407         return 0
 408
 409     def _media_rating_search(self, html):
 410         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 411         rating = self._html_search_meta('rating', html)
 412
 413         if not rating:
 414             return None
 415
 416         RATING_TABLE = {
 417             'safe for kids': 0,
 418             'general': 8,
 419             '14 years': 14,
 420             'mature': 17,
 421             'restricted': 19,
 422         }
 423         return RATING_TABLE.get(rating.lower(), None)
 424
 425
 426
 427 class SearchInfoExtractor(InfoExtractor):
 428     """
 429     Base class for paged search queries extractors.
 430     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 431     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 432     """
 433
 434     @classmethod
 435     def _make_valid_url(cls):
 436         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 437
 438     @classmethod
 439     def suitable(cls, url):
 440         return re.match(cls._make_valid_url(), url) is not None
 441
 442     def _real_extract(self, query):
 443         mobj = re.match(self._make_valid_url(), query)
 444         if mobj is None:
 445             raise ExtractorError(u'Invalid search query "%s"' % query)
 446
 447         prefix = mobj.group('prefix')
 448         query = mobj.group('query')
 449         if prefix == '':
 450             return self._get_n_results(query, 1)
 451         elif prefix == 'all':
 452             return self._get_n_results(query, self._MAX_RESULTS)
 453         else:
 454             n = int(prefix)
 455             if n <= 0:
 456                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 457             elif n > self._MAX_RESULTS:
 458                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 459                 n = self._MAX_RESULTS
 460             return self._get_n_results(query, n)
 461
 462     def _get_n_results(self, query, n):
 463         """Get a specified number of results for a query"""
 464         raise NotImplementedError("This method must be implemented by subclasses")
 465
 466     @property
 467     def SEARCH_KEY(self):
 468         return self._SEARCH_KEY