7 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready tracks whether _real_initialize() has already run, so that
    # initialize() is idempotent.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Strips the trailing "IE" from the class name, e.g. FooIE -> Foo.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header; fall back
        # to a <meta charset=...> tag in the first KiB, then to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML'):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name when printing to a terminal (not on Windows).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        # The content attribute may come before or after the property
        # attribute, so both orderings are returned.
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure: regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return 18 if the page carries the RTA adult-content label, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        """Return an age limit derived from an ICRA-style rating meta tag, or None."""
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (default: 1 result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY