11 compat_urllib_request,
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url       Mandatory. The URL of the video file
                    * ext       Will be calculated from url if missing
                    * format    A human-readable description of the format
                                ("mp4 container with h264/opus").
                                Calculated from width and height if missing.
                    * format_id A short description of the format
                                ("mp4_h264_opus" or "19")
                    * width     Width of the video, if known
                    * height    Height of the video, if known

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready tracks whether _real_initialize() has already run (lazy init).
    _ready = False
    _downloader = None
    # Set to False in subclasses to mark the extractor as broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the (possibly expensive) real initialization once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # By convention extractor class names end in "IE"; strip that suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Human-readable extractor name, derived like ie_key() but per-instance.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses any screen output for this request.
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header ...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ... then a <meta charset=...> tag near the top of the document ...
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                # ... and fall back to UTF-8.
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string, not a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on mis-declared encodings.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list of patterns: stop at the first one that matches.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name in error output on capable terminals.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a broken .netrc only warns, it does not abort.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        # Matches content wrapped in either double or single quotes (two groups).
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        # Prefer the secure URL when both og:video:secure_url and og:video exist.
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Empty prefix means "first result", a number means "first N results",
        # 'all' means every available result (capped at _MAX_RESULTS).
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp the request to the extractor's hard maximum.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        # Public, read-only accessor for the search URL prefix.
        return self._SEARCH_KEY