11 compat_urllib_request,
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False      # becomes True after _real_initialize() has run once
    _downloader = None  # FileDownloader instance, set via set_downloader()
    _WORKING = True     # subclasses set this to False for broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Human-readable IE name: the class name minus the 'IE' suffix
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Determine the encoding: Content-Type charset first, then a <meta>
        # charset declaration in the first KB, finally fall back to UTF-8
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue on capable (non-Windows) terminals
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = [self._og_regex('video')]
        if secure: regexes.insert(0, self._og_regex('video:secure_url'))
        return self._html_search_regex(regexes, html, name, **kargs)

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key>all:<query>" or "<key><N>:<query>"
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare search key: return only the first result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the per-IE maximum and warn the user
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY