7 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): these class-level defaults were dropped by the paste
    # mangling; reconstructed from the methods below, which read all three.
    _ready = False        # becomes True after _real_initialize() has run
    _downloader = None    # the FileDownloader/YoutubeDL instance in use
    _WORKING = True       # set to False in broken subclasses

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the (possibly expensive) real initialization once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Class names follow the "<Name>IE" convention; drop the suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Human-readable extractor name used as the log-message prefix."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None -> default message; note=False -> silent; else custom note.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset from the Content-Type header, then a <meta>
        # charset in the first KiB of the body, then fall back to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                # NOTE(review): fallback branch reconstructed — confirm utf-8
                # is the intended default encoding.
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML',
                      # Fixed typo in the default error message ("downloand").
                      errnote=u'Unable to download XML'):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list of patterns: stop at the first one that matches.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name on capable terminals (not Windows).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        # Both attribute orders occur in the wild: property-then-content
        # and content-then-property.
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            # Prefer the https variant when available.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
                [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        # NOTE(review): table values reconstructed from upstream youtube-dl —
        # confirm against the project's copy.
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key>all:<query>" or "<key><n>:<query>"
        # where <n> is a positive integer without leading zeros.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No count prefix: download the first result only.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum and warn the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY