11 compat_urllib_request,
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # becomes True once _real_initialize() has run
    _downloader = None      # set via set_downloader()
    _WORKING = True         # subclasses set this to False for broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # User-visible extractor name, derived from the class name
        # (drops the trailing two characters, conventionally "IE").
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header ...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ... otherwise sniff a <meta charset=...> near the top of the
            # document, falling back to UTF-8.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name in error output on capable terminals.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        # Two patterns: property attribute before content, and the reverse order.
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (single result), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No count given: return a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum, warning the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY