7 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height,
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
                    * filesize    The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
104 def __init__(self, downloader=None):
105 """Constructor. Receives an optional downloader."""
107 self.set_downloader(downloader)
110 def suitable(cls, url):
111 """Receives a URL and returns True if suitable for this IE."""
113 # This does not use has/getattr intentionally - we want to know whether
114 # we have cached the regexp for *this* class, whereas getattr would also
115 # match the superclass
116 if '_VALID_URL_RE' not in cls.__dict__:
117 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
118 return cls._VALID_URL_RE.match(url) is not None
122 """Getter method for _WORKING."""
125 def initialize(self):
126 """Initializes an instance (authentication, etc)."""
128 self._real_initialize()
131 def extract(self, url):
132 """Extracts URL information and returns it in list of dicts."""
134 return self._real_extract(url)
136 def set_downloader(self, downloader):
137 """Sets the downloader for this IE."""
138 self._downloader = downloader
140 def _real_initialize(self):
141 """Real initialization process. Redefine in subclasses."""
144 def _real_extract(self, url):
145 """Real extraction process. Redefine in subclasses."""
150 """A string for getting the InfoExtractor with get_info_extractor"""
151 return cls.__name__[:-2]
155 return type(self).__name__[:-2]
157 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
158 """ Returns the response handle """
160 self.report_download_webpage(video_id)
161 elif note is not False:
163 self.to_screen(u'%s' % (note,))
165 self.to_screen(u'%s: %s' % (video_id, note))
167 return self._downloader.urlopen(url_or_request)
168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
170 errnote = u'Unable to download webpage'
171 errmsg = u'%s: %s' % (errnote, compat_str(err))
173 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
175 self._downloader.report_warning(errmsg)
178 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
179 """ Returns a tuple (page content as string, URL handle) """
181 # Strip hashes from the URL (#1038)
182 if isinstance(url_or_request, (compat_str, str)):
183 url_or_request = url_or_request.partition('#')[0]
185 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
189 content_type = urlh.headers.get('Content-Type', '')
190 webpage_bytes = urlh.read()
191 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
193 encoding = m.group(1)
195 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
196 webpage_bytes[:1024])
198 encoding = m.group(1).decode('ascii')
201 if self._downloader.params.get('dump_intermediate_pages', False):
203 url = url_or_request.get_full_url()
204 except AttributeError:
206 self.to_screen(u'Dumping request to ' + url)
207 dump = base64.b64encode(webpage_bytes).decode('ascii')
208 self._downloader.to_screen(dump)
209 if self._downloader.params.get('write_pages', False):
211 url = url_or_request.get_full_url()
212 except AttributeError:
214 raw_filename = ('%s_%s.dump' % (video_id, url))
215 filename = sanitize_filename(raw_filename, restricted=True)
216 self.to_screen(u'Saving request to ' + filename)
217 with open(filename, 'wb') as outf:
218 outf.write(webpage_bytes)
220 content = webpage_bytes.decode(encoding, 'replace')
221 return (content, urlh)
223 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
224 """ Returns the data of the page as a string """
225 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
232 def _download_xml(self, url_or_request, video_id,
233 note=u'Downloading XML', errnote=u'Unable to download XML',
234 transform_source=None):
235 """Return the xml as an xml.etree.ElementTree.Element"""
236 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
238 xml_string = transform_source(xml_string)
239 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
241 def to_screen(self, msg):
242 """Print msg to screen, prefixing it with '[ie_name]'"""
243 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
245 def report_extraction(self, id_or_name):
246 """Report information extraction."""
247 self.to_screen(u'%s: Extracting information' % id_or_name)
249 def report_download_webpage(self, video_id):
250 """Report webpage download."""
251 self.to_screen(u'%s: Downloading webpage' % video_id)
253 def report_age_confirmation(self):
254 """Report attempt to confirm age."""
255 self.to_screen(u'Confirming age')
257 def report_login(self):
258 """Report attempt to log in."""
259 self.to_screen(u'Logging in')
    # Methods for following issue #608 (returning url/playlist result dicts)
262 def url_result(self, url, ie=None, video_id=None):
263 """Returns a url that points to a page that should be processed"""
264 #TODO: ie should be the class used for getting the info
265 video_info = {'_type': 'url',
268 if video_id is not None:
269 video_info['id'] = video_id
271 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
272 """Returns a playlist"""
273 video_info = {'_type': 'playlist',
276 video_info['id'] = playlist_id
278 video_info['title'] = playlist_title
281 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
283 Perform a regex search on the given string, using a single or a list of
284 patterns returning the first matching group.
285 In case of failure return a default value or raise a WARNING or a
286 RegexNotFoundError, depending on fatal, specifying the field name.
288 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
289 mobj = re.search(pattern, string, flags)
292 mobj = re.search(p, string, flags)
295 if sys.stderr.isatty() and os.name != 'nt':
296 _name = u'\033[0;34m%s\033[0m' % name
301 # return the first matching group
302 return next(g for g in mobj.groups() if g is not None)
303 elif default is not None:
306 raise RegexNotFoundError(u'Unable to extract %s' % _name)
308 self._downloader.report_warning(u'unable to extract %s; '
309 u'please report this issue on http://yt-dl.org/bug' % _name)
312 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
314 Like _search_regex, but strips HTML tags and unescapes entities.
316 res = self._search_regex(pattern, string, name, default, fatal, flags)
318 return clean_html(res).strip()
322 def _get_login_info(self):
324 Get the the login info as (username, password)
325 It will look in the netrc file using the _NETRC_MACHINE value
326 If there's no info available, return (None, None)
328 if self._downloader is None:
333 downloader_params = self._downloader.params
335 # Attempt to use provided username and password or .netrc data
336 if downloader_params.get('username', None) is not None:
337 username = downloader_params['username']
338 password = downloader_params['password']
339 elif downloader_params.get('usenetrc', False):
341 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
346 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
347 except (IOError, netrc.NetrcParseError) as err:
348 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
350 return (username, password)
352 # Helper functions for extracting OpenGraph info
354 def _og_regexes(prop):
355 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
356 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
357 template = r'<meta[^>]+?%s[^>]+?%s'
359 template % (property_re, content_re),
360 template % (content_re, property_re),
363 def _og_search_property(self, prop, html, name=None, **kargs):
365 name = 'OpenGraph %s' % prop
366 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
369 return unescapeHTML(escaped)
371 def _og_search_thumbnail(self, html, **kargs):
372 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
374 def _og_search_description(self, html, **kargs):
375 return self._og_search_property('description', html, fatal=False, **kargs)
377 def _og_search_title(self, html, **kargs):
378 return self._og_search_property('title', html, **kargs)
380 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
381 regexes = self._og_regexes('video')
382 if secure: regexes = self._og_regexes('video:secure_url') + regexes
383 return self._html_search_regex(regexes, html, name, **kargs)
385 def _html_search_meta(self, name, html, display_name=None):
386 if display_name is None:
388 return self._html_search_regex(
390 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
391 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
392 html, display_name, fatal=False)
394 def _dc_search_uploader(self, html):
395 return self._html_search_meta('dc.creator', html, 'uploader')
397 def _rta_search(self, html):
398 # See http://www.rtalabel.org/index.php?content=howtofaq#single
399 if re.search(r'(?ix)<meta\s+name="rating"\s+'
400 r' content="RTA-5042-1996-1400-1577-RTA"',
405 def _media_rating_search(self, html):
406 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
407 rating = self._html_search_meta('rating', html)
419 return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
431 def _make_valid_url(cls):
432 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
435 def suitable(cls, url):
436 return re.match(cls._make_valid_url(), url) is not None
438 def _real_extract(self, query):
439 mobj = re.match(self._make_valid_url(), query)
441 raise ExtractorError(u'Invalid search query "%s"' % query)
443 prefix = mobj.group('prefix')
444 query = mobj.group('query')
446 return self._get_n_results(query, 1)
447 elif prefix == 'all':
448 return self._get_n_results(query, self._MAX_RESULTS)
452 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
453 elif n > self._MAX_RESULTS:
454 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
455 n = self._MAX_RESULTS
456 return self._get_n_results(query, n)
458 def _get_n_results(self, query, n):
459 """Get a specified number of results for a query"""
460 raise NotImplementedError("This method must be implemented by subclasses")
463 def SEARCH_KEY(self):
464 return self._SEARCH_KEY