7 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or url and ext:

    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
                    * filesize    The number of bytes, if known in advance
                    * player_url  SWF Player URL (used for rtmpdump).
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Wire up the FileDownloader this extractor reports to; may be None
        # and can be (re)assigned later via set_downloader().
        self.set_downloader(downloader)
113 def suitable(cls, url):
114 """Receives a URL and returns True if suitable for this IE."""
116 # This does not use has/getattr intentionally - we want to know whether
117 # we have cached the regexp for *this* class, whereas getattr would also
118 # match the superclass
119 if '_VALID_URL_RE' not in cls.__dict__:
120 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
121 return cls._VALID_URL_RE.match(url) is not None
125 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): this calls _real_initialize() unconditionally; a
        # run-once guard (e.g. a "ready" flag) appears to have been lost in
        # this copy — confirm repeated calls are safe for subclasses.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): no initialize() call precedes extraction here —
        # verify callers run initialize() first (login/age-gate handling).
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader provides output (to_screen/report_warning), params,
        # and network access (urlopen) used throughout this class.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        Default is a no-op: extractors with no login/age-gate needs keep it.
        """
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Must return the information dict(s) described in the class docstring.
        """
153 """A string for getting the InfoExtractor with get_info_extractor"""
154 return cls.__name__[:-2]
158 return type(self).__name__[:-2]
160 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
161 """ Returns the response handle """
163 self.report_download_webpage(video_id)
164 elif note is not False:
166 self.to_screen(u'%s' % (note,))
168 self.to_screen(u'%s: %s' % (video_id, note))
170 return self._downloader.urlopen(url_or_request)
171 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
173 errnote = u'Unable to download webpage'
174 errmsg = u'%s: %s' % (errnote, compat_str(err))
176 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
178 self._downloader.report_warning(errmsg)
181 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
182 """ Returns a tuple (page content as string, URL handle) """
184 # Strip hashes from the URL (#1038)
185 if isinstance(url_or_request, (compat_str, str)):
186 url_or_request = url_or_request.partition('#')[0]
188 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
192 content_type = urlh.headers.get('Content-Type', '')
193 webpage_bytes = urlh.read()
194 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
196 encoding = m.group(1)
198 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
199 webpage_bytes[:1024])
201 encoding = m.group(1).decode('ascii')
204 if self._downloader.params.get('dump_intermediate_pages', False):
206 url = url_or_request.get_full_url()
207 except AttributeError:
209 self.to_screen(u'Dumping request to ' + url)
210 dump = base64.b64encode(webpage_bytes).decode('ascii')
211 self._downloader.to_screen(dump)
212 if self._downloader.params.get('write_pages', False):
214 url = url_or_request.get_full_url()
215 except AttributeError:
217 raw_filename = ('%s_%s.dump' % (video_id, url))
218 filename = sanitize_filename(raw_filename, restricted=True)
219 self.to_screen(u'Saving request to ' + filename)
220 with open(filename, 'wb') as outf:
221 outf.write(webpage_bytes)
223 content = webpage_bytes.decode(encoding, 'replace')
224 return (content, urlh)
226 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
227 """ Returns the data of the page as a string """
228 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
235 def _download_xml(self, url_or_request, video_id,
236 note=u'Downloading XML', errnote=u'Unable to download XML',
237 transform_source=None):
238 """Return the xml as an xml.etree.ElementTree.Element"""
239 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
241 xml_string = transform_source(xml_string)
242 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        # IE_NAME identifies the concrete extractor in mixed output.
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')
    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')
    # Methods for following issue #608 (URL/playlist result helpers)
265 def url_result(self, url, ie=None, video_id=None):
266 """Returns a url that points to a page that should be processed"""
267 #TODO: ie should be the class used for getting the info
268 video_info = {'_type': 'url',
271 if video_id is not None:
272 video_info['id'] = video_id
274 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
275 """Returns a playlist"""
276 video_info = {'_type': 'playlist',
279 video_info['id'] = playlist_id
281 video_info['title'] = playlist_title
284 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
286 Perform a regex search on the given string, using a single or a list of
287 patterns returning the first matching group.
288 In case of failure return a default value or raise a WARNING or a
289 RegexNotFoundError, depending on fatal, specifying the field name.
291 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
292 mobj = re.search(pattern, string, flags)
295 mobj = re.search(p, string, flags)
298 if sys.stderr.isatty() and os.name != 'nt':
299 _name = u'\033[0;34m%s\033[0m' % name
304 # return the first matching group
305 return next(g for g in mobj.groups() if g is not None)
306 elif default is not None:
309 raise RegexNotFoundError(u'Unable to extract %s' % _name)
311 self._downloader.report_warning(u'unable to extract %s; '
312 u'please report this issue on http://yt-dl.org/bug' % _name)
315 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
317 Like _search_regex, but strips HTML tags and unescapes entities.
319 res = self._search_regex(pattern, string, name, default, fatal, flags)
321 return clean_html(res).strip()
325 def _get_login_info(self):
327 Get the the login info as (username, password)
328 It will look in the netrc file using the _NETRC_MACHINE value
329 If there's no info available, return (None, None)
331 if self._downloader is None:
336 downloader_params = self._downloader.params
338 # Attempt to use provided username and password or .netrc data
339 if downloader_params.get('username', None) is not None:
340 username = downloader_params['username']
341 password = downloader_params['password']
342 elif downloader_params.get('usenetrc', False):
344 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
349 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
350 except (IOError, netrc.NetrcParseError) as err:
351 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
353 return (username, password)
355 # Helper functions for extracting OpenGraph info
357 def _og_regexes(prop):
358 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
359 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
360 template = r'<meta[^>]+?%s[^>]+?%s'
362 template % (property_re, content_re),
363 template % (content_re, property_re),
366 def _og_search_property(self, prop, html, name=None, **kargs):
368 name = 'OpenGraph %s' % prop
369 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
372 return unescapeHTML(escaped)
    def _og_search_thumbnail(self, html, **kargs):
        # og:image, non-fatal: returns None when the tag is absent.
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
    def _og_search_description(self, html, **kargs):
        # og:description, non-fatal: returns None when the tag is absent.
        return self._og_search_property('description', html, fatal=False, **kargs)
    def _og_search_title(self, html, **kargs):
        # og:title; fatal by default (a missing title is an extraction error).
        return self._og_search_property('title', html, **kargs)
383 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
384 regexes = self._og_regexes('video')
385 if secure: regexes = self._og_regexes('video:secure_url') + regexes
386 return self._html_search_regex(regexes, html, name, **kargs)
388 def _html_search_meta(self, name, html, display_name=None):
389 if display_name is None:
391 return self._html_search_regex(
393 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
394 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
395 html, display_name, fatal=False)
    def _dc_search_uploader(self, html):
        # Dublin Core creator metadata doubles as the uploader name.
        return self._html_search_meta('dc.creator', html, 'uploader')
400 def _rta_search(self, html):
401 # See http://www.rtalabel.org/index.php?content=howtofaq#single
402 if re.search(r'(?ix)<meta\s+name="rating"\s+'
403 r' content="RTA-5042-1996-1400-1577-RTA"',
408 def _media_rating_search(self, html):
409 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
410 rating = self._html_search_meta('rating', html)
422 return RATING_TABLE.get(rating.lower(), None)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
434 def _make_valid_url(cls):
435 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
438 def suitable(cls, url):
439 return re.match(cls._make_valid_url(), url) is not None
441 def _real_extract(self, query):
442 mobj = re.match(self._make_valid_url(), query)
444 raise ExtractorError(u'Invalid search query "%s"' % query)
446 prefix = mobj.group('prefix')
447 query = mobj.group('query')
449 return self._get_n_results(query, 1)
450 elif prefix == 'all':
451 return self._get_n_results(query, self._MAX_RESULTS)
455 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
456 elif n > self._MAX_RESULTS:
457 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
458 n = self._MAX_RESULTS
459 return self._get_n_results(query, n)
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Abstract hook: each search IE implements its own paging.
        raise NotImplementedError("This method must be implemented by subclasses")
466 def SEARCH_KEY(self):
467 return self._SEARCH_KEY