youtube_dl/extractor/common.py

   1 import base64
   2 import os
   3 import re
   4 import socket
   5 import sys
   6 import netrc
   7 import xml.etree.ElementTree
   8
   9 from ..utils import (
  10     compat_http_client,
  11     compat_urllib_error,
  12     compat_str,
  13
  14     clean_html,
  15     compiled_regex_type,
  16     ExtractorError,
  17     RegexNotFoundError,
  18     sanitize_filename,
  19     unescapeHTML,
  20 )
  21 _NO_DEFAULT = object()
  22
  23
  24 class InfoExtractor(object):
  25     """Information Extractor class.
  26
  27     Information extractors are the classes that, given a URL, extract
  28     information about the video (or videos) the URL refers to. This
  29     information includes the real video URL, the video title, author and
  30     others. The information is stored in a dictionary which is then
  31     passed to the FileDownloader. The FileDownloader processes this
  32     information possibly downloading the video to the file system, among
  33     other possible outcomes.
  34
  35     The dictionaries must include the following fields:
  36
  37     id:             Video identifier.
  38     title:          Video title, unescaped.
  39
  40     Additionally, it must contain either a formats entry or a url one:
  41
  42     formats:        A list of dictionaries for each format available, ordered
  43                     from worst to best quality.
  44
  45                     Potential fields:
  46                     * url        Mandatory. The URL of the video file
  47                     * ext        Will be calculated from url if missing
  48                     * format     A human-readable description of the format
  49                                  ("mp4 container with h264/opus").
  50                                  Calculated from the format_id, width, height.
  51                                  and format_note fields if missing.
  52                     * format_id  A short description of the format
  53                                  ("mp4_h264_opus" or "19")
  54                     * format_note Additional info about the format
  55                                  ("3D" or "DASH video")
  56                     * width      Width of the video, if known
  57                     * height     Height of the video, if known
  58                     * resolution Textual description of width and height
  59                     * abr        Average audio bitrate in KBit/s
  60                     * acodec     Name of the audio codec in use
  61                     * vbr        Average video bitrate in KBit/s
  62                     * vcodec     Name of the video codec in use
  63                     * filesize   The number of bytes, if known in advance
  64                     * player_url SWF Player URL (used for rtmpdump).
  65                     * preference Order number of this format. If this field is
  66                                  present, the formats get sorted by this field.
  67                                  -1 for default (order by other properties),
  68                                  -2 or smaller for less than default.
  69     url:            Final video URL.
  70     ext:            Video filename extension.
  71     format:         The video format, defaults to ext (used for --get-format)
  72     player_url:     SWF Player URL (used for rtmpdump).
  73
  74     The following fields are optional:
  75
  76     thumbnails:     A list of dictionaries (with the entries "resolution" and
  77                     "url") for the varying thumbnails
  78     thumbnail:      Full URL to a video thumbnail image.
  79     description:    One-line video description.
  80     uploader:       Full name of the video uploader.
  81     upload_date:    Video upload date (YYYYMMDD).
  82     uploader_id:    Nickname or id of the video uploader.
  83     location:       Physical location of the video.
  84     subtitles:      The subtitle file contents as a dictionary in the format
  85                     {language: subtitles}.
  86     duration:       Length of the video in seconds, as an integer.
  87     view_count:     How many users have watched the video on the platform.
  88     like_count:     Number of positive ratings of the video
  89     dislike_count:  Number of negative ratings of the video
  90     comment_count:  Number of comments on the video
  91     age_limit:      Age restriction for the video, as an integer (years)
  92     webpage_url:    The url to the video webpage, if given to youtube-dl it
  93                     should allow to get the same result again. (It will be set
  94                     by YoutubeDL if it's missing)
  95
  96     Unless mentioned otherwise, the fields should be Unicode strings.
  97
  98     Subclasses of this one should re-define the _real_initialize() and
  99     _real_extract() methods and define a _VALID_URL regexp.
 100     Probably, they should also be added to the list of extractors.
 101
 102     _real_extract() must return a *list* of information dictionaries as
 103     described above.
 104
 105     Finally, the _WORKING attribute should be set to False for broken IEs
 106     in order to warn the users and skip the tests.
 107     """
 108
 109     _ready = False
 110     _downloader = None
 111     _WORKING = True
 112
 113     def __init__(self, downloader=None):
 114         """Constructor. Receives an optional downloader."""
 115         self._ready = False
 116         self.set_downloader(downloader)
 117
 118     @classmethod
 119     def suitable(cls, url):
 120         """Receives a URL and returns True if suitable for this IE."""
 121
 122         # This does not use has/getattr intentionally - we want to know whether
 123         # we have cached the regexp for *this* class, whereas getattr would also
 124         # match the superclass
 125         if '_VALID_URL_RE' not in cls.__dict__:
 126             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 127         return cls._VALID_URL_RE.match(url) is not None
 128
 129     @classmethod
 130     def working(cls):
 131         """Getter method for _WORKING."""
 132         return cls._WORKING
 133
 134     def initialize(self):
 135         """Initializes an instance (authentication, etc)."""
 136         if not self._ready:
 137             self._real_initialize()
 138             self._ready = True
 139
 140     def extract(self, url):
 141         """Extracts URL information and returns it in list of dicts."""
 142         self.initialize()
 143         return self._real_extract(url)
 144
 145     def set_downloader(self, downloader):
 146         """Sets the downloader for this IE."""
 147         self._downloader = downloader
 148
 149     def _real_initialize(self):
 150         """Real initialization process. Redefine in subclasses."""
 151         pass
 152
 153     def _real_extract(self, url):
 154         """Real extraction process. Redefine in subclasses."""
 155         pass
 156
 157     @classmethod
 158     def ie_key(cls):
 159         """A string for getting the InfoExtractor with get_info_extractor"""
 160         return cls.__name__[:-2]
 161
 162     @property
 163     def IE_NAME(self):
 164         return type(self).__name__[:-2]
 165
 166     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 167         """ Returns the response handle """
 168         if note is None:
 169             self.report_download_webpage(video_id)
 170         elif note is not False:
 171             if video_id is None:
 172                 self.to_screen(u'%s' % (note,))
 173             else:
 174                 self.to_screen(u'%s: %s' % (video_id, note))
 175         try:
 176             return self._downloader.urlopen(url_or_request)
 177         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 178             if errnote is False:
 179                 return False
 180             if errnote is None:
 181                 errnote = u'Unable to download webpage'
 182             errmsg = u'%s: %s' % (errnote, compat_str(err))
 183             if fatal:
 184                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 185             else:
 186                 self._downloader.report_warning(errmsg)
 187                 return False
 188
 189     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 190         """ Returns a tuple (page content as string, URL handle) """
 191
 192         # Strip hashes from the URL (#1038)
 193         if isinstance(url_or_request, (compat_str, str)):
 194             url_or_request = url_or_request.partition('#')[0]
 195
 196         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 197         if urlh is False:
 198             assert not fatal
 199             return False
 200         content_type = urlh.headers.get('Content-Type', '')
 201         webpage_bytes = urlh.read()
 202         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 203         if m:
 204             encoding = m.group(1)
 205         else:
 206             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 207                           webpage_bytes[:1024])
 208             if m:
 209                 encoding = m.group(1).decode('ascii')
 210             else:
 211                 encoding = 'utf-8'
 212         if self._downloader.params.get('dump_intermediate_pages', False):
 213             try:
 214                 url = url_or_request.get_full_url()
 215             except AttributeError:
 216                 url = url_or_request
 217             self.to_screen(u'Dumping request to ' + url)
 218             dump = base64.b64encode(webpage_bytes).decode('ascii')
 219             self._downloader.to_screen(dump)
 220         if self._downloader.params.get('write_pages', False):
 221             try:
 222                 url = url_or_request.get_full_url()
 223             except AttributeError:
 224                 url = url_or_request
 225             raw_filename = ('%s_%s.dump' % (video_id, url))
 226             filename = sanitize_filename(raw_filename, restricted=True)
 227             self.to_screen(u'Saving request to ' + filename)
 228             with open(filename, 'wb') as outf:
 229                 outf.write(webpage_bytes)
 230
 231         content = webpage_bytes.decode(encoding, 'replace')
 232         return (content, urlh)
 233
 234     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 235         """ Returns the data of the page as a string """
 236         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 237         if res is False:
 238             return res
 239         else:
 240             content, _ = res
 241             return content
 242
 243     def _download_xml(self, url_or_request, video_id,
 244                       note=u'Downloading XML', errnote=u'Unable to download XML',
 245                       transform_source=None):
 246         """Return the xml as an xml.etree.ElementTree.Element"""
 247         xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
 248         if transform_source:
 249             xml_string = transform_source(xml_string)
 250         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 251
 252     def report_warning(self, msg, video_id=None):
 253         idstr = u'' if video_id is None else u'%s: ' % video_id
 254         self._downloader.report_warning(
 255             u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
 256
 257     def to_screen(self, msg):
 258         """Print msg to screen, prefixing it with '[ie_name]'"""
 259         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 260
 261     def report_extraction(self, id_or_name):
 262         """Report information extraction."""
 263         self.to_screen(u'%s: Extracting information' % id_or_name)
 264
 265     def report_download_webpage(self, video_id):
 266         """Report webpage download."""
 267         self.to_screen(u'%s: Downloading webpage' % video_id)
 268
 269     def report_age_confirmation(self):
 270         """Report attempt to confirm age."""
 271         self.to_screen(u'Confirming age')
 272
 273     def report_login(self):
 274         """Report attempt to log in."""
 275         self.to_screen(u'Logging in')
 276
 277     #Methods for following #608
 278     @staticmethod
 279     def url_result(url, ie=None, video_id=None):
 280         """Returns a url that points to a page that should be processed"""
 281         #TODO: ie should be the class used for getting the info
 282         video_info = {'_type': 'url',
 283                       'url': url,
 284                       'ie_key': ie}
 285         if video_id is not None:
 286             video_info['id'] = video_id
 287         return video_info
 288     @staticmethod
 289     def playlist_result(entries, playlist_id=None, playlist_title=None):
 290         """Returns a playlist"""
 291         video_info = {'_type': 'playlist',
 292                       'entries': entries}
 293         if playlist_id:
 294             video_info['id'] = playlist_id
 295         if playlist_title:
 296             video_info['title'] = playlist_title
 297         return video_info
 298
 299     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 300         """
 301         Perform a regex search on the given string, using a single or a list of
 302         patterns returning the first matching group.
 303         In case of failure return a default value or raise a WARNING or a
 304         RegexNotFoundError, depending on fatal, specifying the field name.
 305         """
 306         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 307             mobj = re.search(pattern, string, flags)
 308         else:
 309             for p in pattern:
 310                 mobj = re.search(p, string, flags)
 311                 if mobj: break
 312
 313         if os.name != 'nt' and sys.stderr.isatty():
 314             _name = u'\033[0;34m%s\033[0m' % name
 315         else:
 316             _name = name
 317
 318         if mobj:
 319             # return the first matching group
 320             return next(g for g in mobj.groups() if g is not None)
 321         elif default is not _NO_DEFAULT:
 322             return default
 323         elif fatal:
 324             raise RegexNotFoundError(u'Unable to extract %s' % _name)
 325         else:
 326             self._downloader.report_warning(u'unable to extract %s; '
 327                 u'please report this issue on http://yt-dl.org/bug' % _name)
 328             return None
 329
 330     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 331         """
 332         Like _search_regex, but strips HTML tags and unescapes entities.
 333         """
 334         res = self._search_regex(pattern, string, name, default, fatal, flags)
 335         if res:
 336             return clean_html(res).strip()
 337         else:
 338             return res
 339
 340     def _get_login_info(self):
 341         """
 342         Get the the login info as (username, password)
 343         It will look in the netrc file using the _NETRC_MACHINE value
 344         If there's no info available, return (None, None)
 345         """
 346         if self._downloader is None:
 347             return (None, None)
 348
 349         username = None
 350         password = None
 351         downloader_params = self._downloader.params
 352
 353         # Attempt to use provided username and password or .netrc data
 354         if downloader_params.get('username', None) is not None:
 355             username = downloader_params['username']
 356             password = downloader_params['password']
 357         elif downloader_params.get('usenetrc', False):
 358             try:
 359                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 360                 if info is not None:
 361                     username = info[0]
 362                     password = info[2]
 363                 else:
 364                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 365             except (IOError, netrc.NetrcParseError) as err:
 366                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 367
 368         return (username, password)
 369
 370     # Helper functions for extracting OpenGraph info
 371     @staticmethod
 372     def _og_regexes(prop):
 373         content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
 374         property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
 375         template = r'<meta[^>]+?%s[^>]+?%s'
 376         return [
 377             template % (property_re, content_re),
 378             template % (content_re, property_re),
 379         ]
 380
 381     def _og_search_property(self, prop, html, name=None, **kargs):
 382         if name is None:
 383             name = 'OpenGraph %s' % prop
 384         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 385         if escaped is None:
 386             return None
 387         return unescapeHTML(escaped)
 388
 389     def _og_search_thumbnail(self, html, **kargs):
 390         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 391
 392     def _og_search_description(self, html, **kargs):
 393         return self._og_search_property('description', html, fatal=False, **kargs)
 394
 395     def _og_search_title(self, html, **kargs):
 396         return self._og_search_property('title', html, **kargs)
 397
 398     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 399         regexes = self._og_regexes('video')
 400         if secure: regexes = self._og_regexes('video:secure_url') + regexes
 401         return self._html_search_regex(regexes, html, name, **kargs)
 402
 403     def _html_search_meta(self, name, html, display_name=None):
 404         if display_name is None:
 405             display_name = name
 406         return self._html_search_regex(
 407             r'''(?ix)<meta
 408                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
 409                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 410             html, display_name, fatal=False)
 411
 412     def _dc_search_uploader(self, html):
 413         return self._html_search_meta('dc.creator', html, 'uploader')
 414
 415     def _rta_search(self, html):
 416         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 417         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 418                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 419                      html):
 420             return 18
 421         return 0
 422
 423     def _media_rating_search(self, html):
 424         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 425         rating = self._html_search_meta('rating', html)
 426
 427         if not rating:
 428             return None
 429
 430         RATING_TABLE = {
 431             'safe for kids': 0,
 432             'general': 8,
 433             '14 years': 14,
 434             'mature': 17,
 435             'restricted': 19,
 436         }
 437         return RATING_TABLE.get(rating.lower(), None)
 438
 439
 440
 441 class SearchInfoExtractor(InfoExtractor):
 442     """
 443     Base class for paged search queries extractors.
 444     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 445     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 446     """
 447
 448     @classmethod
 449     def _make_valid_url(cls):
 450         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 451
 452     @classmethod
 453     def suitable(cls, url):
 454         return re.match(cls._make_valid_url(), url) is not None
 455
 456     def _real_extract(self, query):
 457         mobj = re.match(self._make_valid_url(), query)
 458         if mobj is None:
 459             raise ExtractorError(u'Invalid search query "%s"' % query)
 460
 461         prefix = mobj.group('prefix')
 462         query = mobj.group('query')
 463         if prefix == '':
 464             return self._get_n_results(query, 1)
 465         elif prefix == 'all':
 466             return self._get_n_results(query, self._MAX_RESULTS)
 467         else:
 468             n = int(prefix)
 469             if n <= 0:
 470                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 471             elif n > self._MAX_RESULTS:
 472                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 473                 n = self._MAX_RESULTS
 474             return self._get_n_results(query, n)
 475
 476     def _get_n_results(self, query, n):
 477         """Get a specified number of results for a query"""
 478         raise NotImplementedError("This method must be implemented by subclasses")
 479
 480     @property
 481     def SEARCH_KEY(self):
 482         return self._SEARCH_KEY