import base64
import os
import re
import socket
import sys
import netrc
import xml.etree.ElementTree

# NOTE(review): reconstructed from a partial capture — the visible fragment
# showed only `xml.etree.ElementTree` and a stray `compat_urllib_parse_urlparse,`
# line.  The names below are exactly those the class bodies in this file use;
# confirm against upstream youtube_dl/extractor/common.py.
from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)

# Sentinel distinguishing "no default supplied" from an explicit default of
# None in _search_regex()/_html_search_regex().
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present, the formats get sorted by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # set True once _real_initialize() has run
    _downloader = None      # the owning FileDownloader/YoutubeDL instance
    _WORKING = True         # set False in subclasses for known-broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the (possibly expensive) real initialization only once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # By convention extractor class names end in "IE"; strip that suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Human-readable name used to prefix screen output.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                # Best-effort mode: warn and signal failure with False.
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header ...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ... then a <meta charset=...> tag near the top of the document ...
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                # ... and fall back to UTF-8.
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list of patterns: take the first one that matches.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name in blue on capable terminals.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        # property and content attributes may appear in either order
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            # Prefer the https variant when available.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        # NOTE(review): table reconstructed from a partial capture — confirm
        # the exact entries against upstream.
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _sort_formats(self, formats):
        # Sort in place from worst to best quality, using an explicit
        # preference when present and heuristics otherwise.
        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain HTTP(S) downloads are preferred over rtmp/rtsp/etc.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (first result), a positive integer, or "all".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the maximum the service supports, with a warning.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY