2 from __future__ import unicode_literals
17 from ..compat import (
21 compat_etree_fromstring,
28 compat_urllib_parse_unquote,
29 compat_urllib_parse_urlencode,
30 compat_urllib_request,
32 compat_xml_parse_error,
34 from ..downloader.f4m import (
36 remove_encrypted_media,
62 parse_m3u8_attributes,
80 class InfoExtractor(object):
81 """Information Extractor class.
83 Information extractors are the classes that, given a URL, extract
84 information about the video (or videos) the URL refers to. This
85 information includes the real video URL, the video title, author and
86 others. The information is stored in a dictionary which is then
87 passed to the YoutubeDL. The YoutubeDL processes this
88 information possibly downloading the video to the file system, among
89 other possible outcomes.
91 The type field determines the type of the result.
92 By far the most common value (and the default if _type is missing) is
93 "video", which indicates a single video.
95 For a video, the dictionaries must include the following fields:
98 title: Video title, unescaped.
100 Additionally, it must contain either a formats entry or a url one:
102 formats: A list of dictionaries for each format available, ordered
103 from worst to best quality.
106 * url The mandatory URL representing the media:
107 for plain file media - HTTP URL of this file,
109 for HLS - URL of the M3U8 media playlist,
110 for HDS - URL of the F4M manifest,
112 - HTTP URL to plain file media (in case of
114 - URL of the MPD manifest or base URL
115 representing the media if MPD manifest
                                     is parsed from a string (in case of
118 for MSS - URL of the ISM manifest.
120 The URL of the manifest file in case of
122 for HLS - URL of the M3U8 master playlist,
123 for HDS - URL of the F4M manifest,
124 for DASH - URL of the MPD manifest,
125 for MSS - URL of the ISM manifest.
126 * ext Will be calculated from URL if missing
127 * format A human-readable description of the format
128 ("mp4 container with h264/opus").
129 Calculated from the format_id, width, height.
130 and format_note fields if missing.
131 * format_id A short description of the format
132 ("mp4_h264_opus" or "19").
133 Technically optional, but strongly recommended.
134 * format_note Additional info about the format
135 ("3D" or "DASH video")
136 * width Width of the video, if known
137 * height Height of the video, if known
138 * resolution Textual description of width and height
139 * tbr Average bitrate of audio and video in KBit/s
140 * abr Average audio bitrate in KBit/s
141 * acodec Name of the audio codec in use
142 * asr Audio sampling rate in Hertz
143 * vbr Average video bitrate in KBit/s
145 * vcodec Name of the video codec in use
146 * container Name of the container format
147 * filesize The number of bytes, if known in advance
148 * filesize_approx An estimate for the number of bytes
149 * player_url SWF Player URL (used for rtmpdump).
150 * protocol The protocol that will be used for the actual
151 download, lower-case.
152 "http", "https", "rtsp", "rtmp", "rtmpe",
153 "m3u8", "m3u8_native" or "http_dash_segments".
155 Base URL for fragments. Each fragment's path
156 value (if present) will be relative to
158 * fragments A list of fragments of a fragmented media.
159 Each fragment entry must contain either an url
160 or a path. If an url is present it should be
161 considered by a client. Otherwise both path and
162 fragment_base_url must be present. Here is
163 the list of all potential fields:
164 * "url" - fragment's URL
165 * "path" - fragment's path relative to
167 * "duration" (optional, int or float)
168 * "filesize" (optional, int)
169 * preference Order number of this format. If this field is
170 present and not None, the formats get sorted
171 by this field, regardless of all other values.
172 -1 for default (order by other properties),
173 -2 or smaller for less than default.
174 < -1000 to hide the format (if there is
175 another one which is strictly better)
176 * language Language code, e.g. "de" or "en-US".
177 * language_preference Is this in the language mentioned in
179 10 if it's what the URL is about,
180 -1 for default (don't know),
181 -10 otherwise, other values reserved for now.
182 * quality Order number of the video quality of this
183 format, irrespective of the file format.
184 -1 for default (order by other properties),
185 -2 or smaller for less than default.
186 * source_preference Order number for this video source
187 (quality takes higher priority)
188 -1 for default (order by other properties),
189 -2 or smaller for less than default.
190 * http_headers A dictionary of additional HTTP headers
191 to add to the request.
192 * stretched_ratio If given and not 1, indicates that the
193 video's pixels are not square.
194 width : height ratio as float.
195 * no_resume The server does not support resuming the
196 (HTTP or RTMP) download. Boolean.
197 * downloader_options A dictionary of downloader options as
198 described in FileDownloader
200 url: Final video URL.
201 ext: Video filename extension.
202 format: The video format, defaults to ext (used for --get-format)
203 player_url: SWF Player URL (used for rtmpdump).
205 The following fields are optional:
207 alt_title: A secondary title of the video.
208 display_id An alternative identifier for the video, not necessarily
209 unique, but available before title. Typically, id is
210 something like "4234987", title "Dancing naked mole rats",
211 and display_id "dancing-naked-mole-rats"
212 thumbnails: A list of dictionaries, with the following entries:
213 * "id" (optional, string) - Thumbnail format ID
215 * "preference" (optional, int) - quality of the image
216 * "width" (optional, int)
217 * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
220 * "filesize" (optional, int)
221 thumbnail: Full URL to a video thumbnail image.
222 description: Full video description.
223 uploader: Full name of the video uploader.
224 license: License name the video is licensed under.
225 creator: The creator of the video.
226 release_date: The date (YYYYMMDD) when the video was released.
227 timestamp: UNIX timestamp of the moment the video became available.
228 upload_date: Video upload date (YYYYMMDD).
229 If not explicitly set, calculated from timestamp.
230 uploader_id: Nickname or id of the video uploader.
231 uploader_url: Full URL to a personal webpage of the video uploader.
232 channel: Full name of the channel the video is uploaded on.
233 Note that channel fields may or may not repeat uploader
234 fields. This depends on a particular extractor.
235 channel_id: Id of the channel.
236 channel_url: Full URL to a channel webpage.
237 location: Physical location where the video was filmed.
238 subtitles: The available subtitles as a dictionary in the format
239 {tag: subformats}. "tag" is usually a language code, and
240 "subformats" is a list sorted from lower to higher
241 preference, each element is a dictionary with the "ext"
243 * "data": The subtitles file contents
244 * "url": A URL pointing to the subtitles file
245 "ext" will be calculated from URL if missing
246 automatic_captions: Like 'subtitles', used by the YoutubeIE for
247 automatically generated captions
248 duration: Length of the video in seconds, as an integer or float.
249 view_count: How many users have watched the video on the platform.
250 like_count: Number of positive ratings of the video
251 dislike_count: Number of negative ratings of the video
252 repost_count: Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
254 comment_count: Number of comments on the video
255 comments: A list of comments, each with one or more of the following
256 properties (all but one of text or html optional):
257 * "author" - human-readable name of the comment author
258 * "author_id" - user ID of the comment author
260 * "html" - Comment as HTML
261 * "text" - Plain text of the comment
262 * "timestamp" - UNIX timestamp of comment
263 * "parent" - ID of the comment this one is replying to.
264 Set to "root" to indicate that this is a
265 comment to the original video.
266 age_limit: Age restriction for the video, as an integer (years)
267 webpage_url: The URL to the video webpage, if given to youtube-dl it
268 should allow to get the same result again. (It will be set
269 by YoutubeDL if it's missing)
270 categories: A list of categories that the video falls in, for example
272 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
273 is_live: True, False, or None (=unknown). Whether this video is a
274 live stream that goes on instead of a fixed-length video.
275 start_time: Time in seconds where the reproduction should start, as
276 specified in the URL.
277 end_time: Time in seconds where the reproduction should end, as
278 specified in the URL.
279 chapters: A list of dictionaries, with the following entries:
280 * "start_time" - The start time of the chapter in seconds
281 * "end_time" - The end time of the chapter in seconds
282 * "title" (optional, string)
284 The following fields should only be used when the video belongs to some logical
287 chapter: Name or title of the chapter the video belongs to.
288 chapter_number: Number of the chapter the video belongs to, as an integer.
289 chapter_id: Id of the chapter the video belongs to, as a unicode string.
291 The following fields should only be used when the video is an episode of some
292 series, programme or podcast:
294 series: Title of the series or programme the video episode belongs to.
295 season: Title of the season the video episode belongs to.
296 season_number: Number of the season the video episode belongs to, as an integer.
297 season_id: Id of the season the video episode belongs to, as a unicode string.
298 episode: Title of the video episode. Unlike mandatory video title field,
299 this field should denote the exact title of the video episode
300 without any kind of decoration.
301 episode_number: Number of the video episode within a season, as an integer.
302 episode_id: Id of the video episode, as a unicode string.
304 The following fields should only be used when the media is a track or a part of
307 track: Title of the track.
308 track_number: Number of the track within an album or a disc, as an integer.
309 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
311 artist: Artist(s) of the track.
312 genre: Genre(s) of the track.
313 album: Title of the album the track belongs to.
314 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
315 album_artist: List of all artists appeared on the album (e.g.
316 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
318 disc_number: Number of the disc or other physical medium the track belongs to,
320 release_year: Year (YYYY) when the album was released.
322 Unless mentioned otherwise, the fields should be Unicode strings.
324 Unless mentioned otherwise, None is equivalent to absence of information.
327 _type "playlist" indicates multiple videos.
328 There must be a key "entries", which is a list, an iterable, or a PagedList
329 object, each element of which is a valid dictionary by this specification.
331 Additionally, playlists can have "id", "title", "description", "uploader",
332 "uploader_id", "uploader_url" attributes with the same semantics as videos
336 _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
338 It must have an entries key like a playlist and contain all the keys
339 required for a video at the same time.
342 _type "url" indicates that the video must be extracted from another
343 location, possibly by a different extractor. Its only required key is:
344 "url" - the next URL to extract.
345 The key "ie_key" can be set to the class name (minus the trailing "IE",
346 e.g. "Youtube") if the extractor class is known in advance.
347 Additionally, the dictionary may have any properties of the resolved entity
348 known in advance, for example "title" if the title of the referred video is
352 _type "url_transparent" entities have the same specification as "url", but
353 indicate that the given additional information is more precise than the one
354 associated with the resolved URL.
355 This is useful when a site employs a video service that hosts the video and
356 its technical metadata, but that video service does not embed a useful
357 title, description etc.
360 Subclasses of this one should re-define the _real_initialize() and
361 _real_extract() methods and define a _VALID_URL regexp.
362 Probably, they should also be added to the list of extractors.
364 _GEO_BYPASS attribute may be set to False in order to disable
365 geo restriction bypass mechanisms for a particular extractor.
366 Though it won't disable explicit geo restriction bypass based on
367 country code provided with geo_bypass_country.
369 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
370 countries for this extractor. One of these countries will be used by
371 geo restriction bypass mechanism right away in order to bypass
372 geo restriction, of course, if the mechanism is not disabled.
374 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
375 IP blocks in CIDR notation for this extractor. One of these IP blocks
376 will be used by geo restriction bypass mechanism similarly
379 Finally, the _WORKING attribute should be set to False for broken IEs
380 in order to warn the users and skip the tests.
385 _x_forwarded_for_ip = None
387 _GEO_COUNTRIES = None
388 _GEO_IP_BLOCKS = None
391 def __init__(self, downloader=None):
392 """Constructor. Receives an optional downloader."""
394 self._x_forwarded_for_ip = None
395 self.set_downloader(downloader)
398 def suitable(cls, url):
399 """Receives a URL and returns True if suitable for this IE."""
401 # This does not use has/getattr intentionally - we want to know whether
402 # we have cached the regexp for *this* class, whereas getattr would also
403 # match the superclass
404 if '_VALID_URL_RE' not in cls.__dict__:
405 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
406 return cls._VALID_URL_RE.match(url) is not None
409 def _match_id(cls, url):
410 if '_VALID_URL_RE' not in cls.__dict__:
411 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
412 m = cls._VALID_URL_RE.match(url)
414 return compat_str(m.group('id'))
418 """Getter method for _WORKING."""
421 def initialize(self):
422 """Initializes an instance (authentication, etc)."""
423 self._initialize_geo_bypass({
424 'countries': self._GEO_COUNTRIES,
425 'ip_blocks': self._GEO_IP_BLOCKS,
428 self._real_initialize()
431 def _initialize_geo_bypass(self, geo_bypass_context):
433 Initialize geo restriction bypass mechanism.
435 This method is used to initialize geo bypass mechanism based on faking
436 X-Forwarded-For HTTP header. A random country from provided country list
437 is selected and a random IP belonging to this country is generated. This
438 IP will be passed as X-Forwarded-For HTTP header in all subsequent
441 This method will be used for initial geo bypass mechanism initialization
442 during the instance initialization with _GEO_COUNTRIES and
445 You may also manually call it from extractor's code if geo bypass
446 information is not available beforehand (e.g. obtained during
447 extraction) or due to some other reason. In this case you should pass
448 this information in geo bypass context passed as first argument. It may
449 contain following fields:
451 countries: List of geo unrestricted countries (similar
453 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
454 (similar to _GEO_IP_BLOCKS)
457 if not self._x_forwarded_for_ip:
459 # Geo bypass mechanism is explicitly disabled by user
460 if not self._downloader.params.get('geo_bypass', True):
463 if not geo_bypass_context:
464 geo_bypass_context = {}
466 # Backward compatibility: previously _initialize_geo_bypass
467 # expected a list of countries, some 3rd party code may still use
469 if isinstance(geo_bypass_context, (list, tuple)):
470 geo_bypass_context = {
471 'countries': geo_bypass_context,
474 # The whole point of geo bypass mechanism is to fake IP
475 # as X-Forwarded-For HTTP header based on some IP block or
478 # Path 1: bypassing based on IP block in CIDR notation
480 # Explicit IP block specified by user, use it right away
481 # regardless of whether extractor is geo bypassable or not
482 ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
484 # Otherwise use random IP block from geo bypass context but only
485 # if extractor is known as geo bypassable
487 ip_blocks = geo_bypass_context.get('ip_blocks')
488 if self._GEO_BYPASS and ip_blocks:
489 ip_block = random.choice(ip_blocks)
492 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
493 if self._downloader.params.get('verbose', False):
494 self._downloader.to_screen(
495 '[debug] Using fake IP %s as X-Forwarded-For.'
496 % self._x_forwarded_for_ip)
499 # Path 2: bypassing based on country code
501 # Explicit country code specified by user, use it right away
502 # regardless of whether extractor is geo bypassable or not
503 country = self._downloader.params.get('geo_bypass_country', None)
505 # Otherwise use random country code from geo bypass context but
506 # only if extractor is known as geo bypassable
508 countries = geo_bypass_context.get('countries')
509 if self._GEO_BYPASS and countries:
510 country = random.choice(countries)
513 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
514 if self._downloader.params.get('verbose', False):
515 self._downloader.to_screen(
516 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
517 % (self._x_forwarded_for_ip, country.upper()))
519 def extract(self, url):
520 """Extracts URL information and returns it in list of dicts."""
525 ie_result = self._real_extract(url)
526 if self._x_forwarded_for_ip:
527 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
529 except GeoRestrictedError as e:
530 if self.__maybe_fake_ip_and_retry(e.countries):
533 except ExtractorError:
535 except compat_http_client.IncompleteRead as e:
536 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
537 except (KeyError, StopIteration) as e:
538 raise ExtractorError('An extractor error has occurred.', cause=e)
540 def __maybe_fake_ip_and_retry(self, countries):
541 if (not self._downloader.params.get('geo_bypass_country', None) and
543 self._downloader.params.get('geo_bypass', True) and
544 not self._x_forwarded_for_ip and
546 country_code = random.choice(countries)
547 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
548 if self._x_forwarded_for_ip:
550 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
551 % (self._x_forwarded_for_ip, country_code.upper()))
555 def set_downloader(self, downloader):
556 """Sets the downloader for this IE."""
557 self._downloader = downloader
559 def _real_initialize(self):
560 """Real initialization process. Redefine in subclasses."""
563 def _real_extract(self, url):
564 """Real extraction process. Redefine in subclasses."""
569 """A string for getting the InfoExtractor with get_info_extractor"""
570 return compat_str(cls.__name__[:-2])
574 return compat_str(type(self).__name__[:-2])
577 def __can_accept_status_code(err, expected_status):
578 assert isinstance(err, compat_urllib_error.HTTPError)
579 if expected_status is None:
581 if isinstance(expected_status, compat_integer_types):
582 return err.code == expected_status
583 elif isinstance(expected_status, (list, tuple)):
584 return err.code in expected_status
585 elif callable(expected_status):
586 return expected_status(err.code) is True
590 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
592 Return the response handle.
594 See _download_webpage docstring for arguments specification.
597 self.report_download_webpage(video_id)
598 elif note is not False:
600 self.to_screen('%s' % (note,))
602 self.to_screen('%s: %s' % (video_id, note))
604 # Some sites check X-Forwarded-For HTTP header in order to figure out
605 # the origin of the client behind proxy. This allows bypassing geo
606 # restriction by faking this header's value to IP that belongs to some
607 # geo unrestricted country. We will do so once we encounter any
608 # geo restriction error.
609 if self._x_forwarded_for_ip:
610 if 'X-Forwarded-For' not in headers:
611 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
613 if isinstance(url_or_request, compat_urllib_request.Request):
614 url_or_request = update_Request(
615 url_or_request, data=data, headers=headers, query=query)
618 url_or_request = update_url_query(url_or_request, query)
619 if data is not None or headers:
620 url_or_request = sanitized_Request(url_or_request, data, headers)
622 return self._downloader.urlopen(url_or_request)
623 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
624 if isinstance(err, compat_urllib_error.HTTPError):
625 if self.__can_accept_status_code(err, expected_status):
626 # Retain reference to error to prevent file object from
627 # being closed before it can be read. Works around the
628 # effects of <https://bugs.python.org/issue15002>
629 # introduced in Python 3.4.1.
636 errnote = 'Unable to download webpage'
638 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
640 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
642 self._downloader.report_warning(errmsg)
645 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
647 Return a tuple (page content as string, URL handle).
649 See _download_webpage docstring for arguments specification.
651 # Strip hashes from the URL (#1038)
652 if isinstance(url_or_request, (compat_str, str)):
653 url_or_request = url_or_request.partition('#')[0]
655 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
659 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
660 return (content, urlh)
663 def _guess_encoding_from_content(content_type, webpage_bytes):
664 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
666 encoding = m.group(1)
668 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
669 webpage_bytes[:1024])
671 encoding = m.group(1).decode('ascii')
672 elif webpage_bytes.startswith(b'\xff\xfe'):
679 def __check_blocked(self, content):
680 first_block = content[:512]
681 if ('<title>Access to this site is blocked</title>' in content and
682 'Websense' in first_block):
683 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
684 blocked_iframe = self._html_search_regex(
685 r'<iframe src="([^"]+)"', content,
686 'Websense information URL', default=None)
688 msg += ' Visit %s for more details' % blocked_iframe
689 raise ExtractorError(msg, expected=True)
690 if '<title>The URL you requested has been blocked</title>' in first_block:
692 'Access to this webpage has been blocked by Indian censorship. '
693 'Use a VPN or proxy server (with --proxy) to route around it.')
694 block_msg = self._html_search_regex(
695 r'</h1><p>(.*?)</p>',
696 content, 'block message', default=None)
698 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
699 raise ExtractorError(msg, expected=True)
700 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
701 'blocklist.rkn.gov.ru' in content):
702 raise ExtractorError(
703 'Access to this webpage has been blocked by decision of the Russian government. '
704 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
707 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
708 content_type = urlh.headers.get('Content-Type', '')
709 webpage_bytes = urlh.read()
710 if prefix is not None:
711 webpage_bytes = prefix + webpage_bytes
713 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
714 if self._downloader.params.get('dump_intermediate_pages', False):
715 self.to_screen('Dumping request to ' + urlh.geturl())
716 dump = base64.b64encode(webpage_bytes).decode('ascii')
717 self._downloader.to_screen(dump)
718 if self._downloader.params.get('write_pages', False):
719 basen = '%s_%s' % (video_id, urlh.geturl())
721 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
722 basen = basen[:240 - len(h)] + h
723 raw_filename = basen + '.dump'
724 filename = sanitize_filename(raw_filename, restricted=True)
725 self.to_screen('Saving request to ' + filename)
726 # Working around MAX_PATH limitation on Windows (see
727 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
728 if compat_os_name == 'nt':
729 absfilepath = os.path.abspath(filename)
730 if len(absfilepath) > 259:
731 filename = '\\\\?\\' + absfilepath
732 with open(filename, 'wb') as outf:
733 outf.write(webpage_bytes)
736 content = webpage_bytes.decode(encoding, 'replace')
738 content = webpage_bytes.decode('utf-8', 'replace')
740 self.__check_blocked(content)
744 def _download_webpage(
745 self, url_or_request, video_id, note=None, errnote=None,
746 fatal=True, tries=1, timeout=5, encoding=None, data=None,
747 headers={}, query={}, expected_status=None):
749 Return the data of the page as a string.
752 url_or_request -- plain text URL as a string or
753 a compat_urllib_request.Requestobject
754 video_id -- Video/playlist/item identifier (string)
757 note -- note printed before downloading (string)
758 errnote -- note printed in case of an error (string)
759 fatal -- flag denoting whether error should be considered fatal,
760 i.e. whether it should cause ExtractionError to be raised,
761 otherwise a warning will be reported and extraction continued
762 tries -- number of tries
763 timeout -- sleep interval between tries
764 encoding -- encoding for a page content decoding, guessed automatically
765 when not explicitly specified
766 data -- POST data (bytes)
767 headers -- HTTP headers (dict)
768 query -- URL query (dict)
769 expected_status -- allows to accept failed HTTP requests (non 2xx
770 status code) by explicitly specifying a set of accepted status
771 codes. Can be any of the following entities:
772 - an integer type specifying an exact failed status code to
774 - a list or a tuple of integer types specifying a list of
775 failed status codes to accept
776 - a callable accepting an actual failed status code and
777 returning True if it should be accepted
778 Note that this argument does not affect success status codes (2xx)
779 which are always accepted.
784 while success is False:
786 res = self._download_webpage_handle(
787 url_or_request, video_id, note, errnote, fatal,
788 encoding=encoding, data=data, headers=headers, query=query,
789 expected_status=expected_status)
791 except compat_http_client.IncompleteRead as e:
793 if try_count >= tries:
795 self._sleep(timeout, video_id)
802 def _download_xml_handle(
803 self, url_or_request, video_id, note='Downloading XML',
804 errnote='Unable to download XML', transform_source=None,
805 fatal=True, encoding=None, data=None, headers={}, query={},
806 expected_status=None):
808 Return a tuple (xml as an compat_etree_Element, URL handle).
810 See _download_webpage docstring for arguments specification.
812 res = self._download_webpage_handle(
813 url_or_request, video_id, note, errnote, fatal=fatal,
814 encoding=encoding, data=data, headers=headers, query=query,
815 expected_status=expected_status)
818 xml_string, urlh = res
819 return self._parse_xml(
820 xml_string, video_id, transform_source=transform_source,
824 self, url_or_request, video_id,
825 note='Downloading XML', errnote='Unable to download XML',
826 transform_source=None, fatal=True, encoding=None,
827 data=None, headers={}, query={}, expected_status=None):
829 Return the xml as an compat_etree_Element.
831 See _download_webpage docstring for arguments specification.
833 res = self._download_xml_handle(
834 url_or_request, video_id, note=note, errnote=errnote,
835 transform_source=transform_source, fatal=fatal, encoding=encoding,
836 data=data, headers=headers, query=query,
837 expected_status=expected_status)
838 return res if res is False else res[0]
840 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
842 xml_string = transform_source(xml_string)
844 return compat_etree_fromstring(xml_string.encode('utf-8'))
845 except compat_xml_parse_error as ve:
846 errmsg = '%s: Failed to parse XML ' % video_id
848 raise ExtractorError(errmsg, cause=ve)
850 self.report_warning(errmsg + str(ve))
852 def _download_json_handle(
853 self, url_or_request, video_id, note='Downloading JSON metadata',
854 errnote='Unable to download JSON metadata', transform_source=None,
855 fatal=True, encoding=None, data=None, headers={}, query={},
856 expected_status=None):
858 Return a tuple (JSON object, URL handle).
860 See _download_webpage docstring for arguments specification.
862 res = self._download_webpage_handle(
863 url_or_request, video_id, note, errnote, fatal=fatal,
864 encoding=encoding, data=data, headers=headers, query=query,
865 expected_status=expected_status)
868 json_string, urlh = res
869 return self._parse_json(
870 json_string, video_id, transform_source=transform_source,
874 self, url_or_request, video_id, note='Downloading JSON metadata',
875 errnote='Unable to download JSON metadata', transform_source=None,
876 fatal=True, encoding=None, data=None, headers={}, query={},
877 expected_status=None):
879 Return the JSON object as a dict.
881 See _download_webpage docstring for arguments specification.
883 res = self._download_json_handle(
884 url_or_request, video_id, note=note, errnote=errnote,
885 transform_source=transform_source, fatal=fatal, encoding=encoding,
886 data=data, headers=headers, query=query,
887 expected_status=expected_status)
888 return res if res is False else res[0]
890 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
892 json_string = transform_source(json_string)
894 return json.loads(json_string)
895 except ValueError as ve:
896 errmsg = '%s: Failed to parse JSON ' % video_id
898 raise ExtractorError(errmsg, cause=ve)
900 self.report_warning(errmsg + str(ve))
902 def report_warning(self, msg, video_id=None):
903 idstr = '' if video_id is None else '%s: ' % video_id
904 self._downloader.report_warning(
905 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
907 def to_screen(self, msg):
908 """Print msg to screen, prefixing it with '[ie_name]'"""
909 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
911 def report_extraction(self, id_or_name):
912 """Report information extraction."""
913 self.to_screen('%s: Extracting information' % id_or_name)
915 def report_download_webpage(self, video_id):
916 """Report webpage download."""
917 self.to_screen('%s: Downloading webpage' % video_id)
919 def report_age_confirmation(self):
920 """Report attempt to confirm age."""
921 self.to_screen('Confirming age')
923 def report_login(self):
924 """Report attempt to log in."""
925 self.to_screen('Logging in')
928 def raise_login_required(msg='This video is only available for registered users'):
929 raise ExtractorError(
930 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        """Raise GeoRestrictedError carrying the candidate country codes.

        `countries` is forwarded so extract() can retry with a faked
        X-Forwarded-For IP from one of them.
        NOTE(review): upstream marks this @staticmethod; the decorator is not
        visible in this chunk.
        """
        raise GeoRestrictedError(msg, countries=countries)
937 # Methods for following #608
def url_result(url, ie=None, video_id=None, video_title=None):
    """Returns a URL that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    # NOTE(review): upstream this is a @staticmethod; the decorator line is
    # missing from this extract.
    video_info = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    if video_id is not None:
        video_info['id'] = video_id
    if video_title is not None:
        video_info['title'] = video_title
    return video_info
# Build a playlist result from an iterable of regex matches, mapping each
# match to a url_result (optionally through `getter`).
# NOTE(review): extraction artifact — the wrapper collecting the generator
# below into `urls` (upstream: `urls = orderedSet(... for m in matches)`)
# is missing from this extract; code kept byte-identical.
951 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
953 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
955 return self.playlist_result(
956 urls, playlist_id=playlist_id, playlist_title=playlist_title)
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Returns a playlist"""
    # NOTE(review): upstream this is a @staticmethod; the decorator line is
    # missing from this extract.
    video_info = {
        '_type': 'playlist',
        'entries': entries,
    }
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    if playlist_description:
        video_info['description'] = playlist_description
    return video_info
# Core regex lookup helper: try one pattern (or each of a list of patterns)
# against `string` and return the first matching group; fall back to
# `default`, or raise/warn depending on `fatal`.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers and several interior lines (list-iteration branch, group
# handling, `return default`) are missing; code kept byte-identical.
971 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
973 Perform a regex search on the given string, using a single or a list of
974 patterns returning the first matching group.
975 In case of failure return a default value or raise a WARNING or a
976 RegexNotFoundError, depending on fatal, specifying the field name.
978 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
979 mobj = re.search(pattern, string, flags)
# presumably inside `else: for p in pattern:` — TODO confirm against upstream
982 mobj = re.search(p, string, flags)
# Colorize the field name (ANSI blue) on capable non-Windows terminals
986 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
987 _name = '\033[0;34m%s\033[0m' % name
993 # return the first matching group
994 return next(g for g in mobj.groups() if g is not None)
996 return mobj.group(group)
997 elif default is not NO_DEFAULT:
# presumably `return default` follows here — TODO confirm against upstream
1000 raise RegexNotFoundError('Unable to extract %s' % _name)
1002 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    # Falsy results (None / '') are passed through untouched.
    return clean_html(res).strip() if res else res
1015 def _get_netrc_login_info(self, netrc_machine=None):
1018 netrc_machine = netrc_machine or self._NETRC_MACHINE
1020 if self._downloader.params.get('usenetrc', False):
1022 info = netrc.netrc().authenticators(netrc_machine)
1023 if info is not None:
1027 raise netrc.NetrcParseError(
1028 'No authenticators for %s' % netrc_machine)
1029 except (IOError, netrc.NetrcParseError) as err:
1030 self._downloader.report_warning(
1031 'parsing .netrc: %s' % error_to_compat_str(err))
1033 return username, password
1035 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1037 Get the login info as (username, password)
1038 First look for the manually specified credentials using username_option
1039 and password_option as keys in params dictionary. If no such credentials
1040 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1042 If there's no info available, return (None, None)
1044 if self._downloader is None:
1047 downloader_params = self._downloader.params
1049 # Attempt to use provided username and password or .netrc data
1050 if downloader_params.get(username_option) is not None:
1051 username = downloader_params[username_option]
1052 password = downloader_params[password_option]
1054 username, password = self._get_netrc_login_info(netrc_machine)
1056 return username, password
1058 def _get_tfa_info(self, note='two-factor verification code'):
1060 Get the two-factor authentication info
1061 TODO - asking the user will be required for sms/phone verify
1062 currently just uses the command line option
1063 If there's no info available, return None
1065 if self._downloader is None:
1067 downloader_params = self._downloader.params
1069 if downloader_params.get('twofactor') is not None:
1070 return downloader_params['twofactor']
1072 return compat_getpass('Type %s and press [Return]: ' % note)
1074 # Helper functions for extracting OpenGraph info
1076 def _og_regexes(prop):
1077 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1078 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1079 % {'prop': re.escape(prop)})
1080 template = r'<meta[^>]+?%s[^>]+?%s'
1082 template % (property_re, content_re),
1083 template % (content_re, property_re),
1087 def _meta_regex(prop):
1088 return r'''(?isx)<meta
1089 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1090 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1092 def _og_search_property(self, prop, html, name=None, **kargs):
1093 if not isinstance(prop, (list, tuple)):
1096 name = 'OpenGraph %s' % prop[0]
1099 og_regexes.extend(self._og_regexes(p))
1100 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1103 return unescapeHTML(escaped)
# Convenience wrapper: og:image lookup, non-fatal.
1105 def _og_search_thumbnail(self, html, **kargs):
1106 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
# Convenience wrapper: og:description lookup, non-fatal.
1108 def _og_search_description(self, html, **kargs):
1109 return self._og_search_property('description', html, fatal=False, **kargs)
# Convenience wrapper: og:title lookup (fatal unless the caller overrides).
1111 def _og_search_title(self, html, **kargs):
1112 return self._og_search_property('title', html, **kargs)
# Look up the og:video URL (also og:video:url); when `secure`, the
# og:video:secure_url variant is tried first.
# NOTE(review): extraction artifact — the `if secure:` guard line before
# the prepend below is missing from this extract; code kept byte-identical.
1114 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1115 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1117 regexes = self._og_regexes('video:secure_url') + regexes
1118 return self._html_search_regex(regexes, html, name, **kargs)
# Convenience wrapper: og:url lookup.
1120 def _og_search_url(self, html, **kargs):
1121 return self._og_search_property('url', html, **kargs)
1123 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1124 if not isinstance(name, (list, tuple)):
1126 if display_name is None:
1127 display_name = name[0]
1128 return self._html_search_regex(
1129 [self._meta_regex(n) for n in name],
1130 html, display_name, fatal=fatal, group='content', **kwargs)
# Dublin Core creator meta tag -> uploader field.
1132 def _dc_search_uploader(self, html):
1133 return self._html_search_meta('dc.creator', html, 'uploader')
1135 def _rta_search(self, html):
1136 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1137 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1138 r' content="RTA-5042-1996-1400-1577-RTA"',
# Map an ICRA 'rating' meta value to an age limit.
# NOTE(review): extraction artifact — the RATING_TABLE literal (original
# lines 1146-1156) is missing from this extract; code kept byte-identical.
1143 def _media_rating_search(self, html):
1144 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1145 rating = self._html_search_meta('rating', html)
1157 return RATING_TABLE.get(rating.lower())
# Map a schema.org isFamilyFriendly meta value to an age limit.
# NOTE(review): extraction artifact — the RATING_TABLE literal and early
# return (original lines 1165-1172) are missing; code kept byte-identical.
1159 def _family_friendly_search(self, html):
1160 # See http://schema.org/VideoObject
1161 family_friendly = self._html_search_meta(
1162 'isFamilyFriendly', html, default=None)
1164 if not family_friendly:
1173 return RATING_TABLE.get(family_friendly.lower())
# Twitter card player meta tag -> embedded player URL.
1175 def _twitter_search_player(self, html):
1176 return self._html_search_meta('twitter:player', html,
1177 'twitter card player')
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
    """Locate a JSON-LD <script> block in html and parse it into an info dict.

    Honours _search_regex-style `default` and `fatal` kwargs: when a
    `default` is supplied the lookup is implicitly non-fatal and the
    default (or {}) is returned when nothing usable is found.
    """
    json_ld = self._search_regex(
        JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
    default = kwargs.get('default', NO_DEFAULT)
    if not json_ld:
        # NO_DEFAULT is a sentinel object: compare by identity, not equality,
        # so a default value with a permissive __eq__ cannot break this check.
        return default if default is not NO_DEFAULT else {}
    # JSON-LD may be malformed and thus `fatal` should be respected.
    # At the same time `default` may be passed that assumes `fatal=False`
    # for _search_regex. Let's simulate the same behavior here as well.
    fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
    return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
# Turn a JSON-LD document (string or already-parsed structure) into an
# info-dict with title/description/timestamps/counts etc., dispatching on
# the schema.org @type of each entry.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; several structural lines (list wrapping, `info = {}`, loop over
# entries, dict `update(...)` openers, continue/break statements) are
# missing from this extract; code kept byte-identical.
1191 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1192 if isinstance(json_ld, compat_str):
1193 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1197 if not isinstance(json_ld, (list, tuple, dict)):
1199 if isinstance(json_ld, dict):
# Maps schema.org InteractionCounter types onto info-dict count kinds.
1202 INTERACTION_TYPE_MAP = {
1203 'CommentAction': 'comment',
1204 'AgreeAction': 'like',
1205 'DisagreeAction': 'dislike',
1206 'LikeAction': 'like',
1207 'DislikeAction': 'dislike',
1208 'ListenAction': 'view',
1209 'WatchAction': 'view',
1210 'ViewAction': 'view',
1213 def extract_interaction_statistic(e):
1214 interaction_statistic = e.get('interactionStatistic')
1215 if not isinstance(interaction_statistic, list):
1217 for is_e in interaction_statistic:
1218 if not isinstance(is_e, dict):
1220 if is_e.get('@type') != 'InteractionCounter':
1222 interaction_type = is_e.get('interactionType')
1223 if not isinstance(interaction_type, compat_str):
1225 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1226 if interaction_count is None:
1228 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1231 count_key = '%s_count' % count_kind
1232 if info.get(count_key) is not None:
1234 info[count_key] = interaction_count
1236 def extract_video_object(e):
1237 assert e['@type'] == 'VideoObject'
1239 'url': url_or_none(e.get('contentUrl')),
1240 'title': unescapeHTML(e.get('name')),
1241 'description': unescapeHTML(e.get('description')),
1242 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1243 'duration': parse_duration(e.get('duration')),
1244 'timestamp': unified_timestamp(e.get('uploadDate')),
1245 'filesize': float_or_none(e.get('contentSize')),
1246 'tbr': int_or_none(e.get('bitrate')),
1247 'width': int_or_none(e.get('width')),
1248 'height': int_or_none(e.get('height')),
1249 'view_count': int_or_none(e.get('interactionCount')),
1251 extract_interaction_statistic(e)
# Only entries declaring the schema.org context are considered.
1254 if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1255 item_type = e.get('@type')
1256 if expected_type is not None and expected_type != item_type:
1258 if item_type in ('TVEpisode', 'Episode'):
1259 episode_name = unescapeHTML(e.get('name'))
1261 'episode': episode_name,
1262 'episode_number': int_or_none(e.get('episodeNumber')),
1263 'description': unescapeHTML(e.get('description')),
1265 if not info.get('title') and episode_name:
1266 info['title'] = episode_name
1267 part_of_season = e.get('partOfSeason')
1268 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1270 'season': unescapeHTML(part_of_season.get('name')),
1271 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1273 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1274 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1275 info['series'] = unescapeHTML(part_of_series.get('name'))
1276 elif item_type == 'Movie':
1278 'title': unescapeHTML(e.get('name')),
1279 'description': unescapeHTML(e.get('description')),
1280 'duration': parse_duration(e.get('duration')),
1281 'timestamp': unified_timestamp(e.get('dateCreated')),
1283 elif item_type in ('Article', 'NewsArticle'):
1285 'timestamp': parse_iso8601(e.get('datePublished')),
1286 'title': unescapeHTML(e.get('headline')),
1287 'description': unescapeHTML(e.get('articleBody')),
1289 elif item_type == 'VideoObject':
1290 extract_video_object(e)
1292 video = e.get('video')
1293 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1294 extract_video_object(video)
# Drop all None-valued fields from the assembled result.
1296 return dict((k, v) for k, v in info.items() if v is not None)
# Collect name->value pairs of hidden/submit <input> tags from html
# (comments stripped first so commented-out inputs are ignored).
# NOTE(review): extraction artifact — the `hidden_inputs = {}` initializer
# and a couple of guard/continue lines are missing; code kept byte-identical.
# Upstream this is a @staticmethod.
1299 def _hidden_inputs(html):
1300 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1302 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1303 attrs = extract_attributes(input)
1306 if attrs.get('type') not in ('hidden', 'submit'):
1308 name = attrs.get('name') or attrs.get('id')
1309 value = attrs.get('value')
1310 if name and value is not None:
1311 hidden_inputs[name] = value
1312 return hidden_inputs
1314 def _form_hidden_inputs(self, form_id, html):
1315 form = self._search_regex(
1316 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1317 html, '%s form' % form_id, group='form')
1318 return self._hidden_inputs(form)
# Sort `formats` in place from worst to best quality using a composite key
# (protocol, language/quality/tbr/filesize/resolution, codec and extension
# preferences, ...), or by the caller-supplied field_preference tuple.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; many structural lines (the `if not formats:` guard, loop headers,
# try/except around ORDER.index, parts of the key tuple) are missing from
# this extract; code kept byte-identical.
1320 def _sort_formats(self, formats, field_preference=None):
1322 raise ExtractorError('No video formats found')
1325 # Automatically determine tbr when missing based on abr and vbr (improves
1326 # formats sorting in some cases)
1327 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1328 f['tbr'] = f['abr'] + f['vbr']
1330 def _formats_key(f):
1331 # TODO remove the following workaround
1332 from ..utils import determine_ext
1333 if not f.get('ext') and 'url' in f:
1334 f['ext'] = determine_ext(f['url'])
1336 if isinstance(field_preference, (list, tuple)):
1339 if f.get(field) is not None
1340 else ('' if field == 'format_id' else -1)
1341 for field in field_preference)
1343 preference = f.get('preference')
1344 if preference is None:
1346 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1349 protocol = f.get('protocol') or determine_protocol(f)
1350 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1352 if f.get('vcodec') == 'none': # audio only
1354 if self._downloader.params.get('prefer_free_formats'):
1355 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1357 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1360 audio_ext_preference = ORDER.index(f['ext'])
1362 audio_ext_preference = -1
1364 if f.get('acodec') == 'none': # video only
1366 if self._downloader.params.get('prefer_free_formats'):
1367 ORDER = ['flv', 'mp4', 'webm']
1369 ORDER = ['webm', 'flv', 'mp4']
1371 ext_preference = ORDER.index(f['ext'])
1374 audio_ext_preference = 0
# Composite sort key: None-valued fields sort as -1 ('' for format_id).
1378 f.get('language_preference') if f.get('language_preference') is not None else -1,
1379 f.get('quality') if f.get('quality') is not None else -1,
1380 f.get('tbr') if f.get('tbr') is not None else -1,
1381 f.get('filesize') if f.get('filesize') is not None else -1,
1382 f.get('vbr') if f.get('vbr') is not None else -1,
1383 f.get('height') if f.get('height') is not None else -1,
1384 f.get('width') if f.get('width') is not None else -1,
1387 f.get('abr') if f.get('abr') is not None else -1,
1388 audio_ext_preference,
1389 f.get('fps') if f.get('fps') is not None else -1,
1390 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1391 f.get('source_preference') if f.get('source_preference') is not None else -1,
1392 f.get('format_id') if f.get('format_id') is not None else '',
1394 formats.sort(key=_formats_key)
1396 def _check_formats(self, formats, video_id):
1398 formats[:] = filter(
1399 lambda f: self._is_valid_url(
1401 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1405 def _remove_duplicate_formats(formats):
1409 if f['url'] not in format_urls:
1410 format_urls.add(f['url'])
1411 unique_formats.append(f)
1412 formats[:] = unique_formats
# Probe `url` with a HEAD-style request and report whether it is reachable;
# non-HTTP(S) URLs are assumed valid.
# NOTE(review): extraction artifact — the `return True` lines, the `try:`
# opener, the to_screen call opener, `return False` and the re-raise are
# missing from this extract; code kept byte-identical. The mutable default
# `headers={}` is never mutated here, so it is harmless as written.
1414 def _is_valid_url(self, url, video_id, item='video', headers={}):
1415 url = self._proto_relative_url(url, scheme='http:')
1416 # For now assume non HTTP(S) URLs always valid
1417 if not (url.startswith('http://') or url.startswith('https://')):
1420 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1422 except ExtractorError as e:
1423 if isinstance(e.cause, compat_urllib_error.URLError):
1425 '%s: %s URL is invalid, skipping' % (video_id, item))
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    prefer_insecure = self._downloader.params.get('prefer_insecure', False)
    return 'http:' if prefer_insecure else 'https:'
1436 def _proto_relative_url(self, url, scheme=None):
1439 if url.startswith('//'):
1441 scheme = self.http_scheme()
1446 def _sleep(self, timeout, video_id, msg_template=None):
1447 if msg_template is None:
1448 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1449 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
# Download an Adobe HDS (f4m) manifest and delegate parsing to
# _parse_f4m_formats; returns [] via the `manifest is False` branch when a
# non-fatal download failed.
# NOTE(review): extraction artifact — the closing args of _download_xml
# (fatal=..., original lines 1462-1463) and the early-return body are
# missing from this extract; code kept byte-identical.
1453 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1454 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1455 fatal=True, m3u8_id=None):
1456 manifest = self._download_xml(
1457 manifest_url, video_id, 'Downloading f4m manifest',
1458 'Unable to download f4m manifest',
1459 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1460 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1461 transform_source=transform_source,
1464 if manifest is False:
1467 return self._parse_f4m_formats(
1468 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1469 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
# Parse an already-downloaded f4m (HDS) manifest element into a list of
# format dicts, recursing into stream-level manifests referenced by
# set-level ones and falling back to HLS for .m3u8 media entries.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; many structural lines (early returns, `formats = []`, loop
# continues, dict openers/closers, width/height/vcodec fields) are missing
# from this extract; code kept byte-identical.
1471 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1472 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1473 fatal=True, m3u8_id=None):
1474 if not isinstance(manifest, compat_etree_Element) and not fatal:
1477 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1478 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1479 if akamai_pv is not None and ';' in akamai_pv.text:
1480 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1481 if playerVerificationChallenge.strip() != '':
1485 manifest_version = '1.0'
1486 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1488 manifest_version = '2.0'
1489 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1490 # Remove unsupported DRM protected media from final formats
1491 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1492 media_nodes = remove_encrypted_media(media_nodes)
1496 manifest_base_url = get_base_url(manifest)
1498 bootstrap_info = xpath_element(
1499 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1500 'bootstrap info', default=None)
1503 mime_type = xpath_text(
1504 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1505 'base URL', default=None)
1506 if mime_type and mime_type.startswith('audio/'):
1509 for i, media_el in enumerate(media_nodes):
1510 tbr = int_or_none(media_el.attrib.get('bitrate'))
1511 width = int_or_none(media_el.attrib.get('width'))
1512 height = int_or_none(media_el.attrib.get('height'))
1513 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1514 # If <bootstrapInfo> is present, the specified f4m is a
1515 # stream-level manifest, and only set-level manifests may refer to
1516 # external resources. See section 11.4 and section 4 of F4M spec
1517 if bootstrap_info is None:
1519 # @href is introduced in 2.0, see section 11.6 of F4M spec
1520 if manifest_version == '2.0':
1521 media_url = media_el.attrib.get('href')
1522 if media_url is None:
1523 media_url = media_el.attrib.get('url')
1527 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1528 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1529 # If media_url is itself a f4m manifest do the recursive extraction
1530 # since bitrates in parent manifest (this one) and media_url manifest
1531 # may differ leading to inability to resolve the format by requested
1532 # bitrate in f4m downloader
1533 ext = determine_ext(manifest_url)
1535 f4m_formats = self._extract_f4m_formats(
1536 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1537 transform_source=transform_source, fatal=fatal)
1538 # Sometimes stream-level manifest contains single media entry that
1539 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1540 # At the same time parent's media entry in set-level manifest may
1541 # contain it. We will copy it from parent in such cases.
1542 if len(f4m_formats) == 1:
1545 'tbr': f.get('tbr') or tbr,
1546 'width': f.get('width') or width,
1547 'height': f.get('height') or height,
1548 'format_id': f.get('format_id') if not tbr else format_id,
1551 formats.extend(f4m_formats)
1554 formats.extend(self._extract_m3u8_formats(
1555 manifest_url, video_id, 'mp4', preference=preference,
1556 m3u8_id=m3u8_id, fatal=fatal))
1559 'format_id': format_id,
1560 'url': manifest_url,
1561 'manifest_url': manifest_url,
1562 'ext': 'flv' if bootstrap_info is not None else None,
1568 'preference': preference,
1572 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1574 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1578 'preference': preference - 100 if preference else -100,
1579 'resolution': 'multiple',
1580 'format_note': 'Quality selection URL',
# Download an HLS (m3u8) playlist and delegate parsing to
# _parse_m3u8_formats; the final playlist URL is taken from the response
# handle so redirects are honoured.
# NOTE(review): extraction artifact — the _download_webpage_handle url arg
# line, its fatal kwarg and the `res is False` early return (original lines
# 1588, 1591-1595, 1598) are missing; code kept byte-identical.
1583 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1584 entry_protocol='m3u8', preference=None,
1585 m3u8_id=None, note=None, errnote=None,
1586 fatal=True, live=False):
1587 res = self._download_webpage_handle(
1589 note=note or 'Downloading m3u8 information',
1590 errnote=errnote or 'Failed to download m3u8 information',
1596 m3u8_doc, urlh = res
1597 m3u8_url = urlh.geturl()
1599 return self._parse_m3u8_formats(
1600 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1601 preference=preference, m3u8_id=m3u8_id, live=live)
# Parse an HLS playlist document: media playlists are returned as a single
# format; master playlists are expanded into one format per
# EXT-X-STREAM-INF variant plus EXT-X-MEDIA renditions.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; many structural lines (DRM early returns, `formats = []`,
# `groups = {}`, dict openers/closers, continue statements, codec
# assignment) are missing from this extract; code kept byte-identical.
1603 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1604 entry_protocol='m3u8', preference=None,
1605 m3u8_id=None, live=False):
1606 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1609 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1614 format_url = lambda u: (
1616 if re.match(r'^https?://', u)
1617 else compat_urlparse.urljoin(m3u8_url, u))
1620 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1621 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1622 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1624 # We should try extracting formats only from master playlists [1, 4.3.4],
1625 # i.e. playlists that describe available qualities. On the other hand
1626 # media playlists [1, 4.3.3] should be returned as is since they contain
1627 # just the media without qualities renditions.
1628 # Fortunately, master playlist can be easily distinguished from media
1629 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1630 # master playlist tags MUST NOT appear in a media playist and vice versa.
1631 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1632 # media playlist and MUST NOT appear in master playlist thus we can
1633 # clearly detect media playlist with this criterion.
1635 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1638 'format_id': m3u8_id,
1640 'protocol': entry_protocol,
1641 'preference': preference,
1645 last_stream_inf = {}
1647 def extract_media(x_media_line):
1648 media = parse_m3u8_attributes(x_media_line)
1649 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1650 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1651 if not (media_type and group_id and name):
1653 groups.setdefault(group_id, []).append(media)
1654 if media_type not in ('VIDEO', 'AUDIO'):
1656 media_url = media.get('URI')
1659 for v in (m3u8_id, group_id, name):
1663 'format_id': '-'.join(format_id),
1664 'url': format_url(media_url),
1665 'manifest_url': m3u8_url,
1666 'language': media.get('LANGUAGE'),
1668 'protocol': entry_protocol,
1669 'preference': preference,
1671 if media_type == 'AUDIO':
1672 f['vcodec'] = 'none'
1675 def build_stream_name():
1676 # Despite specification does not mention NAME attribute for
1677 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1678 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1679 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1680 stream_name = last_stream_inf.get('NAME')
1683 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1684 # from corresponding rendition group
1685 stream_group_id = last_stream_inf.get('VIDEO')
1686 if not stream_group_id:
1688 stream_group = groups.get(stream_group_id)
1689 if not stream_group:
1690 return stream_group_id
1691 rendition = stream_group[0]
1692 return rendition.get('NAME') or stream_group_id
1694 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
1695 # chance to detect video only formats when EXT-X-STREAM-INF tags
1696 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
1697 for line in m3u8_doc.splitlines():
1698 if line.startswith('#EXT-X-MEDIA:'):
1701 for line in m3u8_doc.splitlines():
1702 if line.startswith('#EXT-X-STREAM-INF:'):
1703 last_stream_inf = parse_m3u8_attributes(line)
1704 elif line.startswith('#') or not line.strip():
1707 tbr = float_or_none(
1708 last_stream_inf.get('AVERAGE-BANDWIDTH') or
1709 last_stream_inf.get('BANDWIDTH'), scale=1000)
1712 format_id.append(m3u8_id)
1713 stream_name = build_stream_name()
1714 # Bandwidth of live streams may differ over time thus making
1715 # format_id unpredictable. So it's better to keep provided
1718 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1719 manifest_url = format_url(line.strip())
1721 'format_id': '-'.join(format_id),
1722 'url': manifest_url,
1723 'manifest_url': m3u8_url,
1726 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1727 'protocol': entry_protocol,
1728 'preference': preference,
1730 resolution = last_stream_inf.get('RESOLUTION')
1732 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1734 f['width'] = int(mobj.group('width'))
1735 f['height'] = int(mobj.group('height'))
1736 # Unified Streaming Platform
1738 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1740 abr, vbr = mobj.groups()
1741 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1746 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1748 audio_group_id = last_stream_inf.get('AUDIO')
1749 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1750 # references a rendition group MUST have a CODECS attribute.
1751 # However, this is not always respected, for example, [2]
1752 # contains EXT-X-STREAM-INF tag which references AUDIO
1753 # rendition group but does not have CODECS and despite
1754 # referencing an audio group it represents a complete
1755 # (with audio and video) format. So, for such cases we will
1756 # ignore references to rendition groups and treat them
1757 # as complete formats.
1758 if audio_group_id and codecs and f.get('vcodec') != 'none':
1759 audio_group = groups.get(audio_group_id)
1760 if audio_group and audio_group[0].get('URI'):
1761 # TODO: update acodec for audio only formats with
1763 f['acodec'] = 'none'
1765 last_stream_inf = {}
1769 def _xpath_ns(path, namespace=None):
1773 for c in path.split('/'):
1774 if not c or c == '.':
1777 out.append('{%s}%s' % (namespace, c))
1778 return '/'.join(out)
1780 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1781 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1787 namespace = self._parse_smil_namespace(smil)
1789 return self._parse_smil_formats(
1790 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1792 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1793 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1796 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1798 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1799 return self._download_xml(
1800 smil_url, video_id, 'Downloading SMIL file',
1801 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
# Build the full info dict (formats, subtitles, title/description/
# upload_date, thumbnails) from a parsed SMIL document.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; initializers (title/description/upload_date), the _sort_formats
# call, thumbnail list opener and the result-dict opener are missing from
# this extract; code kept byte-identical.
1803 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1804 namespace = self._parse_smil_namespace(smil)
1806 formats = self._parse_smil_formats(
1807 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1808 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
# Fallback video id derived from the SMIL file name.
1810 video_id = os.path.splitext(url_basename(smil_url))[0]
1814 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1815 name = meta.attrib.get('name')
1816 content = meta.attrib.get('content')
1817 if not name or not content:
1819 if not title and name == 'title':
1821 elif not description and name in ('description', 'abstract'):
1822 description = content
1823 elif not upload_date and name == 'date':
1824 upload_date = unified_strdate(content)
1827 'id': image.get('type'),
1828 'url': image.get('src'),
1829 'width': int_or_none(image.get('width')),
1830 'height': int_or_none(image.get('height')),
1831 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1835 'title': title or video_id,
1836 'description': description,
1837 'upload_date': upload_date,
1838 'thumbnails': thumbnails,
1840 'subtitles': subtitles,
1843 def _parse_smil_namespace(self, smil):
1844 return self._search_regex(
1845 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
# Walk the <video>/<audio> media entries of a SMIL document and build
# format dicts, dispatching per protocol/extension: RTMP, HLS, HDS, DASH,
# MSS and plain HTTP.
# NOTE(review): extraction artifact — leading numerals are original line
# numbers; many structural lines (base/srcs initializers, counters, dict
# openers/closers, the f4m query preparation) are missing from this
# extract; code kept byte-identical.
1847 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1849 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1850 b = meta.get('base') or meta.get('httpBase')
1861 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1862 for medium in media:
1863 src = medium.get('src')
1864 if not src or src in srcs:
1868 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1869 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1870 width = int_or_none(medium.get('width'))
1871 height = int_or_none(medium.get('height'))
1872 proto = medium.get('proto')
1873 ext = medium.get('ext')
1874 src_ext = determine_ext(src)
1875 streamer = medium.get('streamer') or base
1877 if proto == 'rtmp' or streamer.startswith('rtmp'):
1883 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1885 'filesize': filesize,
1889 if transform_rtmp_url:
1890 streamer, src = transform_rtmp_url(streamer, src)
1891 formats[-1].update({
1897 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1898 src_url = src_url.strip()
1900 if proto == 'm3u8' or src_ext == 'm3u8':
1901 m3u8_formats = self._extract_m3u8_formats(
1902 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1903 if len(m3u8_formats) == 1:
1905 m3u8_formats[0].update({
1906 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1911 formats.extend(m3u8_formats)
1912 elif src_ext == 'f4m':
1917 'plugin': 'flowplayer-3.2.0.1',
1919 f4m_url += '&' if '?' in f4m_url else '?'
1920 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1921 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1922 elif src_ext == 'mpd':
1923 formats.extend(self._extract_mpd_formats(
1924 src_url, video_id, mpd_id='dash', fatal=False))
1925 elif re.search(r'\.ism/[Mm]anifest', src_url):
1926 formats.extend(self._extract_ism_formats(
1927 src_url, video_id, ism_id='mss', fatal=False))
1928 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1932 'ext': ext or src_ext or 'flv',
1933 'format_id': 'http-%d' % (bitrate or http_count),
1935 'filesize': filesize,
# Collect subtitle tracks from the <textstream> elements of a SMIL
# document, grouped by language.
# NOTE(review): extraction artifact — the `urls`/`subtitles` initializers,
# continue statements and the appended dict's body/return are missing from
# this extract; code kept byte-identical.
1942 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1945 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1946 src = textstream.get('src')
1947 if not src or src in urls:
1950 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1951 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1952 subtitles.setdefault(lang, []).append({
    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        # Download an XSPF playlist document and delegate to _parse_xspf().
        # NOTE(review): the failed-download guard between the two calls is
        # elided in this excerpt.
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        # NOTE(review): 'xpsf' in the progress note above looks like a typo
        # for 'xspf' (user-visible message only).
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        # Turn an XSPF playlist document into a list of entries with formats.
        # NS_MAP maps the XSPF core namespace and the StreamOne extension
        # (s1:) used for per-location width/height/label metadata.
        # NOTE(review): some original lines are elided in this excerpt.
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # XSPF durations are expressed in milliseconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                    'manifest_url': xspf_url,
                    # The s1: attributes come from the StreamOne extension.
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            self._sort_formats(formats)
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        # Download a DASH MPD manifest and parse its formats.
        # NOTE(review): `formats_dict={}` is a mutable default argument; it
        # is passed through read-only, but a None default would be safer.
        # NOTE(review): the failed-download guard and the unpacking of `res`
        # into (mpd_doc, urlh) are elided in this excerpt.
        res = self._download_xml_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
        # Base URL for resolving relative BaseURL elements — taken from the
        # final (post-redirect) manifest URL.
        mpd_base_url = base_url(urlh.geturl())

        return self._parse_mpd_formats(
            mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
            formats_dict=formats_dict, mpd_url=mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        # NOTE(review): `formats_dict={}` is a mutable default argument; it
        # is only read from here, but a None default would be safer.
        # NOTE(review): some original lines are elided in this excerpt.
        # Live ("dynamic") manifests are not supported by this parser.
        if mpd_doc.get('type') == 'dynamic':

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        # Qualify an XPath with the manifest's XML namespace.
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Child elements inherit (and may override) the parent's
            # multisegment info — hence the copy.
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                        ms_info['total_number'] = 0
                            # @r is a repeat count: one S entry stands for 1+r segments.
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                        ms_info['initialization'] = initialization
                        extract_Initialization(segment_template)

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # DRM-protected renditions are skipped entirely.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type in ('video', 'audio'):
                        # Walk up the element hierarchy assembling BaseURL
                        # until an absolute URL is formed.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # The yt-namespaced contentLength attribute is YouTube-specific.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            # @bandwidth is bits/s; tbr is expressed in kbit/s.
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            # "no linguistic content"-style tags count as no language.
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            tmpl = representation_ms_info[template_name]
                            # First of, % characters outside $...$ templates
                            # must be escaped by doubling for proper processing
                            # by % operator string formatting used further (see
                            # https://github.com/ytdl-org/youtube-dl/issues/16867).
                                    in_template = not in_template
                                elif c == '%' and not in_template:
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): the result of str.replace below is
                            # discarded — as written this line is a no-op;
                            # presumably `t = t.replace('$$', '$')` was intended.
                            t.replace('$$', '$')

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/ytdl-org/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                                # $Time$ shall not be included for @initialization thus
                                # only $Bandwidth$ remains
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,

                        # Fragment dicts use 'url' for absolute and 'path' for
                        # relative locations.
                        def location_key(location):
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    # @t restarts the timeline; otherwise continue from previous.
                                    segment_time = s.get('t') or segment_time
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                # r+1 occurrences of each S entry consume URLs in order.
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/ytdl-org/youtube-dl/pull/14844
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                    location_key(segment_url): segment_url,
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # If there is a fragments key available then we correctly recognized fragmented media.
                        # Otherwise we will assume unfragmented media with direct access. Technically, such
                        # assumption is not necessarily correct since we may simply have no support for
                        # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                        if 'fragments' in representation_ms_info:
                                # NB: mpd_url may be empty when MPD manifest is parsed from a string
                                'url': mpd_url or base_url,
                                'fragment_base_url': base_url,
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                # Initialization fragment goes first, then media fragments.
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                            # Assuming direct URL to unfragmented media.
                        # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
                        # is not necessarily unique within a Period thus formats with
                        # the same `format_id` are quite possible. There are numerous examples
                        # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
                        # https://github.com/ytdl-org/youtube-dl/issues/13919)
                        full_info = formats_dict.get(representation_id, {}).copy()
                        formats.append(full_info)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
        # Download an ISM (Smooth Streaming) manifest and parse its formats.
        # NOTE(review): the failed-download guard and the unpacking of `res`
        # into (ism_doc, urlh) are elided in this excerpt.
        res = self._download_xml_handle(
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        # NOTE(review): some original lines are elided in this excerpt.
        # Live and DRM-protected streams are not supported.
        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:

        duration = int(ism_doc.attrib['Duration'])
        # Default timescale per [MS-SSTR] is 10,000,000 (100-ns units).
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio'):
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            for track in stream.findall('QualityLevel'):
                # AudioTag 255 denotes AAC; FourCC may then be absent.
                fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                # Bitrate attribute is in bits/s; tbr is kbit/s.
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        # Derive the duration from the next fragment's start time.
                        # NOTE(review): this indexes `stream_fragment` (an
                        # Element) rather than `stream_fragments` (the list)
                        # — verify this is intentional.
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        fragment_ctx['time'] += fragment_ctx['duration']

                    format_id.append(ism_id)
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))

                    'format_id': '-'.join(format_id),
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'fragments': fragments,
                    # Parameters consumed by the ISM downloader.
                    '_download_params': {
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        # Extract <video>/<audio> tags (and AMP equivalents) with their
        # <source> and <track> children from a webpage into entries.
        # NOTE(review): some original lines are elided in this excerpt.
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            if not content_type:
            # Split "type/subtype; codecs=..." into MIME type and codec list.
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)

        # NOTE(review): `type_info={}` is a mutable default argument; it is
        # only read from, but a None default would be safer.
        def _media_formats(src, cur_media_type, type_info={}):
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
                # Manifest URLs expand into multiple formats.
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                        # res attribute is not standard but seen several times
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        f.update(formats[0])
                        media_info['formats'].append(f)
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
            # Some servers require a Referer to serve the media.
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        # Derive both HDS (f4m) and HLS (m3u8) format lists from an Akamai
        # manifest URL, optionally rewriting the host per protocol.
        # NOTE(review): `hosts={}` is a mutable default argument; it is only
        # read from, but a None default would be safer.
        # NOTE(review): some original lines are elided in this excerpt.
        hdcore_sign = 'hdcore=3.7.0'
        # Akamai URL convention: /i/ serves HLS, /z/ serves HDS.
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # Propagate the hdcore signature to each fragment request.
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        # Probe the standard Wowza manifest endpoints (HLS, HDS, DASH, and
        # SMIL-derived RTMP/RTSP) for a stream URL.
        # NOTE(review): `skip_protocols=[]` is a mutable default argument;
        # it is only read from, but a None/() default would be safer.
        # NOTE(review): some original lines are elided in this excerpt.
        query = compat_urlparse.urlparse(url).query
        # Strip any explicit manifest suffix; it is re-added per protocol.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        # Preserve the secure-scheme marker ("s") when rebuilding as http(s).
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

        def manifest_url(manifest):
            m_url = '%s/%s' % (http_base_url, manifest)
                m_url += '?%s' % query

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    # RTSP takes the full path (url + play_path) as one URL.
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        # Locate a jwplayer("...").setup({...}) call in the page and parse
        # its options object; yields the parsed dict on success.
        # NOTE(review): some original lines are elided in this excerpt.
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 transform_source=transform_source)
            except ExtractorError:
                # Only a dict-shaped options object is usable.
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data
2609 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2610 jwplayer_data = self._find_jwplayer_data(
2611 webpage, video_id, transform_source=js_to_json)
2612 return self._parse_jwplayer_data(
2613 jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Convert a jwplayer setup/config dict into info-dict entries.
        # NOTE(review): some original lines are elided in this excerpt.
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                    # Only caption/subtitle tracks are collected.
                    if track_kind.lower() not in ('captions', 'subtitles'):
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)

                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                # A single YouTube URL is delegated to the YouTube extractor.
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Convert jwplayer "sources" entries into format dicts.
        # NOTE(review): some original lines are elided in this excerpt.
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            # De-duplicate by resolved URL.
            if not source_url or source_url in urls:
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                    'width': int_or_none(source.get('width')),
                    'tbr': int_or_none(source.get('bitrate')),
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                            'play_path': prefix + play_path,
                    a_format.update(rtmp_params)
                formats.append(a_format)
2747 def _live_title(self, name):
2748 """ Generate the title for a live video """
2749 now = datetime.datetime.now()
2750 now_str = now.strftime('%Y-%m-%d %H:%M')
2751 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        # Parse `v` as an integer; on failure either raise an ExtractorError
        # (fatal=True) or emit a downloader warning.
        # NOTE(review): some original lines (None check, branch keywords,
        # return) are elided in this excerpt.
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): stray debug print to stdout whenever get_attr is
            # passed — looks like leftover debugging; consider removing.
            print(getattr(v, kwargs['get_attr']))
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        raise ExtractorError(msg)
        self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        # Parse `v` as a float; on failure either raise an ExtractorError
        # (fatal=True) or emit a downloader warning.
        # NOTE(review): some original lines (None check, branch keywords,
        # return) are elided in this excerpt.
        res = float_or_none(v, **kwargs)
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        raise ExtractorError(msg)
        self._downloader.report_warning(msg)
2775 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2776 path='/', secure=False, discard=False, rest={}, **kwargs):
2777 cookie = compat_cookiejar.Cookie(
2778 0, name, value, port, port is not None, domain, True,
2779 domain.startswith('.'), path, True, secure, expire_time,
2780 discard, None, None, rest)
2781 self._downloader.cookiejar.set_cookie(cookie)
2783 def _get_cookies(self, url):
2784 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2785 req = sanitized_Request(url)
2786 self._downloader.cookiejar.add_cookie_header(req)
2787 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        # Yield this extractor's test cases, taken from _TEST or _TESTS.
        # NOTE(review): some original lines (else branch, loop header,
        # continue, yield) are elided in this excerpt.
        t = getattr(self, '_TEST', None)
        if t:
            # An extractor must define either _TEST or _TESTS, never both.
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            if not include_onlymatching and t.get('only_matching', False):
            # Test name is derived from the class name minus the 'IE' suffix.
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        # NOTE(review): some original lines are elided in this excerpt.
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # For playlist test cases, inspect the first entry instead.
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        # Suitable only if no test case was age-restricted.
        return not any_restricted
2818 def extract_subtitles(self, *args, **kwargs):
2819 if (self._downloader.params.get('writesubtitles', False) or
2820 self._downloader.params.get('listsubtitles')):
2821 return self._get_subtitles(*args, **kwargs)
2824 def _get_subtitles(self, *args, **kwargs):
2825 raise NotImplementedError('This method must be implemented by subclasses')
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        # NOTE(review): the decorator and the return statement are elided in
        # this excerpt.
        # Items from list1 win; list2 items are appended only for new URLs.
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        # NOTE(review): the decorator and the return statement are elided in
        # this excerpt.
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            # Per-language merge de-duplicates by URL (see _merge_subtitle_items).
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2844 def extract_automatic_captions(self, *args, **kwargs):
2845 if (self._downloader.params.get('writeautomaticsub', False) or
2846 self._downloader.params.get('listsubtitles')):
2847 return self._get_automatic_captions(*args, **kwargs)
2850 def _get_automatic_captions(self, *args, **kwargs):
2851 raise NotImplementedError('This method must be implemented by subclasses')
2853 def mark_watched(self, *args, **kwargs):
2854 if (self._downloader.params.get('mark_watched', False) and
2855 (self._get_login_info()[0] is not None or
2856 self._downloader.params.get('cookiefile') is not None)):
2857 self._mark_watched(*args, **kwargs)
2859 def _mark_watched(self, *args, **kwargs):
2860 raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        # Build request headers that route geo-verification traffic through
        # the user-configured proxy, if any.
        # NOTE(review): the initialisation and return of `headers` are
        # elided in this excerpt.
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
2869 def _generic_id(self, url):
2870 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2872 def _generic_title(self, url):
2873 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
2884 def _make_valid_url(cls):
2885 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2888 def suitable(cls, url):
2889 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Dispatch a search query of the form KEY[N|all]:terms to
        # _get_n_results with the requested number of results.
        # NOTE(review): some original lines (match guard, branch keywords,
        # int conversion of the prefix) are elided in this excerpt.
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # An empty prefix means a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp the request to the extractor's maximum, with a warning.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
2911 def _get_n_results(self, query, n):
2912 """Get a specified number of results for a query"""
2913 raise NotImplementedError('This method must be implemented by subclasses')
    def SEARCH_KEY(self):
        # Accessor exposing the class's search key (the URL scheme prefix).
        # NOTE(review): the @property decorator appears to be elided in this
        # excerpt.
        return self._SEARCH_KEY