1 from __future__ import unicode_literals
15 from ..compat import (
24 compat_etree_fromstring,
53 class InfoExtractor(object):
54 """Information Extractor class.
56 Information extractors are the classes that, given a URL, extract
57 information about the video (or videos) the URL refers to. This
58 information includes the real video URL, the video title, author and
59 others. The information is stored in a dictionary which is then
60 passed to the YoutubeDL. The YoutubeDL processes this
61 information possibly downloading the video to the file system, among
62 other possible outcomes.
64 The type field determines the type of the result.
65 By far the most common value (and the default if _type is missing) is
66 "video", which indicates a single video.
68 For a video, the dictionaries must include the following fields:
71 title: Video title, unescaped.
73 Additionally, it must contain either a formats entry or a url one:
75 formats: A list of dictionaries for each format available, ordered
76 from worst to best quality.
79 * url Mandatory. The URL of the video file
80 * ext Will be calculated from URL if missing
81 * format A human-readable description of the format
82 ("mp4 container with h264/opus").
83 Calculated from the format_id, width, height,
84 and format_note fields if missing.
85 * format_id A short description of the format
86 ("mp4_h264_opus" or "19").
87 Technically optional, but strongly recommended.
88 * format_note Additional info about the format
89 ("3D" or "DASH video")
90 * width Width of the video, if known
91 * height Height of the video, if known
92 * resolution Textual description of width and height
93 * tbr Average bitrate of audio and video in KBit/s
94 * abr Average audio bitrate in KBit/s
95 * acodec Name of the audio codec in use
96 * asr Audio sampling rate in Hertz
97 * vbr Average video bitrate in KBit/s
99 * vcodec Name of the video codec in use
100 * container Name of the container format
101 * filesize The number of bytes, if known in advance
102 * filesize_approx An estimate for the number of bytes
103 * player_url SWF Player URL (used for rtmpdump).
104 * protocol The protocol that will be used for the actual
105 download, lower-case.
106 "http", "https", "rtsp", "rtmp", "rtmpe",
107 "m3u8", or "m3u8_native".
108 * preference Order number of this format. If this field is
109 present and not None, the formats get sorted
110 by this field, regardless of all other values.
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
113 < -1000 to hide the format (if there is
114 another one which is strictly better)
115 * language Language code, e.g. "de" or "en-US".
116 * language_preference Is this in the language mentioned in
118 10 if it's what the URL is about,
119 -1 for default (don't know),
120 -10 otherwise, other values reserved for now.
121 * quality Order number of the video quality of this
122 format, irrespective of the file format.
123 -1 for default (order by other properties),
124 -2 or smaller for less than default.
125 * source_preference Order number for this video source
126 (quality takes higher priority)
127 -1 for default (order by other properties),
128 -2 or smaller for less than default.
129 * http_headers A dictionary of additional HTTP headers
130 to add to the request.
131 * stretched_ratio If given and not 1, indicates that the
132 video's pixels are not square.
133 width : height ratio as float.
134 * no_resume The server does not support resuming the
135 (HTTP or RTMP) download. Boolean.
137 url: Final video URL.
138 ext: Video filename extension.
139 format: The video format, defaults to ext (used for --get-format)
140 player_url: SWF Player URL (used for rtmpdump).
142 The following fields are optional:
144 alt_title: A secondary title of the video.
145 display_id An alternative identifier for the video, not necessarily
146 unique, but available before title. Typically, id is
147 something like "4234987", title "Dancing naked mole rats",
148 and display_id "dancing-naked-mole-rats"
149 thumbnails: A list of dictionaries, with the following entries:
150 * "id" (optional, string) - Thumbnail format ID
152 * "preference" (optional, int) - quality of the image
153 * "width" (optional, int)
154 * "height" (optional, int)
155 * "resolution" (optional, string "{width}x{height}",
157 thumbnail: Full URL to a video thumbnail image.
158 description: Full video description.
159 uploader: Full name of the video uploader.
160 license: License name the video is licensed under.
161 creator: The main artist who created the video.
162 release_date: The date (YYYYMMDD) when the video was released.
163 timestamp: UNIX timestamp of the moment the video became available.
164 upload_date: Video upload date (YYYYMMDD).
165 If not explicitly set, calculated from timestamp.
166 uploader_id: Nickname or id of the video uploader.
167 uploader_url: Full URL to a personal webpage of the video uploader.
168 location: Physical location where the video was filmed.
169 subtitles: The available subtitles as a dictionary in the format
170 {language: subformats}. "subformats" is a list sorted from
171 lower to higher preference, each element is a dictionary
172 with the "ext" entry and one of:
173 * "data": The subtitles file contents
174 * "url": A URL pointing to the subtitles file
175 "ext" will be calculated from URL if missing
176 automatic_captions: Like 'subtitles', used by the YoutubeIE for
177 automatically generated captions
178 duration: Length of the video in seconds, as an integer or float.
179 view_count: How many users have watched the video on the platform.
180 like_count: Number of positive ratings of the video
181 dislike_count: Number of negative ratings of the video
182 repost_count: Number of reposts of the video
183 average_rating: Average rating given by users, the scale used depends on the webpage
184 comment_count: Number of comments on the video
185 comments: A list of comments, each with one or more of the following
186 properties (all but one of text or html optional):
187 * "author" - human-readable name of the comment author
188 * "author_id" - user ID of the comment author
190 * "html" - Comment as HTML
191 * "text" - Plain text of the comment
192 * "timestamp" - UNIX timestamp of comment
193 * "parent" - ID of the comment this one is replying to.
194 Set to "root" to indicate that this is a
195 comment to the original video.
196 age_limit: Age restriction for the video, as an integer (years)
197 webpage_url: The URL to the video webpage, if given to youtube-dl it
198 should allow to get the same result again. (It will be set
199 by YoutubeDL if it's missing)
200 categories: A list of categories that the video falls in, for example
202 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
203 is_live: True, False, or None (=unknown). Whether this video is a
204 live stream that goes on instead of a fixed-length video.
205 start_time: Time in seconds where the reproduction should start, as
206 specified in the URL.
207 end_time: Time in seconds where the reproduction should end, as
208 specified in the URL.
210 The following fields should only be used when the video belongs to some logical
213 chapter: Name or title of the chapter the video belongs to.
214 chapter_number: Number of the chapter the video belongs to, as an integer.
215 chapter_id: Id of the chapter the video belongs to, as a unicode string.
217 The following fields should only be used when the video is an episode of some
220 series: Title of the series or programme the video episode belongs to.
221 season: Title of the season the video episode belongs to.
222 season_number: Number of the season the video episode belongs to, as an integer.
223 season_id: Id of the season the video episode belongs to, as a unicode string.
224 episode: Title of the video episode. Unlike mandatory video title field,
225 this field should denote the exact title of the video episode
226 without any kind of decoration.
227 episode_number: Number of the video episode within a season, as an integer.
228 episode_id: Id of the video episode, as a unicode string.
230 Unless mentioned otherwise, the fields should be Unicode strings.
232 Unless mentioned otherwise, None is equivalent to absence of information.
235 _type "playlist" indicates multiple videos.
236 There must be a key "entries", which is a list, an iterable, or a PagedList
237 object, each element of which is a valid dictionary by this specification.
239 Additionally, playlists can have "title", "description" and "id" attributes
240 with the same semantics as videos (see above).
243 _type "multi_video" indicates that there are multiple videos that
244 form a single show, for example, multiple acts of an opera or TV episode.
245 It must have an entries key like a playlist and contain all the keys
246 required for a video at the same time.
249 _type "url" indicates that the video must be extracted from another
250 location, possibly by a different extractor. Its only required key is:
251 "url" - the next URL to extract.
252 The key "ie_key" can be set to the class name (minus the trailing "IE",
253 e.g. "Youtube") if the extractor class is known in advance.
254 Additionally, the dictionary may have any properties of the resolved entity
255 known in advance, for example "title" if the title of the referred video is
259 _type "url_transparent" entities have the same specification as "url", but
260 indicate that the given additional information is more precise than the one
261 associated with the resolved URL.
262 This is useful when a site employs a video service that hosts the video and
263 its technical metadata, but that video service does not embed a useful
264 title, description etc.
267 Subclasses of this one should re-define the _real_initialize() and
268 _real_extract() methods and define a _VALID_URL regexp.
269 Probably, they should also be added to the list of extractors.
271 Finally, the _WORKING attribute should be set to False for broken IEs
272 in order to warn the users and skip the tests.
# The downloader (a YoutubeDL instance) may also be attached later via
# set_downloader(); None is an accepted placeholder.
279 def __init__(self, downloader=None):
280 """Constructor. Receives an optional downloader."""
282 self.set_downloader(downloader)
# NOTE(review): the @classmethod decorator is not visible in this chunk --
# presumably present, since the first parameter is `cls`; confirm in full file.
285 def suitable(cls, url):
286 """Receives a URL and returns True if suitable for this IE."""
288 # This does not use has/getattr intentionally - we want to know whether
289 # we have cached the regexp for *this* class, whereas getattr would also
290 # match the superclass
291 if '_VALID_URL_RE' not in cls.__dict__:
292 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
293 return cls._VALID_URL_RE.match(url) is not None
# Match *url* against _VALID_URL, caching the compiled regex on this exact
# class (same per-class caching idiom as suitable()).
296 def _match_id(cls, url):
297 if '_VALID_URL_RE' not in cls.__dict__:
298 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
299 m = cls._VALID_URL_RE.match(url)
# NOTE(review): the return statement (presumably extracting the id group
# from `m`) is elided from this chunk.
305 """Getter method for _WORKING."""
308 def initialize(self):
309 """Initializes an instance (authentication, etc)."""
# Delegates to _real_initialize(), which subclasses override.
311 self._real_initialize()
314 def extract(self, url):
315 """Extracts URL information and returns it in list of dicts."""
# Wraps _real_extract(), converting low-level failures into ExtractorError.
318 return self._real_extract(url)
# ExtractorError is re-raised as-is (handler body elided in this chunk).
319 except ExtractorError:
320 raise
321 except compat_http_client.IncompleteRead as e:
# Truncated reads are an "expected" network failure: no bug-report message.
322 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
323 except (KeyError, StopIteration) as e:
324 raise ExtractorError('An extractor error has occurred.', cause=e)
def set_downloader(self, downloader):
    """Attach *downloader* (the YoutubeDL instance) to this IE."""
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    pass
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    pass
# Fragments of ie_key() (classmethod) and the IE_NAME property; the def
# lines and decorators are elided from this chunk.
340 """A string for getting the InfoExtractor with get_info_extractor"""
# Class name minus the trailing "IE", e.g. YoutubeIE -> "Youtube".
341 return compat_str(cls.__name__[:-2])
# Instance-level counterpart: same "strip trailing IE" convention.
345 return compat_str(type(self).__name__[:-2])
347 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
348 """ Returns the response handle """
# note=None -> default "Downloading webpage" message; note=False is silent.
350 self.report_download_webpage(video_id)
351 elif note is not False:
# With no video_id only the note is printed, otherwise "id: note".
353 self.to_screen('%s' % (note,))
355 self.to_screen('%s: %s' % (video_id, note))
357 return self._downloader.urlopen(url_or_request)
358 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
362 errnote = 'Unable to download webpage'
364 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
# Fatal path: re-raise with the original traceback attached...
366 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
# ...non-fatal path: warn only (falls through, returning None).
368 self._downloader.report_warning(errmsg)
371 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
372 """ Returns a tuple (page content as string, URL handle) """
373 # Strip hashes from the URL (#1038)
374 if isinstance(url_or_request, (compat_str, str)):
375 url_or_request = url_or_request.partition('#')[0]
377 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
# NOTE(review): handling of a failed (falsy) urlh is elided in this chunk.
381 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
382 return (content, urlh)
# Best-effort charset detection: Content-Type header first, then a <meta
# charset> declaration in the first KiB of the body, then a BOM sniff.
385 def _guess_encoding_from_content(content_type, webpage_bytes):
386 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
388 encoding = m.group(1)
390 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
391 webpage_bytes[:1024])
393 encoding = m.group(1).decode('ascii')
# b'\xff\xfe' is the UTF-16 little-endian byte order mark.
394 elif webpage_bytes.startswith(b'\xff\xfe'):
# Read the response body, decode it (guessing the charset if *encoding* is
# None), optionally dump/save it for debugging, and detect known censorship
# or filtering block pages, which are reported as "expected" errors.
401 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
402 content_type = urlh.headers.get('Content-Type', '')
403 webpage_bytes = urlh.read()
404 if prefix is not None:
405 webpage_bytes = prefix + webpage_bytes
407 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
# --dump-pages: emit the page base64-encoded to the screen.
408 if self._downloader.params.get('dump_intermediate_pages', False):
410 url = url_or_request.get_full_url()
411 except AttributeError:
413 self.to_screen('Dumping request to ' + url)
414 dump = base64.b64encode(webpage_bytes).decode('ascii')
415 self._downloader.to_screen(dump)
# --write-pages: save the raw bytes to a sanitized dump file.
416 if self._downloader.params.get('write_pages', False):
418 url = url_or_request.get_full_url()
419 except AttributeError:
421 basen = '%s_%s' % (video_id, url)
# Keep the filename under ~240 chars, appending an md5 of the full name.
423 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
424 basen = basen[:240 - len(h)] + h
425 raw_filename = basen + '.dump'
426 filename = sanitize_filename(raw_filename, restricted=True)
427 self.to_screen('Saving request to ' + filename)
428 # Working around MAX_PATH limitation on Windows (see
429 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
431 absfilepath = os.path.abspath(filename)
432 if len(absfilepath) > 259:
433 filename = '\\\\?\\' + absfilepath
434 with open(filename, 'wb') as outf:
435 outf.write(webpage_bytes)
438 content = webpage_bytes.decode(encoding, 'replace')
# Fallback decode path (branch structure partly elided in this chunk).
440 content = webpage_bytes.decode('utf-8', 'replace')
442 if ('<title>Access to this site is blocked</title>' in content and
443 'Websense' in content[:512]):
444 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
445 blocked_iframe = self._html_search_regex(
446 r'<iframe src="([^"]+)"', content,
447 'Websense information URL', default=None)
449 msg += ' Visit %s for more details' % blocked_iframe
450 raise ExtractorError(msg, expected=True)
451 if '<title>The URL you requested has been blocked</title>' in content[:512]:
453 'Access to this webpage has been blocked by Indian censorship. '
454 'Use a VPN or proxy server (with --proxy) to route around it.')
455 block_msg = self._html_search_regex(
456 r'</h1><p>(.*?)</p>',
457 content, 'block message', default=None)
459 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
460 raise ExtractorError(msg, expected=True)
464 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
465 """ Returns the data of the page as a string """
# Retry loop: IncompleteRead is retried up to *tries* attempts, sleeping
# *timeout* seconds between them.
468 while success is False:
470 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
472 except compat_http_client.IncompleteRead as e:
474 if try_count >= tries:
476 self._sleep(timeout, video_id)
483 def _download_xml(self, url_or_request, video_id,
484 note='Downloading XML', errnote='Unable to download XML',
485 transform_source=None, fatal=True, encoding=None):
486 """Return the xml as an xml.etree.ElementTree.Element"""
487 xml_string = self._download_webpage(
488 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
# False signals a failed non-fatal download.
489 if xml_string is False:
# transform_source, if given, fixes up the raw text before parsing.
492 xml_string = transform_source(xml_string)
493 return compat_etree_fromstring(xml_string.encode('utf-8'))
# Download a page and parse it as JSON via _parse_json().
495 def _download_json(self, url_or_request, video_id,
496 note='Downloading JSON metadata',
497 errnote='Unable to download JSON metadata',
498 transform_source=None,
499 fatal=True, encoding=None):
500 json_string = self._download_webpage(
501 url_or_request, video_id, note, errnote, fatal=fatal,
# False signals a failed non-fatal download; propagate instead of parsing.
503 if (not fatal) and json_string is False:
505 return self._parse_json(
506 json_string, video_id, transform_source=transform_source, fatal=fatal)
# Parse *json_string*, optionally pre-processed by *transform_source*.
# On invalid JSON: raise ExtractorError if fatal, otherwise warn only.
508 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
510 json_string = transform_source(json_string)
512 return json.loads(json_string)
513 except ValueError as ve:
514 errmsg = '%s: Failed to parse JSON ' % video_id
516 raise ExtractorError(errmsg, cause=ve)
518 self.report_warning(errmsg + str(ve))
# Forward a warning to the downloader, prefixed with "[IE_NAME] " and,
# when given, the video id.
520 def report_warning(self, msg, video_id=None):
521 idstr = '' if video_id is None else '%s: ' % video_id
522 self._downloader.report_warning(
523 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
525 def to_screen(self, msg):
526 """Print msg to screen, prefixing it with '[ie_name]'"""
527 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
    """Announce that information extraction has started for *id_or_name*."""
    self.to_screen('%s: Extracting information' % id_or_name)
533 def report_download_webpage(self, video_id):
534 """Report webpage download."""
535 self.to_screen('%s: Downloading webpage' % video_id)
537 def report_age_confirmation(self):
538 """Report attempt to confirm age."""
539 self.to_screen('Confirming age')
541 def report_login(self):
542 """Report attempt to log in."""
543 self.to_screen('Logging in')
# Abort with a login-required error (decorator, presumably @staticmethod,
# is elided from this chunk). The trailing expected=True argument is also
# elided; the error carries no bug-report message.
546 def raise_login_required(msg='This video is only available for registered users'):
547 raise ExtractorError(
548 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
# Abort with a geo-restriction error, suggesting --proxy as a workaround
# (decorator and the trailing argument are elided from this chunk).
552 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
553 raise ExtractorError(
554 '%s. You might want to use --proxy to workaround.' % msg,
557 # Methods for following #608
# Build a '_type': 'url' result dict for deferred extraction (decorator,
# presumably @staticmethod, is elided from this chunk).
559 def url_result(url, ie=None, video_id=None, video_title=None):
560 """Returns a URL that points to a page that should be processed"""
561 # TODO: ie should be the class used for getting the info
562 video_info = {'_type': 'url',
# id/title are only set when known, so downstream merging stays clean.
565 if video_id is not None:
566 video_info['id'] = video_id
567 if video_title is not None:
568 video_info['title'] = video_title
# Build a '_type': 'playlist' result dict (decorator, presumably
# @staticmethod, is elided from this chunk).
572 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
573 """Returns a playlist"""
574 video_info = {'_type': 'playlist',
# Optional metadata is attached only when truthy/given.
577 video_info['id'] = playlist_id
579 video_info['title'] = playlist_title
580 if playlist_description:
581 video_info['description'] = playlist_description
584 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
586 Perform a regex search on the given string, using a single or a list of
587 patterns returning the first matching group.
588 In case of failure return a default value or raise a WARNING or a
589 RegexNotFoundError, depending on fatal, specifying the field name.
# Single pattern vs. list of patterns: lists are tried in order (loop
# structure partly elided in this chunk).
591 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
592 mobj = re.search(pattern, string, flags)
595 mobj = re.search(p, string, flags)
# Colorize the field name in error output, except on Windows / non-ttys.
599 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
600 _name = '\033[0;34m%s\033[0m' % name
606 # return the first matching group
607 return next(g for g in mobj.groups() if g is not None)
# When *group* is given, return that specific group instead.
609 return mobj.group(group)
610 elif default is not NO_DEFAULT:
613 raise RegexNotFoundError('Unable to extract %s' % _name)
615 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
618 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
620 Like _search_regex, but strips HTML tags and unescapes entities.
622 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
# Only reached when res is not None (guard elided in this chunk).
624 return clean_html(res).strip()
628 def _get_login_info(self):
630 Get the login info as (username, password)
631 It will look in the netrc file using the _NETRC_MACHINE value
632 If there's no info available, return (None, None)
633
634 if self._downloader is None:
639 downloader_params = self._downloader.params
640
641 # Attempt to use provided username and password or .netrc data
642 if downloader_params.get('username') is not None:
643 username = downloader_params['username']
644 password = downloader_params['password']
645 elif downloader_params.get('usenetrc', False):
# netrc lookup keyed on the per-extractor _NETRC_MACHINE constant.
647 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
652 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# netrc problems are non-fatal: warn and fall back to (None, None).
653 except (IOError, netrc.NetrcParseError) as err:
654 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
656 return (username, password)
658 def _get_tfa_info(self, note='two-factor verification code'):
660 Get the two-factor authentication info
661 TODO - asking the user will be required for sms/phone verify
662 currently just uses the command line option
663 If there's no info available, return None
664
665 if self._downloader is None:
667 downloader_params = self._downloader.params
668
# --twofactor takes precedence; otherwise prompt interactively.
669 if downloader_params.get('twofactor') is not None:
670 return downloader_params['twofactor']
672 return compat_getpass('Type %s and press [Return]: ' % note)
674 # Helper functions for extracting OpenGraph info
# Build the two regexes matching an OpenGraph <meta property="og:PROP">
# tag, in either attribute order (decorator, presumably @staticmethod, and
# the `return [` line are elided from this chunk).
676 def _og_regexes(prop):
677 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
678 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
679 % {'prop': re.escape(prop)})
680 template = r'<meta[^>]+?%s[^>]+?%s'
682 template % (property_re, content_re),
683 template % (content_re, property_re),
687 def _meta_regex(prop):
688 return r'''(?isx)<meta
689 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
690 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
# Search *html* for the og:PROP meta value and return it HTML-unescaped.
692 def _og_search_property(self, prop, html, name=None, **kargs):
# Default human-readable field name used in error messages.
694 name = 'OpenGraph %s' % prop
695 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
698 return unescapeHTML(escaped)
# Convenience wrapper: og:image, non-fatal.
700 def _og_search_thumbnail(self, html, **kargs):
701 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
# Convenience wrapper: og:description, non-fatal.
703 def _og_search_description(self, html, **kargs):
704 return self._og_search_property('description', html, fatal=False, **kargs)
# Convenience wrapper: og:title (fatal by default).
706 def _og_search_title(self, html, **kargs):
707 return self._og_search_property('title', html, **kargs)
709 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
710 regexes = self._og_regexes('video') + self._og_regexes('video:url')
# When secure, og:video:secure_url patterns are tried first.
712 regexes = self._og_regexes('video:secure_url') + regexes
713 return self._html_search_regex(regexes, html, name, **kargs)
# Convenience wrapper: og:url.
715 def _og_search_url(self, html, **kargs):
716 return self._og_search_property('url', html, **kargs)
# Extract the content attribute of a generic <meta> tag named *name*.
718 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
# display_name defaults to the meta name (assignment elided in this chunk).
719 if display_name is None:
721 return self._html_search_regex(
722 self._meta_regex(name),
723 html, display_name, fatal=fatal, group='content', **kwargs)
# Dublin Core creator meta tag -> uploader.
725 def _dc_search_uploader(self, html):
726 return self._html_search_meta('dc.creator', html, 'uploader')
728 def _rta_search(self, html):
729 # See http://www.rtalabel.org/index.php?content=howtofaq#single
# RTA label present -> adult content (the returned age-limit values are
# elided from this chunk).
730 if re.search(r'(?ix)<meta\s+name="rating"\s+'
731 r' content="RTA-5042-1996-1400-1577-RTA"',
# Map a "rating" meta tag value to an age limit via RATING_TABLE
# (table definition elided from this chunk).
736 def _media_rating_search(self, html):
737 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
738 rating = self._html_search_meta('rating', html)
750 return RATING_TABLE.get(rating.lower())
# Map schema.org isFamilyFriendly to an age limit via RATING_TABLE
# (table definition elided from this chunk).
752 def _family_friendly_search(self, html):
753 # See http://schema.org/VideoObject
754 family_friendly = self._html_search_meta('isFamilyFriendly', html)
755
756 if not family_friendly:
765 return RATING_TABLE.get(family_friendly.lower())
# Twitter card player meta tag -> embedded player URL.
767 def _twitter_search_player(self, html):
768 return self._html_search_meta('twitter:player', html,
769 'twitter card player')
# Locate an application/ld+json <script> block and parse it via _json_ld().
771 def _search_json_ld(self, html, video_id, **kwargs):
772 json_ld = self._search_regex(
773 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
774 html, 'JSON-LD', group='json_ld', **kwargs)
777 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
# Convert a JSON-LD object (or its string form) into youtube-dl info-dict
# fields; currently handles schema.org TVEpisode and Article types.
779 def _json_ld(self, json_ld, video_id, fatal=True):
780 if isinstance(json_ld, compat_str):
781 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
# Only schema.org-context objects are interpreted.
785 if json_ld.get('@context') == 'http://schema.org':
786 item_type = json_ld.get('@type')
787 if item_type == 'TVEpisode':
789 'episode': unescapeHTML(json_ld.get('name')),
790 'episode_number': int_or_none(json_ld.get('episodeNumber')),
791 'description': unescapeHTML(json_ld.get('description')),
793 part_of_season = json_ld.get('partOfSeason')
794 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
795 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
796 part_of_series = json_ld.get('partOfSeries')
797 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
798 info['series'] = unescapeHTML(part_of_series.get('name'))
799 elif item_type == 'Article':
801 'timestamp': parse_iso8601(json_ld.get('datePublished')),
802 'title': unescapeHTML(json_ld.get('headline')),
803 'description': unescapeHTML(json_ld.get('articleBody')),
# Drop fields that came out as None.
805 return dict((k, v) for k, v in info.items() if v is not None)
# Collect {name: value} for hidden/submit <input> elements in *html*
# (decorator, presumably @staticmethod, and the dict initialization /
# return are elided from this chunk).
808 def _hidden_inputs(html):
# Strip HTML comments first so commented-out inputs are ignored.
809 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
811 for input in re.findall(r'(?i)<input([^>]+)>', html):
812 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
814 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
817 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
820 hidden_inputs[name.group('value')] = value.group('value')
# Extract the <form id=FORM_ID> body and return its hidden inputs.
823 def _form_hidden_inputs(self, form_id, html):
824 form = self._search_regex(
825 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
826 html, '%s form' % form_id, group='form')
827 return self._hidden_inputs(form)
# Sort *formats* in place from worst to best quality. The sort key either
# follows the caller-supplied *field_preference* tuple or a built-in
# multi-criteria key (preference, language, quality, bitrate, size, ...).
829 def _sort_formats(self, formats, field_preference=None):
831 raise ExtractorError('No video formats found')
834 # Automatically determine tbr when missing based on abr and vbr (improves
835 # formats sorting in some cases)
836 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
837 f['tbr'] = f['abr'] + f['vbr']
839
840 # TODO remove the following workaround
841 from ..utils import determine_ext
842 if not f.get('ext') and 'url' in f:
843 f['ext'] = determine_ext(f['url'])
844
# Nested key function used below by formats.sort() (its def line is
# elided from this chunk).
845 if isinstance(field_preference, (list, tuple)):
846 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
847
848 preference = f.get('preference')
849 if preference is None:
851 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
853
# Plain HTTP(S) is slightly preferred over other protocols.
854 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
855
856 if f.get('vcodec') == 'none': # audio only
857 if self._downloader.params.get('prefer_free_formats'):
858 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
860 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
863 audio_ext_preference = ORDER.index(f['ext'])
865 audio_ext_preference = -1
# Video (or combined) formats use a container-extension ordering instead.
867 if self._downloader.params.get('prefer_free_formats'):
868 ORDER = ['flv', 'mp4', 'webm']
870 ORDER = ['webm', 'flv', 'mp4']
872 ext_preference = ORDER.index(f['ext'])
875 audio_ext_preference = 0
877
# Missing numeric fields sort as -1 (i.e. worse than any known value).
879 f.get('language_preference') if f.get('language_preference') is not None else -1,
880 f.get('quality') if f.get('quality') is not None else -1,
881 f.get('tbr') if f.get('tbr') is not None else -1,
882 f.get('filesize') if f.get('filesize') is not None else -1,
883 f.get('vbr') if f.get('vbr') is not None else -1,
884 f.get('height') if f.get('height') is not None else -1,
885 f.get('width') if f.get('width') is not None else -1,
888 f.get('abr') if f.get('abr') is not None else -1,
889 audio_ext_preference,
890 f.get('fps') if f.get('fps') is not None else -1,
891 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
892 f.get('source_preference') if f.get('source_preference') is not None else -1,
893 f.get('format_id') if f.get('format_id') is not None else '',
895 formats.sort(key=_formats_key)
# Filter *formats* down to those whose URL responds (via _is_valid_url);
# surrounding filter call is elided from this chunk.
897 def _check_formats(self, formats, video_id):
900 lambda f: self._is_valid_url(
902 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
# De-duplicate *formats* by URL, preserving order, mutating the list in
# place (decorator, presumably @staticmethod, and the format_urls /
# unique_formats initialization are elided from this chunk).
906 def _remove_duplicate_formats(formats):
910 if f['url'] not in format_urls:
911 format_urls.add(f['url'])
912 unique_formats.append(f)
913 formats[:] = unique_formats
# Probe *url* with a request; URL-level errors mean "invalid, skip".
915 def _is_valid_url(self, url, video_id, item='video'):
916 url = self._proto_relative_url(url, scheme='http:')
917 # For now assume non HTTP(S) URLs always valid
918 if not (url.startswith('http://') or url.startswith('https://')):
921 self._request_webpage(url, video_id, 'Checking %s URL' % item)
923 except ExtractorError as e:
924 if isinstance(e.cause, compat_urllib_error.URLError):
926 '%s: %s URL is invalid, skipping' % (video_id, item))
930 def http_scheme(self):
931 """ Either "http:" or "https:", depending on the user's preferences """
# Honors the --prefer-insecure option (full conditional elided here).
934 if self._downloader.params.get('prefer_insecure', False)
# Prefix a protocol-relative URL ("//host/...") with *scheme*, defaulting
# to the user-preferred scheme from http_scheme().
937 def _proto_relative_url(self, url, scheme=None):
940 if url.startswith('//'):
942 scheme = self.http_scheme()
# Announce and perform a wait of *timeout* seconds (the actual
# to_screen/time.sleep calls are elided from this chunk).
947 def _sleep(self, timeout, video_id, msg_template=None):
948 if msg_template is None:
949 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
950 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
# Download an Adobe HDS (f4m) manifest and build a sorted formats list
# from its <media> nodes; handles both 1.0 and 2.0 manifest namespaces.
954 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
# Default transform repairs bare ampersands, common in broken manifests.
955 transform_source=lambda s: fix_xml_ampersands(s).strip(),
957 manifest = self._download_xml(
958 manifest_url, video_id, 'Downloading f4m manifest',
959 'Unable to download f4m manifest',
960 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
961 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
962 transform_source=transform_source,
# False signals a failed non-fatal download.
965 if manifest is False:
969 manifest_version = '1.0'
970 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
# Empty result -> retry with the 2.0 namespace.
972 manifest_version = '2.0'
973 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
974 base_url = xpath_text(
975 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
976 'base URL', default=None)
978 base_url = base_url.strip()
979 for i, media_el in enumerate(media_nodes):
980 if manifest_version == '2.0':
981 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
# NOTE(review): the assignment rebinding manifest_url per media element
# is elided from this chunk; the recursion below relies on it.
985 media_url if media_url.startswith('http://') or media_url.startswith('https://')
986 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
987 # If media_url is itself a f4m manifest do the recursive extraction
988 # since bitrates in parent manifest (this one) and media_url manifest
989 # may differ leading to inability to resolve the format by requested
990 # bitrate in f4m downloader
991 if determine_ext(manifest_url) == 'f4m':
992 formats.extend(self._extract_f4m_formats(
993 manifest_url, video_id, preference, f4m_id, fatal=fatal))
995 tbr = int_or_none(media_el.attrib.get('bitrate'))
# format_id falls back to the node index when no bitrate is given.
997 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1001 'width': int_or_none(media_el.attrib.get('width')),
1002 'height': int_or_none(media_el.attrib.get('height')),
1003 'preference': preference,
1005 self._sort_formats(formats)
# Download an HLS (m3u8) playlist and build the formats list.  Master
# playlists are expanded per #EXT-X-STREAM-INF entry; media playlists are
# returned as a single format.  NOTE(review): this excerpt is elided — the
# fatal= parameter, several assignment targets, `continue` statements and
# the final `return formats` are missing from this view.
1009 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1010 entry_protocol='m3u8', preference=None,
1011 m3u8_id=None, note=None, errnote=None,
# A synthetic 'meta' entry pointing at the playlist URL itself, ranked just
# below the requested preference.
1015 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1019 'preference': preference - 1 if preference else -1,
1020 'resolution': 'multiple',
1021 'format_note': 'Quality selection URL',
# Resolve playlist-relative entry URLs against the playlist URL.
1024 format_url = lambda u: (
1026 if re.match(r'^https?://', u)
1027 else compat_urlparse.urljoin(m3u8_url, u))
1029 res = self._download_webpage_handle(
1031 note=note or 'Downloading m3u8 information',
1032 errnote=errnote or 'Failed to download m3u8 information',
1036 m3u8_doc, urlh = res
# Use the post-redirect URL as the base for relative references.
1037 m3u8_url = urlh.geturl()
1039 # We should try extracting formats only from master playlists [1], i.e.
1040 # playlists that describe available qualities. On the other hand media
1041 # playlists [2] should be returned as is since they contain just the media
1042 # without qualities renditions.
1043 # Fortunately, master playlist can be easily distinguished from media
1044 # playlist based on particular tags availability. As of [1, 2] master
1045 # playlist tags MUST NOT appear in a media playist and vice versa.
1046 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1047 # and MUST NOT appear in master playlist thus we can clearly detect media
1048 # playlist with this criterion.
1049 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1050 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1051 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1052 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1055 'format_id': m3u8_id,
1057 'protocol': entry_protocol,
1058 'preference': preference,
# Key=value attribute parser for #EXT-X-* tag lines (values may be quoted).
1062 kv_rex = re.compile(
1063 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1064 for line in m3u8_doc.splitlines():
1065 if line.startswith('#EXT-X-STREAM-INF:'):
1067 for m in kv_rex.finditer(line):
1069 if v.startswith('"'):
1071 last_info[m.group('key')] = v
1072 elif line.startswith('#EXT-X-MEDIA:'):
1074 for m in kv_rex.finditer(line):
1076 if v.startswith('"'):
1078 last_media[m.group('key')] = v
1079 elif line.startswith('#') or not line.strip():
# A non-tag, non-blank line is a variant URL belonging to the preceding
# #EXT-X-STREAM-INF (if any).
1082 if last_info is None:
1083 formats.append({'url': format_url(line)})
1085 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1088 format_id.append(m3u8_id)
1089 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1090 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1092 'format_id': '-'.join(format_id),
1093 'url': format_url(line.strip()),
1096 'protocol': entry_protocol,
1097 'preference': preference,
1099 resolution = last_info.get('RESOLUTION')
1101 width_str, height_str = resolution.split('x')
1102 f['width'] = int(width_str)
1103 f['height'] = int(height_str)
1104 codecs = last_info.get('CODECS')
1106 vcodec, acodec = [None] * 2
1107 va_codecs = codecs.split(',')
1108 if len(va_codecs) == 1:
1109 # Audio only entries usually come with single codec and
1110 # no resolution. For more robustness we also check it to
1112 if not resolution and va_codecs[0].startswith('mp4a'):
1113 vcodec, acodec = 'none', va_codecs[0]
1115 vcodec = va_codecs[0]
1117 vcodec, acodec = va_codecs[:2]
1122 if last_media is not None:
1123 f['m3u8_media'] = last_media
1127 self._sort_formats(formats)
# Qualify each component of an XPath with the given XML namespace, so that
# './head/meta' becomes './{ns}head/{ns}meta'.
# NOTE(review): the decorator line (presumably @staticmethod, given no self)
# and the no-namespace branch are elided from this view.
1131 def _xpath_ns(path, namespace=None):
1135 for c in path.split('/'):
# Keep '.' and empty components unqualified.
1136 if not c or c == '.':
1139 out.append('{%s}%s' % (namespace, c))
1140 return '/'.join(out)
# Download a SMIL document and return only its formats list.
# NOTE(review): the failed-download fallback branch is elided here.
1142 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1143 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1149 namespace = self._parse_smil_namespace(smil)
1151 return self._parse_smil_formats(
1152 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
# Download a SMIL document and return the fully parsed info dict.
# NOTE(review): the failed-download fallback branch is elided here.
1154 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1155 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1158 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1160 def _download_smil(self, smil_url, video_id, fatal=True):
1161 return self._download_xml(
1162 smil_url, video_id, 'Downloading SMIL file',
1163 'Unable to download SMIL file', fatal=fatal)
# Turn a parsed SMIL document into an info dict: formats, subtitles,
# title/description/upload_date from <head><meta> entries, and thumbnails
# from <image> nodes.  NOTE(review): elided here — the formats sanity check,
# local variable initialisation, the continue in the meta loop and parts of
# the returned dict (including its opening/closing braces).
1165 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1166 namespace = self._parse_smil_namespace(smil)
1168 formats = self._parse_smil_formats(
1169 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1170 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
# Fall back to the SMIL file's basename (sans extension) as the video id.
1172 video_id = os.path.splitext(url_basename(smil_url))[0]
1176 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1177 name = meta.attrib.get('name')
1178 content = meta.attrib.get('content')
1179 if not name or not content:
# First matching <meta> wins for each of title / description / date.
1181 if not title and name == 'title':
1183 elif not description and name in ('description', 'abstract'):
1184 description = content
1185 elif not upload_date and name == 'date':
1186 upload_date = unified_strdate(content)
1189 'id': image.get('type'),
1190 'url': image.get('src'),
1191 'width': int_or_none(image.get('width')),
1192 'height': int_or_none(image.get('height')),
1193 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1197 'title': title or video_id,
1198 'description': description,
1199 'upload_date': upload_date,
1200 'thumbnails': thumbnails,
1202 'subtitles': subtitles,
1205 def _parse_smil_namespace(self, smil):
1206 return self._search_regex(
1207 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
# Build the formats list from a SMIL document: RTMP, HLS (m3u8), HDS (f4m)
# and plain HTTP sources found in <video> nodes.  NOTE(review): heavily
# elided — initialisation of base/srcs/rtmp_count/m3u8_count/http_count,
# several branch bodies, `continue` statements and the final return are
# missing from this view.
1209 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
# <head><meta base=...> (or httpBase) supplies the base URL for sources.
1211 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1212 b = meta.get('base') or meta.get('httpBase')
1223 videos = smil.findall(self._xpath_ns('.//video', namespace))
1224 for video in videos:
1225 src = video.get('src')
# Skip empty and already-seen sources.
1226 if not src or src in srcs:
1230 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1231 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1232 width = int_or_none(video.get('width'))
1233 height = int_or_none(video.get('height'))
1234 proto = video.get('proto')
1235 ext = video.get('ext')
1236 src_ext = determine_ext(src)
1237 streamer = video.get('streamer') or base
1239 if proto == 'rtmp' or streamer.startswith('rtmp'):
1245 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1247 'filesize': filesize,
# Optional caller hook to rewrite the (streamer, src) pair for RTMP.
1251 if transform_rtmp_url:
1252 streamer, src = transform_rtmp_url(streamer, src)
1253 formats[-1].update({
1259 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1260 src_url = src_url.strip()
1262 if proto == 'm3u8' or src_ext == 'm3u8':
1263 m3u8_formats = self._extract_m3u8_formats(
1264 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
# A single-rendition m3u8 inherits the bitrate-based format id.
1265 if len(m3u8_formats) == 1:
1267 m3u8_formats[0].update({
1268 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1273 formats.extend(m3u8_formats)
1276 if src_ext == 'f4m':
1281 'plugin': 'flowplayer-3.2.0.1',
# Append caller-supplied f4m query parameters to the manifest URL.
1283 f4m_url += '&' if '?' in f4m_url else '?'
1284 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1285 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1288 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1292 'ext': ext or src_ext or 'flv',
1293 'format_id': 'http-%d' % (bitrate or http_count),
1295 'filesize': filesize,
1301 self._sort_formats(formats)
# Collect subtitle tracks from SMIL <textstream> nodes, grouped by language.
# NOTE(review): initialisation of subtitles/urls, the url bookkeeping, the
# appended dict body and the final return are elided from this view.
1305 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1308 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1309 src = textstream.get('src')
1310 if not src or src in urls:
1313 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
# Fall back to subtitles_lang when the node carries no language attribute.
1314 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1315 subtitles.setdefault(lang, []).append({
# Download an XSPF playlist and parse it into entries.
# NOTE(review): the failed-download fallback branch is elided.  Also note
# the 'xpsf' typo in the note string below — runtime text, deliberately left
# untouched in this documentation-only pass.
1321 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1322 xspf = self._download_xml(
1323 playlist_url, playlist_id, 'Downloading xpsf playlist',
1324 'Unable to download xspf manifest', fatal=fatal)
1327 return self._parse_xspf(xspf, playlist_id)
# Convert an XSPF <trackList> into a list of entry dicts with formats.
# NOTE(review): elided here — the NS_MAP dict assignment target, entries
# initialisation, parts of the per-track dict and the final return.
1329 def _parse_xspf(self, playlist, playlist_id):
1331 'xspf': 'http://xspf.org/ns/0/',
# 's1' is the StreamOne player extension namespace used for format attrs.
1332 's1': 'http://static.streamone.nl/player/ns/0',
1336 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1338 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1339 description = xpath_text(
1340 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1341 thumbnail = xpath_text(
1342 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1343 duration = float_or_none(
# XSPF duration is in milliseconds; the scale of 1000 converts to seconds.
1344 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1347 'url': location.text,
1348 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1349 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1350 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1351 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1352 self._sort_formats(formats)
1357 'description': description,
1358 'thumbnail': thumbnail,
1359 'duration': duration,
# Download a DASH MPD manifest and delegate parsing to _parse_mpd_formats.
# NOTE(review): the res-is-False guard and the (mpd, urlh) unpacking are
# elided from this view.  The mutable default `formats_dict={}` is shared
# across calls — it appears to be only read, but confirm upstream.
1364 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1365 res = self._download_webpage_handle(
1367 note=note or 'Downloading MPD manifest',
1368 errnote=errnote or 'Failed to download MPD manifest',
# Base URL is everything up to the last '/' of the final (post-redirect) URL.
1373 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1375 return self._parse_mpd_formats(
1376 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
# Parse a DASH MPD document into a formats list: walks Period ->
# AdaptationSet -> Representation, skipping DRM-protected sets, resolving
# BaseURLs and expanding SegmentTemplate/SegmentList into segment_urls.
# NOTE(review): heavily elided — live-stream handling, `continue`
# statements, the format dict braces and `return formats` are missing here.
1378 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1379 if mpd_doc.get('type') == 'dynamic':
1382 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1385 return self._xpath_ns(path, namespace)
1387 def is_drm_protected(element):
1388 return element.find(_add_ns('ContentProtection')) is not None
# Merge multisegment info down the hierarchy: child values override parent.
1390 def extract_multisegment_info(element, ms_parent_info):
1391 ms_info = ms_parent_info.copy()
1392 segment_list = element.find(_add_ns('SegmentList'))
1393 if segment_list is not None:
1394 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1396 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1397 initialization = segment_list.find(_add_ns('Initialization'))
1398 if initialization is not None:
1399 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1401 segment_template = element.find(_add_ns('SegmentTemplate'))
1402 if segment_template is not None:
1403 start_number = segment_template.get('startNumber')
1405 ms_info['start_number'] = int(start_number)
1406 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1407 if segment_timeline is not None:
1408 s_e = segment_timeline.findall(_add_ns('S'))
1410 ms_info['total_number'] = 0
# Each <S> contributes 1 segment plus its repeat count r.
1412 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1414 timescale = segment_template.get('timescale')
1416 ms_info['timescale'] = int(timescale)
1417 segment_duration = segment_template.get('duration')
1418 if segment_duration:
1419 ms_info['segment_duration'] = int(segment_duration)
1420 media_template = segment_template.get('media')
1422 ms_info['media_template'] = media_template
1423 initialization = segment_template.get('initialization')
1425 ms_info['initialization_url'] = initialization
1427 initialization = segment_template.find(_add_ns('Initialization'))
1428 if initialization is not None:
1429 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1432 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1434 for period in mpd_doc.findall(_add_ns('Period')):
1435 period_duration = parse_duration(period.get('duration')) or mpd_duration
1436 period_ms_info = extract_multisegment_info(period, {
1440 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1441 if is_drm_protected(adaptation_set):
1443 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1444 for representation in adaptation_set.findall(_add_ns('Representation')):
1445 if is_drm_protected(representation):
# Representation attributes override the AdaptationSet's.
1447 representation_attrib = adaptation_set.attrib.copy()
1448 representation_attrib.update(representation.attrib)
1449 mime_type = representation_attrib.get('mimeType')
1450 content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
1451 if content_type == 'text':
1452 # TODO implement WebVTT downloading
1454 elif content_type == 'video' or content_type == 'audio':
# Accumulate BaseURLs from the innermost element outward.
1456 for element in (representation, adaptation_set, period, mpd_doc):
1457 base_url_e = element.find(_add_ns('BaseURL'))
1458 if base_url_e is not None:
1459 base_url = base_url_e.text + base_url
1460 if re.match(r'^https?://', base_url):
1462 if mpd_base_url and not re.match(r'^https?://', base_url):
1463 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1465 base_url = mpd_base_url + base_url
1466 representation_id = representation_attrib.get('id')
1467 lang = representation_attrib.get('lang')
1468 url_el = representation.find(_add_ns('BaseURL'))
1469 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1471 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1473 'width': int_or_none(representation_attrib.get('width')),
1474 'height': int_or_none(representation_attrib.get('height')),
1475 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1476 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1477 'fps': int_or_none(representation_attrib.get('frameRate')),
1478 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1479 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
# ISO 639 'mul'/'und'/'zxx'/'mis' mean no single meaningful language.
1480 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1481 'format_note': 'DASH %s' % content_type,
1482 'filesize': filesize,
1484 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1485 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
# NOTE(review): `and 'segment_duration'` is a truthy string literal — it
# almost certainly should be `and 'segment_duration' in
# representation_ms_info`; as written, a missing key raises KeyError below.
1486 if 'total_number' not in representation_ms_info and 'segment_duration':
1487 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1488 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1489 media_template = representation_ms_info['media_template']
1490 media_template = media_template.replace('$RepresentationID$', representation_id)
1491 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
# NOTE(review): the result of replace() is discarded — this line should
# reassign: media_template = media_template.replace('$$', '$').
1492 media_template.replace('$$', '$')
1493 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1494 if 'segment_urls' in representation_ms_info:
1496 'segment_urls': representation_ms_info['segment_urls'],
1497 'protocol': 'http_dash_segments',
1499 if 'initialization_url' in representation_ms_info:
1500 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1502 'initialization_url': initialization_url,
1504 if not f.get('url'):
1505 f['url'] = initialization_url
# Merge with an already-collected format of the same id, if any.
1507 existing_format = next(
1508 fo for fo in formats
1509 if fo['format_id'] == representation_id)
1510 except StopIteration:
1511 full_info = formats_dict.get(representation_id, {}).copy()
1513 formats.append(full_info)
1515 existing_format.update(f)
1517 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1518 self._sort_formats(formats)
1521 def _live_title(self, name):
1522 """ Generate the title for a live video """
1523 now = datetime.datetime.now()
1524 now_str = now.strftime('%Y-%m-%d %H:%M')
1525 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    """Parse `v` as an int via int_or_none(**kwargs).

    On parse failure, raise ExtractorError when fatal=True, otherwise emit
    a downloader warning and return None.
    """
    res = int_or_none(v, **kwargs)
    # Fix: removed a leftover debug statement that printed
    # getattr(v, kwargs['get_attr']) to stdout whenever 'get_attr' was
    # passed — int_or_none() already consumes get_attr itself.
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
# Parse `v` as a float via float_or_none(); on failure either raise
# ExtractorError (fatal=True) or emit a downloader warning.
# NOTE(review): the `if res is None:`, `if fatal:` / `else:` guards and the
# final `return res` are elided from this view.
1539 def _float(self, v, name, fatal=False, **kwargs):
1540 res = float_or_none(v, **kwargs)
1542 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1544 raise ExtractorError(msg)
1546 self._downloader.report_warning(msg)
def _set_cookie(self, domain, name, value, expire_time=None):
    """Add a cookie name=value for `domain` to the downloader's cookie jar."""
    # Positional Cookie() arguments: version, name, value, port,
    # port_specified, domain, domain_specified, domain_initial_dot, path,
    # path_specified, secure, expires, discard, comment, comment_url, rest.
    new_cookie = compat_cookiejar.Cookie(
        0, name, value, None, None, domain, None, None,
        '/', True, False, expire_time, '', None, None, None)
    self._downloader.cookiejar.set_cookie(new_cookie)
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    # Let the downloader's cookie jar attach its Cookie header to a dummy
    # request, then parse that header back into a SimpleCookie.
    request = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
# Yield this extractor's test cases (_TEST and/or _TESTS), tagging each with
# the extractor's name.  NOTE(review): elided here — the branch wrapping a
# single _TEST into a list, the loop header and the yield statement.
1561 def get_testcases(self, include_onlymatching=False):
1562 t = getattr(self, '_TEST', None)
# An extractor must not define both _TEST and _TESTS.
1564 assert not hasattr(self, '_TESTS'), \
1565 '%s has _TEST and _TESTS' % type(self).__name__
1568 tests = getattr(self, '_TESTS', [])
# only_matching test cases are skipped unless explicitly requested.
1570 if not include_onlymatching and t.get('only_matching', False):
1572 t['name'] = type(self).__name__[:-len('IE')]
1575 def is_suitable(self, age_limit):
1576 """ Test whether the extractor is generally suitable for the given
1577 age limit (i.e. pornographic sites are not, all others usually are) """
1579 any_restricted = False
1580 for tc in self.get_testcases(include_onlymatching=False):
1581 if 'playlist' in tc:
# A playlist test case carries its age limit on its first entry.
1582 tc = tc['playlist'][0]
1583 is_restricted = age_restricted(
1584 tc.get('info_dict', {}).get('age_limit'), age_limit)
1585 if not is_restricted:
# NOTE(review): the body of this branch (presumably `return False`) is
# elided from this view.
1587 any_restricted = any_restricted or is_restricted
1588 return not any_restricted
1590 def extract_subtitles(self, *args, **kwargs):
1591 if (self._downloader.params.get('writesubtitles', False) or
1592 self._downloader.params.get('listsubtitles')):
1593 return self._get_subtitles(*args, **kwargs)
1596 def _get_subtitles(self, *args, **kwargs):
1597 raise NotImplementedError('This method must be implemented by subclasses')
1600 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1601 """ Merge subtitle items for one language. Items with duplicated URLs
1602 will be dropped. """
1603 list1_urls = set([item['url'] for item in subtitle_list1])
1604 ret = list(subtitle_list1)
1605 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
@classmethod
def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
    """ Merge two subtitle dictionaries, language by language. """
    ret = dict(subtitle_dict1)
    # Per-language merge delegates duplicate-URL handling to
    # _merge_subtitle_items.
    for lang in subtitle_dict2:
        ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
    # Fix: the visible code built `ret` but never returned it (implicitly
    # returning None); restore the merged dict as the result.
    return ret
1616 def extract_automatic_captions(self, *args, **kwargs):
1617 if (self._downloader.params.get('writeautomaticsub', False) or
1618 self._downloader.params.get('listsubtitles')):
1619 return self._get_automatic_captions(*args, **kwargs)
1622 def _get_automatic_captions(self, *args, **kwargs):
1623 raise NotImplementedError('This method must be implemented by subclasses')
1625 def mark_watched(self, *args, **kwargs):
1626 if (self._downloader.params.get('mark_watched', False) and
1627 (self._get_login_info()[0] is not None or
1628 self._downloader.params.get('cookiefile') is not None)):
1629 self._mark_watched(*args, **kwargs)
1631 def _mark_watched(self, *args, **kwargs):
1632 raise NotImplementedError('This method must be implemented by subclasses')
# Base class for extractors implementing paged search queries.
# NOTE(review): the docstring's opening/closing triple quotes are elided from
# this excerpt; the lines below the class statement are the docstring body.
1635 class SearchInfoExtractor(InfoExtractor):
1637 Base class for paged search queries extractors.
1638 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1639 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Build the search-URL regex from _SEARCH_KEY: an empty prefix, a positive
# integer, or 'all', followed by ':' and the query.
# NOTE(review): the decorator line (presumably @classmethod, given cls) is
# elided above this def.
1643 def _make_valid_url(cls):
1644 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
# True when `url` matches this extractor's search-URL pattern.
# NOTE(review): the decorator line (presumably @classmethod, given cls) is
# elided above this def.
1647 def suitable(cls, url):
1648 return re.match(cls._make_valid_url(), url) is not None
# Dispatch a search query: empty prefix -> 1 result, 'all' -> _MAX_RESULTS,
# numeric prefix -> that many results, clamped to _MAX_RESULTS.
# NOTE(review): elided here — the mobj-is-None guard, the empty-prefix `if`
# header, and the `n = int(prefix)` / `if n <= 0:` lines before the raise.
1650 def _real_extract(self, query):
1651 mobj = re.match(self._make_valid_url(), query)
1653 raise ExtractorError('Invalid search query "%s"' % query)
1655 prefix = mobj.group('prefix')
1656 query = mobj.group('query')
1658 return self._get_n_results(query, 1)
1659 elif prefix == 'all':
1660 return self._get_n_results(query, self._MAX_RESULTS)
1664 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1665 elif n > self._MAX_RESULTS:
# Requests above the extractor's cap are clamped with a warning.
1666 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1667 n = self._MAX_RESULTS
1668 return self._get_n_results(query, n)
1670 def _get_n_results(self, query, n):
1671 """Get a specified number of results for a query"""
1672 raise NotImplementedError('This method must be implemented by subclasses')
# Read-only accessor exposing the class's search key.
# NOTE(review): the decorator line (presumably @property) is elided above
# this def.
1675 def SEARCH_KEY(self):
1676 return self._SEARCH_KEY