1 from __future__ import unicode_literals
15 from ..compat import (
18 compat_etree_fromstring,
55 class InfoExtractor(object):
56 """Information Extractor class.
58 Information extractors are the classes that, given a URL, extract
59 information about the video (or videos) the URL refers to. This
60 information includes the real video URL, the video title, author and
61 others. The information is stored in a dictionary which is then
62 passed to the YoutubeDL. The YoutubeDL processes this
63 information possibly downloading the video to the file system, among
64 other possible outcomes.
66 The type field determines the type of the result.
67 By far the most common value (and the default if _type is missing) is
68 "video", which indicates a single video.
70 For a video, the dictionaries must include the following fields:
73 title: Video title, unescaped.
75 Additionally, it must contain either a formats entry or a url one:
77 formats: A list of dictionaries for each format available, ordered
78 from worst to best quality.
81 * url Mandatory. The URL of the video file
82 * ext Will be calculated from URL if missing
83 * format A human-readable description of the format
84 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
and format_note fields if missing.
87 * format_id A short description of the format
88 ("mp4_h264_opus" or "19").
89 Technically optional, but strongly recommended.
90 * format_note Additional info about the format
91 ("3D" or "DASH video")
92 * width Width of the video, if known
93 * height Height of the video, if known
94 * resolution Textual description of width and height
95 * tbr Average bitrate of audio and video in KBit/s
96 * abr Average audio bitrate in KBit/s
97 * acodec Name of the audio codec in use
98 * asr Audio sampling rate in Hertz
99 * vbr Average video bitrate in KBit/s
101 * vcodec Name of the video codec in use
102 * container Name of the container format
103 * filesize The number of bytes, if known in advance
104 * filesize_approx An estimate for the number of bytes
105 * player_url SWF Player URL (used for rtmpdump).
106 * protocol The protocol that will be used for the actual
107 download, lower-case.
108 "http", "https", "rtsp", "rtmp", "rtmpe",
109 "m3u8", "m3u8_native" or "http_dash_segments".
110 * preference Order number of this format. If this field is
111 present and not None, the formats get sorted
112 by this field, regardless of all other values.
113 -1 for default (order by other properties),
114 -2 or smaller for less than default.
115 < -1000 to hide the format (if there is
116 another one which is strictly better)
117 * language Language code, e.g. "de" or "en-US".
118 * language_preference Is this in the language mentioned in
120 10 if it's what the URL is about,
121 -1 for default (don't know),
122 -10 otherwise, other values reserved for now.
123 * quality Order number of the video quality of this
124 format, irrespective of the file format.
125 -1 for default (order by other properties),
126 -2 or smaller for less than default.
127 * source_preference Order number for this video source
128 (quality takes higher priority)
129 -1 for default (order by other properties),
130 -2 or smaller for less than default.
131 * http_headers A dictionary of additional HTTP headers
132 to add to the request.
133 * stretched_ratio If given and not 1, indicates that the
134 video's pixels are not square.
135 width : height ratio as float.
136 * no_resume The server does not support resuming the
137 (HTTP or RTMP) download. Boolean.
139 url: Final video URL.
140 ext: Video filename extension.
141 format: The video format, defaults to ext (used for --get-format)
142 player_url: SWF Player URL (used for rtmpdump).
144 The following fields are optional:
146 alt_title: A secondary title of the video.
147 display_id An alternative identifier for the video, not necessarily
148 unique, but available before title. Typically, id is
149 something like "4234987", title "Dancing naked mole rats",
150 and display_id "dancing-naked-mole-rats"
151 thumbnails: A list of dictionaries, with the following entries:
152 * "id" (optional, string) - Thumbnail format ID
154 * "preference" (optional, int) - quality of the image
155 * "width" (optional, int)
156 * "height" (optional, int)
* "resolution" (optional, string "{width}x{height}",
159 thumbnail: Full URL to a video thumbnail image.
160 description: Full video description.
161 uploader: Full name of the video uploader.
162 license: License name the video is licensed under.
163 creator: The main artist who created the video.
164 release_date: The date (YYYYMMDD) when the video was released.
165 timestamp: UNIX timestamp of the moment the video became available.
166 upload_date: Video upload date (YYYYMMDD).
167 If not explicitly set, calculated from timestamp.
168 uploader_id: Nickname or id of the video uploader.
169 uploader_url: Full URL to a personal webpage of the video uploader.
170 location: Physical location where the video was filmed.
171 subtitles: The available subtitles as a dictionary in the format
172 {language: subformats}. "subformats" is a list sorted from
173 lower to higher preference, each element is a dictionary
174 with the "ext" entry and one of:
175 * "data": The subtitles file contents
176 * "url": A URL pointing to the subtitles file
177 "ext" will be calculated from URL if missing
178 automatic_captions: Like 'subtitles', used by the YoutubeIE for
179 automatically generated captions
180 duration: Length of the video in seconds, as an integer or float.
181 view_count: How many users have watched the video on the platform.
182 like_count: Number of positive ratings of the video
183 dislike_count: Number of negative ratings of the video
184 repost_count: Number of reposts of the video
average_rating: Average rating given by users, the scale used depends on the webpage
186 comment_count: Number of comments on the video
187 comments: A list of comments, each with one or more of the following
188 properties (all but one of text or html optional):
189 * "author" - human-readable name of the comment author
190 * "author_id" - user ID of the comment author
192 * "html" - Comment as HTML
193 * "text" - Plain text of the comment
194 * "timestamp" - UNIX timestamp of comment
195 * "parent" - ID of the comment this one is replying to.
196 Set to "root" to indicate that this is a
197 comment to the original video.
198 age_limit: Age restriction for the video, as an integer (years)
199 webpage_url: The URL to the video webpage, if given to youtube-dl it
200 should allow to get the same result again. (It will be set
201 by YoutubeDL if it's missing)
202 categories: A list of categories that the video falls in, for example
204 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
205 is_live: True, False, or None (=unknown). Whether this video is a
206 live stream that goes on instead of a fixed-length video.
207 start_time: Time in seconds where the reproduction should start, as
208 specified in the URL.
209 end_time: Time in seconds where the reproduction should end, as
210 specified in the URL.
212 The following fields should only be used when the video belongs to some logical
215 chapter: Name or title of the chapter the video belongs to.
216 chapter_number: Number of the chapter the video belongs to, as an integer.
217 chapter_id: Id of the chapter the video belongs to, as a unicode string.
219 The following fields should only be used when the video is an episode of some
222 series: Title of the series or programme the video episode belongs to.
223 season: Title of the season the video episode belongs to.
224 season_number: Number of the season the video episode belongs to, as an integer.
225 season_id: Id of the season the video episode belongs to, as a unicode string.
226 episode: Title of the video episode. Unlike mandatory video title field,
227 this field should denote the exact title of the video episode
228 without any kind of decoration.
229 episode_number: Number of the video episode within a season, as an integer.
230 episode_id: Id of the video episode, as a unicode string.
232 Unless mentioned otherwise, the fields should be Unicode strings.
234 Unless mentioned otherwise, None is equivalent to absence of information.
237 _type "playlist" indicates multiple videos.
238 There must be a key "entries", which is a list, an iterable, or a PagedList
239 object, each element of which is a valid dictionary by this specification.
241 Additionally, playlists can have "title", "description" and "id" attributes
242 with the same semantics as videos (see above).
245 _type "multi_video" indicates that there are multiple videos that
form a single show, for example multiple acts of an opera or TV episode.
247 It must have an entries key like a playlist and contain all the keys
248 required for a video at the same time.
251 _type "url" indicates that the video must be extracted from another
252 location, possibly by a different extractor. Its only required key is:
253 "url" - the next URL to extract.
254 The key "ie_key" can be set to the class name (minus the trailing "IE",
255 e.g. "Youtube") if the extractor class is known in advance.
256 Additionally, the dictionary may have any properties of the resolved entity
257 known in advance, for example "title" if the title of the referred video is
261 _type "url_transparent" entities have the same specification as "url", but
262 indicate that the given additional information is more precise than the one
263 associated with the resolved URL.
264 This is useful when a site employs a video service that hosts the video and
265 its technical metadata, but that video service does not embed a useful
266 title, description etc.
269 Subclasses of this one should re-define the _real_initialize() and
270 _real_extract() methods and define a _VALID_URL regexp.
271 Probably, they should also be added to the list of extractors.
273 Finally, the _WORKING attribute should be set to False for broken IEs
274 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        downloader: the owning YoutubeDL-style object used for all network
        access and console output, or None (it can be set later through
        set_downloader()).
        """
        # NOTE(review): an instance-state initialization line between the
        # docstring and this call is not visible in this chunk.
        self.set_downloader(downloader)
287 def suitable(cls, url):
288 """Receives a URL and returns True if suitable for this IE."""
290 # This does not use has/getattr intentionally - we want to know whether
291 # we have cached the regexp for *this* class, whereas getattr would also
292 # match the superclass
293 if '_VALID_URL_RE' not in cls.__dict__:
294 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
295 return cls._VALID_URL_RE.match(url) is not None
    def _match_id(cls, url):
        """Apply the cached _VALID_URL regexp to *url*."""
        # Same per-class caching discipline as suitable(): check cls.__dict__
        # directly so a subclass never reuses its parent's compiled pattern.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        # NOTE(review): the lines that extract and return the id group from
        # `m` are not visible in this chunk.
307 """Getter method for _WORKING."""
    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard against repeated initialization is presumably
        # present here but not visible in this chunk -- confirm before editing.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the `try:` opening this handler chain (and possibly an
        # initialization call) is not visible in this chunk.
            return self._real_extract(url)
        # ExtractorError raised by the extractor itself is propagated as-is.
        except ExtractorError:
        # Network-level truncation is wrapped as an "expected" error so the
        # user is not asked to file a bug report for it.
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        # Common symptoms of a broken extractor become an ExtractorError too.
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        downloader: the owning YoutubeDL-style object (or None) through which
        this extractor performs network access and console output.
        """
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses.

        The base implementation is a no-op; subclasses override it to perform
        login or other one-time setup.
        """
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Must return an info dict (or playlist/url result) as described in the
        class docstring.
        """
342 """A string for getting the InfoExtractor with get_info_extractor"""
343 return compat_str(cls.__name__[:-2])
347 return compat_str(type(self).__name__[:-2])
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
        """ Returns the response handle

        note=None prints the default "Downloading webpage" message;
        note=False suppresses all screen output.
        NOTE(review): several control-flow lines of this method are not
        visible in this chunk; the fragments below are kept verbatim.
        """
            self.report_download_webpage(video_id)
        elif note is not False:
                # No video_id available: print the note on its own.
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))
        # data, headers and query params will be ignored for `Request` objects
        if isinstance(url_or_request, compat_str):
                url_or_request = update_url_query(url_or_request, query)
            # Plain string URLs are wrapped into a sanitized Request carrying
            # the supplied POST data and headers.
            url_or_request = sanitized_Request(url_or_request, data, headers or {})
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
                # Fatal failures re-raise with the original traceback attached.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            # Non-fatal failures only warn; the return value for this path is
            # outside this view -- presumably False, TODO confirm.
            self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
        # NOTE(review): the short-circuit handling for a failed (non-fatal)
        # request between these two calls is not visible in this chunk.
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
    def _guess_encoding_from_content(content_type, webpage_bytes):
        """Guess the text encoding of *webpage_bytes*.

        Tried in order: the charset parameter of the Content-Type header, a
        <meta charset> declaration near the top of the document, then a
        UTF-16 LE byte-order mark. NOTE(review): the branch structure and the
        fallback/return lines are partly outside this view.
        """
        # 1) charset from the HTTP Content-Type header
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # 2) <meta ... charset=...> within the first 1 KiB of the document
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
        # 3) UTF-16 little-endian byte-order mark
        elif webpage_bytes.startswith(b'\xff\xfe'):
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of *urlh* and decode it to text.

        Optionally dumps/saves the raw page (debug options) and raises an
        "expected" ExtractorError for two known censorship/filtering block
        pages. NOTE(review): several lines of this method are not visible in
        this chunk; the fragments below are kept verbatim.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
            # Encoding not supplied by the caller: sniff it from the content.
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            # base64 keeps binary-ish payloads printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
                # Overlong names are truncated and made unique via an md5 tail.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

            content = webpage_bytes.decode(encoding, 'replace')
            # Unknown codec name: fall back to UTF-8 with replacement chars.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Known block page emitted by Websense filtering appliances.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Known block page used for Indian government censorship.
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
        """ Returns the data of the page as a string

        Retries up to *tries* times on truncated reads, sleeping *timeout*
        seconds between attempts. NOTE(review): the loop bookkeeping
        (success/try_count) and the final return are outside this view.
        """
        while success is False:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
            # Truncated responses are retried rather than failing immediately.
            except compat_http_client.IncompleteRead as e:
                if try_count >= tries:
                self._sleep(timeout, video_id)
    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
        """Return the xml as an xml.etree.ElementTree.Element

        transform_source, when given, is applied to the raw document before
        parsing. NOTE(review): the early-return on download failure and the
        transform_source guard are not visible in this chunk.
        """
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
        if xml_string is False:
            xml_string = transform_source(xml_string)
        # Parse from bytes so the XML declaration's encoding cannot conflict.
        return compat_etree_fromstring(xml_string.encode('utf-8'))
    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None, data=None, headers=None, query=None):
        """Download a document and parse it as JSON (see _parse_json)."""
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query)
        # Non-fatal download failure short-circuits; the return value for this
        # path is outside this view -- presumably None, TODO confirm.
        if (not fatal) and json_string is False:
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)
    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        """Parse *json_string*, optionally transformed first.

        On a parse error, raises an ExtractorError when fatal, otherwise only
        warns. NOTE(review): the transform_source guard, the `try:` line and
        the fatal/non-fatal branch keywords are not visible in this chunk.
        """
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
                raise ExtractorError(errmsg, cause=ve)
                self.report_warning(errmsg + str(ve))
528 def report_warning(self, msg, video_id=None):
529 idstr = '' if video_id is None else '%s: ' % video_id
530 self._downloader.report_warning(
531 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
533 def to_screen(self, msg):
534 """Print msg to screen, prefixing it with '[ie_name]'"""
535 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
537 def report_extraction(self, id_or_name):
538 """Report information extraction."""
539 self.to_screen('%s: Extracting information' % id_or_name)
541 def report_download_webpage(self, video_id):
542 """Report webpage download."""
543 self.to_screen('%s: Downloading webpage' % video_id)
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        # Status line only; the actual confirmation is done by the caller.
        self.to_screen('Confirming age')
    def report_login(self):
        """Report attempt to log in."""
        # Status line only; the actual login is done by the caller.
        self.to_screen('Logging in')
    def raise_login_required(msg='This video is only available for registered users'):
        """Abort extraction, telling the user account credentials are required."""
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            # NOTE(review): the remaining argument(s) closing this call are
            # not visible in this chunk.
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        """Abort extraction, telling the user the video is geo-blocked."""
        raise ExtractorError(
            '%s. You might want to use --proxy to workaround.' % msg,
            # NOTE(review): the remaining argument(s) closing this call are
            # not visible in this chunk.
565 # Methods for following #608
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed

        Builds a "_type": "url" info dict; id/title are attached only when
        provided. NOTE(review): the continuation of the dict literal and the
        return statement are not visible in this chunk.
        """
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist

        Builds a "_type": "playlist" info dict; id/title/description are
        attached only when truthy. NOTE(review): the continuation of the dict
        literal, two guard lines and the return are not visible in this chunk.
        """
        video_info = {'_type': 'playlist',
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.

        NOTE(review): several branch lines of this method are not visible in
        this chunk; the fragments below are kept verbatim.
        """
        # A single pattern (string or precompiled) is searched directly ...
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
                # ... a list of patterns is tried one by one.
                mobj = re.search(p, string, flags)

        # Colorize the field name for error output on capable terminals.
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name

            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
            # Explicit group requested by the caller.
            return mobj.group(group)
        elif default is not NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.

        NOTE(review): the guard on a missing/empty result is not visible in
        this chunk.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)

        NOTE(review): the early return, local defaults and parts of the netrc
        branch are not visible in this chunk.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username') is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    # Machine entry exists but has no credentials.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            # A broken or missing .netrc only warns; it never aborts.
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))

        return (username, password)
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None

        NOTE(review): the early-return line after the downloader check is not
        visible in this chunk.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Prefer the value passed via the --twofactor command line option.
        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        # Otherwise prompt interactively (input is hidden like a password).
        return compat_getpass('Type %s and press [Return]: ' % note)
682 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        """Build regexes matching the OpenGraph <meta> tag for *prop*.

        Two orderings are produced because the property and content
        attributes may appear in either order inside the tag.
        NOTE(review): the `return [` / `]` lines wrapping the two template
        expansions are not visible in this chunk.
        """
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),
    def _meta_regex(prop):
        """Build a verbose regex matching a <meta> tag for *prop*, capturing
        its content attribute in the 'content' named group.

        The lookahead accepts the property name in any of the usual attribute
        slots (itemprop/name/property/id/http-equiv), optionally quoted.
        """
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search *html* for the OpenGraph property *prop* and return its
        unescaped value.

        NOTE(review): the default-name guard and the None short-circuit
        around `escaped` are not visible in this chunk.
        """
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)
708 def _og_search_thumbnail(self, html, **kargs):
709 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
711 def _og_search_description(self, html, **kargs):
712 return self._og_search_property('description', html, fatal=False, **kargs)
714 def _og_search_title(self, html, **kargs):
715 return self._og_search_property('title', html, **kargs)
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Return the OpenGraph video URL from *html*.

        NOTE(review): the guard that prepends the video:secure_url regexes
        (presumably `if secure:`) is not visible in this chunk.
        """
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
723 def _og_search_url(self, html, **kargs):
724 return self._og_search_property('url', html, **kargs)
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """Return the content attribute of the <meta> tag *name* in *html*.

        NOTE(review): the assignment that defaults display_name to name is
        not visible in this chunk.
        """
        if display_name is None:
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
733 def _dc_search_uploader(self, html):
734 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        """Detect the RTA (Restricted To Adults) label in *html*.

        NOTE(review): the closing argument of re.search and the return
        statements (age-limit values) are not visible in this chunk.
        """
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        """Map the page's ICRA-style rating <meta> tag to an age limit.

        NOTE(review): the RATING_TABLE mapping and the None guard are not
        visible in this chunk.
        """
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        return RATING_TABLE.get(rating.lower())
    def _family_friendly_search(self, html):
        """Map the schema.org isFamilyFriendly <meta> tag to an age limit.

        NOTE(review): the RATING_TABLE mapping and the missing-tag return are
        not visible in this chunk.
        """
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
        return RATING_TABLE.get(family_friendly.lower())
775 def _twitter_search_player(self, html):
776 return self._html_search_meta('twitter:player', html,
777 'twitter card player')
    def _search_json_ld(self, html, video_id, **kwargs):
        """Find an application/ld+json <script> block in *html* and parse it.

        NOTE(review): the guard between the regex search and the _json_ld
        call is not visible in this chunk.
        """
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
    def _json_ld(self, json_ld, video_id, fatal=True):
        """Convert a JSON-LD object (or its string form) into info-dict fields.

        Currently recognizes the schema.org TVEpisode and Article types.
        NOTE(review): the `info = {...}` construction lines are not visible
        in this chunk; the field fragments below are kept verbatim.
        """
        # Accept either an already-parsed dict or the raw JSON string.
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if json_ld.get('@context') == 'http://schema.org':
            item_type = json_ld.get('@type')
            if item_type == 'TVEpisode':
                    'episode': unescapeHTML(json_ld.get('name')),
                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
                    'description': unescapeHTML(json_ld.get('description')),
                part_of_season = json_ld.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = json_ld.get('partOfSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
                    'title': unescapeHTML(json_ld.get('headline')),
                    'description': unescapeHTML(json_ld.get('articleBody')),
        # Drop fields that stayed None so they don't mask other sources.
        return dict((k, v) for k, v in info.items() if v is not None)
    def _hidden_inputs(html):
        """Collect name -> value pairs of hidden/submit <input> tags in *html*.

        NOTE(review): the accumulator initialization, the continue guards and
        the return are not visible in this chunk.
        """
        # Strip HTML comments first so commented-out forms are ignored.
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        for input in re.findall(r'(?i)<input([^>]+)>', html):
            # Only hidden/submit inputs are of interest.
            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
            hidden_inputs[name.group('value')] = value.group('value')
831 def _form_hidden_inputs(self, form_id, html):
832 form = self._search_regex(
833 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
834 html, '%s form' % form_id, group='form')
835 return self._hidden_inputs(form)
    def _sort_formats(self, formats, field_preference=None):
        """Sort *formats* in place from worst to best quality.

        field_preference, when given as a list/tuple, overrides the default
        multi-criteria key with a plain field lookup. NOTE(review): many
        structural lines of this method (guards, try/except, else branches,
        the inner key function's def line and parts of the key tuple) are not
        visible in this chunk; the fragments below are kept verbatim.
        """
            raise ExtractorError('No video formats found')

            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering wins over everything else.
            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                if f.get('ext') in ['f4f', 'f4m']: # Not yet supported

            # HTTP(S) delivery is slightly preferred over other protocols.
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            if f.get('vcodec') == 'none': # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if f.get('acodec') == 'none': # video only
                    if self._downloader.params.get('prefer_free_formats'):
                        ORDER = ['flv', 'mp4', 'webm']
                        ORDER = ['webm', 'flv', 'mp4']
                        ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = 0

                # Missing numeric fields sort as -1 so unknown values lose.
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        """Filter *formats*, dropping entries whose URL fails validation.

        NOTE(review): the surrounding filter(...) call and the URL argument
        line of _is_valid_url are not visible in this chunk.
        """
                lambda f: self._is_valid_url(
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
    def _remove_duplicate_formats(formats):
        """Drop formats whose URL was already seen, keeping the first
        occurrence and preserving order.

        NOTE(review): the initialization of format_urls/unique_formats and the
        loop header are not visible in this chunk.
        """
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        # Mutate in place so callers holding a reference see the result.
        formats[:] = unique_formats
    def _is_valid_url(self, url, video_id, item='video'):
        """Probe *url* with a request and report whether it is reachable.

        NOTE(review): the `return True` short-circuits, the `try:` line and
        the trailing return/raise lines are not visible in this chunk.
        """
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
        except ExtractorError as e:
            # Only plain network failures mark the URL invalid; other
            # extraction errors propagate.
            if isinstance(e.cause, compat_urllib_error.URLError):
                    '%s: %s URL is invalid, skipping' % (video_id, item))
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # NOTE(review): the return expression wrapping this conditional (the
        # 'http:'/'https:' literals) is not visible in this chunk.
            if self._downloader.params.get('prefer_insecure', False)
    def _proto_relative_url(self, url, scheme=None):
        """Prefix a protocol-relative URL ("//host/...") with a scheme.

        scheme defaults to the user's preferred scheme (see http_scheme()).
        NOTE(review): the None guard, the scheme-is-None check and the return
        statements are not visible in this chunk.
        """
        if url.startswith('//'):
                scheme = self.http_scheme()
    def _sleep(self, timeout, video_id, msg_template=None):
        """Pause for *timeout* seconds, telling the user why.

        msg_template may use the %(video_id)s and %(timeout)s placeholders.
        NOTE(review): the screen-output and actual sleep lines are not
        visible in this chunk.
        """
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
        """Download an f4m manifest and parse it into a list of formats.

        NOTE(review): the closing parameters of the signature and of the
        _download_xml call, plus the failure early-return, are not visible in
        this chunk.
        """
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,

        if manifest is False:

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal)
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
        """Turn a parsed f4m manifest element into a sorted list of formats.

        Handles both the 1.0 and 2.0 f4m namespaces and recurses into media
        entries that are themselves f4m manifests. NOTE(review): several
        structural lines (signature tail, version check, guards, the
        formats.append({ opener and parts of the dict) are not visible in
        this chunk; the fragments below are kept verbatim.
        """
        # Default to the 1.0 namespace; switch to 2.0 when 1.0 yields nothing.
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
            base_url = base_url.strip()
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                    # Relative URLs are resolved against baseURL, falling back
                    # to the manifest's own directory.
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal))
            tbr = int_or_none(media_el.attrib.get('bitrate'))
                # Format id falls back to the positional index when no bitrate.
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
        self._sort_formats(formats)
# Download an HLS (m3u8) playlist and turn it into format dicts.
# A synthetic "meta" format pointing at the master playlist is emitted
# first (with reduced preference) so format selection can fall back to it.
1028 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1029 entry_protocol='m3u8', preference=None,
1030 m3u8_id=None, note=None, errnote=None,
1034 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1038 'preference': preference - 1 if preference else -1,
1039 'resolution': 'multiple',
1040 'format_note': 'Quality selection URL',
# Resolve playlist-relative entries against the (possibly redirected) URL.
1043 format_url = lambda u: (
1045 if re.match(r'^https?://', u)
1046 else compat_urlparse.urljoin(m3u8_url, u))
1048 res = self._download_webpage_handle(
1050 note=note or 'Downloading m3u8 information',
1051 errnote=errnote or 'Failed to download m3u8 information',
1055 m3u8_doc, urlh = res
# Use the final URL after redirects as the join base.
1056 m3u8_url = urlh.geturl()
1058 # We should try extracting formats only from master playlists [1], i.e.
1059 # playlists that describe available qualities. On the other hand media
1060 # playlists [2] should be returned as is since they contain just the media
1061 # without qualities renditions.
1062 # Fortunately, master playlist can be easily distinguished from media
1063 # playlist based on particular tags availability. As of [1, 2] master
1064 # playlist tags MUST NOT appear in a media playist and vice versa.
1065 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1066 # and MUST NOT appear in master playlist thus we can clearly detect media
1067 # playlist with this criterion.
1068 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1069 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1070 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1071 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1074 'format_id': m3u8_id,
1076 'protocol': entry_protocol,
1077 'preference': preference,
# Attribute-list parser for #EXT-X-* tag lines: key=value pairs, values
# optionally double-quoted.
1081 kv_rex = re.compile(
1082 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1083 for line in m3u8_doc.splitlines():
1084 if line.startswith('#EXT-X-STREAM-INF:'):
1086 for m in kv_rex.finditer(line):
1088 if v.startswith('"'):
1090 last_info[m.group('key')] = v
1091 elif line.startswith('#EXT-X-MEDIA:'):
1093 for m in kv_rex.finditer(line):
1095 if v.startswith('"'):
1097 last_media[m.group('key')] = v
1098 elif line.startswith('#') or not line.strip():
# Non-comment, non-blank line: a variant stream URI for the preceding
# #EXT-X-STREAM-INF attributes (if any).
1101 if last_info is None:
1102 formats.append({'url': format_url(line)})
1104 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1107 format_id.append(m3u8_id)
# Prefer the rendition NAME for the id (unless it's a subtitle track),
# else the bitrate, else a running index.
1108 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1109 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1111 'format_id': '-'.join(format_id),
1112 'url': format_url(line.strip()),
1115 'protocol': entry_protocol,
1116 'preference': preference,
1118 resolution = last_info.get('RESOLUTION')
1120 width_str, height_str = resolution.split('x')
1121 f['width'] = int(width_str)
1122 f['height'] = int(height_str)
1123 codecs = last_info.get('CODECS')
1125 vcodec, acodec = [None] * 2
1126 va_codecs = codecs.split(',')
1127 if len(va_codecs) == 1:
1128 # Audio only entries usually come with single codec and
1129 # no resolution. For more robustness we also check it to
1131 if not resolution and va_codecs[0].startswith('mp4a'):
1132 vcodec, acodec = 'none', va_codecs[0]
1134 vcodec = va_codecs[0]
1136 vcodec, acodec = va_codecs[:2]
1141 if last_media is not None:
1142 f['m3u8_media'] = last_media
1146 self._sort_formats(formats)
# Qualify each component of an XPath with the given namespace
# ('a/b' -> '{ns}a/{ns}b'); empty components and '.' pass through
# unqualified. NOTE(review): no self parameter — presumably decorated
# @staticmethod on a line not shown in this listing; confirm in full source.
1150 def _xpath_ns(path, namespace=None):
1154 for c in path.split('/'):
1155 if not c or c == '.':
1158 out.append('{%s}%s' % (namespace, c))
1159 return '/'.join(out)
# Download a SMIL document and return only its formats (no metadata).
1161 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1162 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1168 namespace = self._parse_smil_namespace(smil)
1170 return self._parse_smil_formats(
1171 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
# Download a SMIL document and return a full info dict (formats plus
# title/description/thumbnails metadata) via _parse_smil.
1173 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1174 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1177 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch the SMIL document at smil_url and return it parsed as XML."""
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)
# Build a full info dict from a parsed SMIL document: formats, subtitles,
# and <head>/<meta> metadata (title, description/abstract, date), plus
# <image> thumbnails. The video id defaults to the SMIL file's basename.
1184 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1185 namespace = self._parse_smil_namespace(smil)
1187 formats = self._parse_smil_formats(
1188 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1189 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1191 video_id = os.path.splitext(url_basename(smil_url))[0]
1195 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1196 name = meta.attrib.get('name')
1197 content = meta.attrib.get('content')
1198 if not name or not content:
# First non-empty value wins for each metadata field.
1200 if not title and name == 'title':
1202 elif not description and name in ('description', 'abstract'):
1203 description = content
1204 elif not upload_date and name == 'date':
1205 upload_date = unified_strdate(content)
1208 'id': image.get('type'),
1209 'url': image.get('src'),
1210 'width': int_or_none(image.get('width')),
1211 'height': int_or_none(image.get('height')),
1212 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1216 'title': title or video_id,
1217 'description': description,
1218 'upload_date': upload_date,
1219 'thumbnails': thumbnails,
1221 'subtitles': subtitles,
def _parse_smil_namespace(self, smil):
    """Return the XML namespace URI of the root <smil> element, or None."""
    root_tag = smil.tag
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)
# Extract format dicts from a SMIL document's <video> nodes. Handles
# RTMP streams, HLS (m3u8) and HDS (f4m) sub-manifests, and plain HTTP
# progressive downloads; duplicate src values are skipped.
1228 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
# Base URL comes from <meta base=...> or <meta httpBase=...>.
1230 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1231 b = meta.get('base') or meta.get('httpBase')
1242 videos = smil.findall(self._xpath_ns('.//video', namespace))
1243 for video in videos:
1244 src = video.get('src')
1245 if not src or src in srcs:
1249 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1250 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1251 width = int_or_none(video.get('width'))
1252 height = int_or_none(video.get('height'))
1253 proto = video.get('proto')
1254 ext = video.get('ext')
1255 src_ext = determine_ext(src)
1256 streamer = video.get('streamer') or base
# RTMP: detected from explicit proto or an rtmp:// streamer/base.
1258 if proto == 'rtmp' or streamer.startswith('rtmp'):
1264 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1266 'filesize': filesize,
# Optional caller hook rewrites streamer/src (e.g. akamai auth tokens).
1270 if transform_rtmp_url:
1271 streamer, src = transform_rtmp_url(streamer, src)
1272 formats[-1].update({
1278 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1279 src_url = src_url.strip()
1281 if proto == 'm3u8' or src_ext == 'm3u8':
1282 m3u8_formats = self._extract_m3u8_formats(
1283 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
# A single-entry m3u8 result is the media playlist itself; tag it with
# this node's bitrate so it sorts correctly.
1284 if len(m3u8_formats) == 1:
1286 m3u8_formats[0].update({
1287 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1292 formats.extend(m3u8_formats)
1295 if src_ext == 'f4m':
1300 'plugin': 'flowplayer-3.2.0.1',
1302 f4m_url += '&' if '?' in f4m_url else '?'
1303 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1304 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
# Plain HTTP download: validate reachability before emitting.
1307 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1311 'ext': ext or src_ext or 'flv',
1312 'format_id': 'http-%d' % (bitrate or http_count),
1314 'filesize': filesize,
1320 self._sort_formats(formats)
# Collect subtitle entries from SMIL <textstream> nodes, grouped by
# language (several system* attributes tried, falling back to
# subtitles_lang); duplicate src URLs are skipped.
1324 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1327 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1328 src = textstream.get('src')
1329 if not src or src in urls:
1332 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1333 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1334 subtitles.setdefault(lang, []).append({
def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
    """Download an XSPF playlist and parse it into entries.

    Returns an empty list when the download fails and fatal is False.
    """
    xspf = self._download_xml(
        # Fix: note string previously read 'Downloading xpsf playlist' (typo).
        playlist_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    # _download_xml returns False (not None) on non-fatal failure.
    if xspf is False:
        return []
    return self._parse_xspf(xspf, playlist_id)
# Parse an XSPF playlist document into entry dicts. Uses the xspf core
# namespace plus StreamOne's s1 extension attributes (label/width/height)
# for per-<location> format info.
1348 def _parse_xspf(self, playlist, playlist_id):
1350 'xspf': 'http://xspf.org/ns/0/',
1351 's1': 'http://static.streamone.nl/player/ns/0',
1355 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1357 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1358 description = xpath_text(
1359 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1360 thumbnail = xpath_text(
1361 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
# Duration is given in milliseconds; scale to seconds.
1362 duration = float_or_none(
1363 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1366 'url': location.text,
1367 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1368 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1369 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1370 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1371 self._sort_formats(formats)
1376 'description': description,
1377 'thumbnail': thumbnail,
1378 'duration': duration,
# Download a DASH MPD manifest and delegate parsing to _parse_mpd_formats.
# NOTE(review): formats_dict={} is a mutable default argument; it is only
# read (via .get().copy()) in _parse_mpd_formats, so it is benign today,
# but a None sentinel would be safer.
1383 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1384 res = self._download_webpage_handle(
1386 note=note or 'Downloading MPD manifest',
1387 errnote=errnote or 'Failed to download MPD manifest',
# Base URL for relative segment references: scheme://host/.../ of the
# final (post-redirect) manifest URL.
1392 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1394 return self._parse_mpd_formats(
1395 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
# Parse a DASH MPD document into format dicts. DRM-protected adaptation
# sets/representations are skipped; segment information (SegmentList /
# SegmentTemplate) is inherited Period -> AdaptationSet -> Representation.
# NOTE(review): formats_dict={} mutable default — read-only here, but fragile.
1397 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
# Live ('dynamic') manifests are not supported by this parser.
1398 if mpd_doc.get('type') == 'dynamic':
1401 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1404 return self._xpath_ns(path, namespace)
1406 def is_drm_protected(element):
1407 return element.find(_add_ns('ContentProtection')) is not None
# Merge multi-segment info from an element over its parent's info.
1409 def extract_multisegment_info(element, ms_parent_info):
1410 ms_info = ms_parent_info.copy()
1411 segment_list = element.find(_add_ns('SegmentList'))
1412 if segment_list is not None:
1413 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1415 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1416 initialization = segment_list.find(_add_ns('Initialization'))
1417 if initialization is not None:
1418 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1420 segment_template = element.find(_add_ns('SegmentTemplate'))
1421 if segment_template is not None:
1422 start_number = segment_template.get('startNumber')
1424 ms_info['start_number'] = int(start_number)
1425 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1426 if segment_timeline is not None:
1427 s_e = segment_timeline.findall(_add_ns('S'))
1429 ms_info['total_number'] = 0
# Each <S> contributes 1 segment plus its repeat count @r.
1431 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1433 timescale = segment_template.get('timescale')
1435 ms_info['timescale'] = int(timescale)
1436 segment_duration = segment_template.get('duration')
1437 if segment_duration:
1438 ms_info['segment_duration'] = int(segment_duration)
1439 media_template = segment_template.get('media')
1441 ms_info['media_template'] = media_template
1442 initialization = segment_template.get('initialization')
1444 ms_info['initialization_url'] = initialization
1446 initialization = segment_template.find(_add_ns('Initialization'))
1447 if initialization is not None:
1448 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1451 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1453 for period in mpd_doc.findall(_add_ns('Period')):
1454 period_duration = parse_duration(period.get('duration')) or mpd_duration
1455 period_ms_info = extract_multisegment_info(period, {
1459 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1460 if is_drm_protected(adaptation_set):
1462 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1463 for representation in adaptation_set.findall(_add_ns('Representation')):
1464 if is_drm_protected(representation):
# Representation attributes override inherited AdaptationSet attributes.
1466 representation_attrib = adaptation_set.attrib.copy()
1467 representation_attrib.update(representation.attrib)
1468 # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1469 mime_type = representation_attrib['mimeType']
1470 content_type = mime_type.split('/')[0]
1471 if content_type == 'text':
1472 # TODO implement WebVTT downloading
1474 elif content_type == 'video' or content_type == 'audio':
# Accumulate BaseURL text from the innermost element outward, stopping
# once an absolute URL is formed.
1476 for element in (representation, adaptation_set, period, mpd_doc):
1477 base_url_e = element.find(_add_ns('BaseURL'))
1478 if base_url_e is not None:
1479 base_url = base_url_e.text + base_url
1480 if re.match(r'^https?://', base_url):
1482 if mpd_base_url and not re.match(r'^https?://', base_url):
1483 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1485 base_url = mpd_base_url + base_url
1486 representation_id = representation_attrib.get('id')
1487 lang = representation_attrib.get('lang')
1488 url_el = representation.find(_add_ns('BaseURL'))
1489 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1491 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1493 'ext': mimetype2ext(mime_type),
1494 'width': int_or_none(representation_attrib.get('width')),
1495 'height': int_or_none(representation_attrib.get('height')),
1496 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1497 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1498 'fps': int_or_none(representation_attrib.get('frameRate')),
1499 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1500 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1501 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1502 'format_note': 'DASH %s' % content_type,
1503 'filesize': filesize,
1505 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1506 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
# NOTE(review): "and 'segment_duration'" is a constant truthy string, so
# this condition is just "'total_number' not in ..."; it was likely meant
# to be "and 'segment_duration' in representation_ms_info" — as written,
# a manifest without @duration can raise KeyError on the next line.
1507 if 'total_number' not in representation_ms_info and 'segment_duration':
1508 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1509 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1510 media_template = representation_ms_info['media_template']
1511 media_template = media_template.replace('$RepresentationID$', representation_id)
1512 media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
# NOTE(review): str.replace returns a new string; this result is
# discarded, so "$$" escapes are never collapsed to "$". Likely should be
# media_template = media_template.replace('$$', '$').
1513 media_template.replace('$$', '$')
1514 representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1515 if 'segment_urls' in representation_ms_info:
1517 'segment_urls': representation_ms_info['segment_urls'],
1518 'protocol': 'http_dash_segments',
1520 if 'initialization_url' in representation_ms_info:
1521 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1523 'initialization_url': initialization_url,
1525 if not f.get('url'):
1526 f['url'] = initialization_url
# Merge with an already-emitted format of the same representation id
# (e.g. from formats_dict) instead of duplicating it.
1528 existing_format = next(
1529 fo for fo in formats
1530 if fo['format_id'] == representation_id)
1531 except StopIteration:
1532 full_info = formats_dict.get(representation_id, {}).copy()
1534 formats.append(full_info)
1536 existing_format.update(f)
1538 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1539 self._sort_formats(formats)
1542 def _live_title(self, name):
1543 """ Generate the title for a live video """
1544 now = datetime.datetime.now()
1545 now_str = now.strftime('%Y-%m-%d %H:%M')
1546 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    """Parse *v* as an integer via int_or_none.

    Extra kwargs (scale, get_attr, ...) are forwarded to int_or_none.
    On failure, raises ExtractorError when fatal is True, otherwise emits
    a warning and returns None.
    """
    # Fix: removed a leftover debug statement that printed
    # getattr(v, kwargs['get_attr']) to stdout whenever get_attr was passed;
    # int_or_none already handles the get_attr kwarg itself.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
# Parse v as a float via float_or_none (extra kwargs forwarded); on
# failure either raise ExtractorError (fatal=True) or warn and return
# None. Mirrors the _int helper above.
1560 def _float(self, v, name, fatal=False, **kwargs):
1561 res = float_or_none(v, **kwargs)
1563 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1565 raise ExtractorError(msg)
1567 self._downloader.report_warning(msg)
def _set_cookie(self, domain, name, value, expire_time=None):
    """Insert a cookie (path '/', secure) into the downloader's cookie jar."""
    # Cookie() takes positional fields: version, name, value, port,
    # port_specified, domain, domain_specified, domain_initial_dot, path,
    # path_specified, secure, expires, discard, comment, comment_url, rest.
    self._downloader.cookiejar.set_cookie(compat_cookiejar.Cookie(
        0, name, value, None, None, domain, None,
        None, '/', True, False, expire_time, '', None, None, None))
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    request = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
# Yield this extractor's test cases, normalizing the single _TEST dict
# and the _TESTS list into one stream; only_matching tests are filtered
# out unless requested. Each test gets a 'name' derived from the class
# name with the 'IE' suffix stripped.
1582 def get_testcases(self, include_onlymatching=False):
1583 t = getattr(self, '_TEST', None)
# Defining both _TEST and _TESTS is an authoring error.
1585 assert not hasattr(self, '_TESTS'), \
1586 '%s has _TEST and _TESTS' % type(self).__name__
1589 tests = getattr(self, '_TESTS', [])
1591 if not include_onlymatching and t.get('only_matching', False):
1593 t['name'] = type(self).__name__[:-len('IE')]
1596 def is_suitable(self, age_limit):
1597 """ Test whether the extractor is generally suitable for the given
1598 age limit (i.e. pornographic sites are not, all others usually are) """
1600 any_restricted = False
1601 for tc in self.get_testcases(include_onlymatching=False):
# Playlist tests are judged by their first entry's info_dict.
1602 if 'playlist' in tc:
1603 tc = tc['playlist'][0]
1604 is_restricted = age_restricted(
1605 tc.get('info_dict', {}).get('age_limit'), age_limit)
1606 if not is_restricted:
1608 any_restricted = any_restricted or is_restricted
# Suitable only if no test case was age-restricted beyond the limit.
1609 return not any_restricted
# Public entry point: fetch subtitles only when the user asked to write
# or list them; delegates to the subclass hook _get_subtitles.
1611 def extract_subtitles(self, *args, **kwargs):
1612 if (self._downloader.params.get('writesubtitles', False) or
1613 self._downloader.params.get('listsubtitles')):
1614 return self._get_subtitles(*args, **kwargs)
# Subclass hook: return a subtitles dict for the video. Called only via
# extract_subtitles when the user requested subtitles.
1617 def _get_subtitles(self, *args, **kwargs):
1618 raise NotImplementedError('This method must be implemented by subclasses')
# NOTE(review): no self/cls parameter — presumably @staticmethod on a
# line not shown in this listing.
1621 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1622 """ Merge subtitle items for one language. Items with duplicated URLs
1623 will be dropped. """
1624 list1_urls = set([item['url'] for item in subtitle_list1])
1625 ret = list(subtitle_list1)
# Keep list1 entries as-is; append only list2 entries with new URLs.
1626 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
# NOTE(review): takes cls — presumably @classmethod on a line not shown
# in this listing.
1630 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1631 """ Merge two subtitle dictionaries, language by language. """
1632 ret = dict(subtitle_dict1)
1633 for lang in subtitle_dict2:
1634 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
# Public entry point: fetch automatic captions only when the user asked
# to write or list subtitles; delegates to _get_automatic_captions.
1637 def extract_automatic_captions(self, *args, **kwargs):
1638 if (self._downloader.params.get('writeautomaticsub', False) or
1639 self._downloader.params.get('listsubtitles')):
1640 return self._get_automatic_captions(*args, **kwargs)
# Subclass hook: return an automatic-captions dict for the video.
1643 def _get_automatic_captions(self, *args, **kwargs):
1644 raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
    """Mark the video watched if the user opted in via --mark-watched and
    is either logged in or supplied a cookie file."""
    params = self._downloader.params
    if not params.get('mark_watched', False):
        return
    # Login check first (short-circuit), then cookie file fallback —
    # same evaluation order as the original boolean expression.
    if self._get_login_info()[0] is not None or params.get('cookiefile') is not None:
        self._mark_watched(*args, **kwargs)
# Subclass hook invoked by mark_watched once the opt-in checks pass.
1652 def _mark_watched(self, *args, **kwargs):
1653 raise NotImplementedError('This method must be implemented by subclasses')
1656 class SearchInfoExtractor(InfoExtractor):
1658 Base class for paged search queries extractors.
1659 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1660 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Build the regex matching '<key>:<query>', '<key><n>:<query>' or
# '<key>all:<query>'. NOTE(review): takes cls — presumably @classmethod
# on a line not shown in this listing.
1664 def _make_valid_url(cls):
1665 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
# NOTE(review): also presumably @classmethod.
1668 def suitable(cls, url):
1669 return re.match(cls._make_valid_url(), url) is not None
# Dispatch on the prefix: empty -> 1 result, 'all' -> _MAX_RESULTS,
# numeric -> that many results, capped at _MAX_RESULTS with a warning.
1671 def _real_extract(self, query):
1672 mobj = re.match(self._make_valid_url(), query)
1674 raise ExtractorError('Invalid search query "%s"' % query)
1676 prefix = mobj.group('prefix')
1677 query = mobj.group('query')
1679 return self._get_n_results(query, 1)
1680 elif prefix == 'all':
1681 return self._get_n_results(query, self._MAX_RESULTS)
1685 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1686 elif n > self._MAX_RESULTS:
1687 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1688 n = self._MAX_RESULTS
1689 return self._get_n_results(query, n)
# Subclass hook performing the actual search.
1691 def _get_n_results(self, query, n):
1692 """Get a specified number of results for a query"""
1693 raise NotImplementedError('This method must be implemented by subclasses')
# NOTE(review): presumably decorated @property on a line not shown here.
1696 def SEARCH_KEY(self):
1697 return self._SEARCH_KEY