youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import math
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_cookies,
  18     compat_etree_fromstring,
  19     compat_getpass,
  20     compat_http_client,
  21     compat_os_name,
  22     compat_str,
  23     compat_urllib_error,
  24     compat_urllib_parse,
  25     compat_urlparse,
  26 )
  27 from ..utils import (
  28     NO_DEFAULT,
  29     age_restricted,
  30     bug_reports_message,
  31     clean_html,
  32     compiled_regex_type,
  33     determine_ext,
  34     error_to_compat_str,
  35     ExtractorError,
  36     fix_xml_ampersands,
  37     float_or_none,
  38     int_or_none,
  39     parse_iso8601,
  40     RegexNotFoundError,
  41     sanitize_filename,
  42     sanitized_Request,
  43     unescapeHTML,
  44     unified_strdate,
  45     url_basename,
  46     xpath_text,
  47     xpath_with_ns,
  48     determine_protocol,
  49     parse_duration,
  50     mimetype2ext,
  51     update_url_query,
  52 )
  53
  54
  55 class InfoExtractor(object):
  56     """Information Extractor class.
  57
  58     Information extractors are the classes that, given a URL, extract
  59     information about the video (or videos) the URL refers to. This
  60     information includes the real video URL, the video title, author and
  61     others. The information is stored in a dictionary which is then
  62     passed to the YoutubeDL. The YoutubeDL processes this
  63     information possibly downloading the video to the file system, among
  64     other possible outcomes.
  65
  66     The type field determines the type of the result.
  67     By far the most common value (and the default if _type is missing) is
  68     "video", which indicates a single video.
  69
  70     For a video, the dictionaries must include the following fields:
  71
  72     id:             Video identifier.
  73     title:          Video title, unescaped.
  74
  75     Additionally, it must contain either a formats entry or a url one:
  76
  77     formats:        A list of dictionaries for each format available, ordered
  78                     from worst to best quality.
  79
  80                     Potential fields:
  81                     * url        Mandatory. The URL of the video file
  82                     * ext        Will be calculated from URL if missing
  83                     * format     A human-readable description of the format
  84                                  ("mp4 container with h264/opus").
  85                                  Calculated from the format_id, width, height,
  86                                  and format_note fields if missing.
  87                     * format_id  A short description of the format
  88                                  ("mp4_h264_opus" or "19").
  89                                 Technically optional, but strongly recommended.
  90                     * format_note Additional info about the format
  91                                  ("3D" or "DASH video")
  92                     * width      Width of the video, if known
  93                     * height     Height of the video, if known
  94                     * resolution Textual description of width and height
  95                     * tbr        Average bitrate of audio and video in KBit/s
  96                     * abr        Average audio bitrate in KBit/s
  97                     * acodec     Name of the audio codec in use
  98                     * asr        Audio sampling rate in Hertz
  99                     * vbr        Average video bitrate in KBit/s
 100                     * fps        Frame rate
 101                     * vcodec     Name of the video codec in use
 102                     * container  Name of the container format
 103                     * filesize   The number of bytes, if known in advance
 104                     * filesize_approx  An estimate for the number of bytes
 105                     * player_url SWF Player URL (used for rtmpdump).
 106                     * protocol   The protocol that will be used for the actual
 107                                  download, lower-case.
 108                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 109                                  "m3u8", "m3u8_native" or "http_dash_segments".
 110                     * preference Order number of this format. If this field is
 111                                  present and not None, the formats get sorted
 112                                  by this field, regardless of all other values.
 113                                  -1 for default (order by other properties),
 114                                  -2 or smaller for less than default.
 115                                  < -1000 to hide the format (if there is
 116                                     another one which is strictly better)
 117                     * language   Language code, e.g. "de" or "en-US".
 118                     * language_preference  Is this in the language mentioned in
 119                                  the URL?
 120                                  10 if it's what the URL is about,
 121                                  -1 for default (don't know),
 122                                  -10 otherwise, other values reserved for now.
 123                     * quality    Order number of the video quality of this
 124                                  format, irrespective of the file format.
 125                                  -1 for default (order by other properties),
 126                                  -2 or smaller for less than default.
 127                     * source_preference  Order number for this video source
 128                                   (quality takes higher priority)
 129                                  -1 for default (order by other properties),
 130                                  -2 or smaller for less than default.
 131                     * http_headers  A dictionary of additional HTTP headers
 132                                  to add to the request.
 133                     * stretched_ratio  If given and not 1, indicates that the
 134                                  video's pixels are not square.
 135                                  width : height ratio as float.
 136                     * no_resume  The server does not support resuming the
 137                                  (HTTP or RTMP) download. Boolean.
 138
 139     url:            Final video URL.
 140     ext:            Video filename extension.
 141     format:         The video format, defaults to ext (used for --get-format)
 142     player_url:     SWF Player URL (used for rtmpdump).
 143
 144     The following fields are optional:
 145
 146     alt_title:      A secondary title of the video.
 147     display_id      An alternative identifier for the video, not necessarily
 148                     unique, but available before title. Typically, id is
 149                     something like "4234987", title "Dancing naked mole rats",
 150                     and display_id "dancing-naked-mole-rats"
 151     thumbnails:     A list of dictionaries, with the following entries:
 152                         * "id" (optional, string) - Thumbnail format ID
 153                         * "url"
 154                         * "preference" (optional, int) - quality of the image
 155                         * "width" (optional, int)
 156                         * "height" (optional, int)
 157                         * "resolution" (optional, string "{width}x{height}",
 158                                         deprecated)
 159     thumbnail:      Full URL to a video thumbnail image.
 160     description:    Full video description.
 161     uploader:       Full name of the video uploader.
 162     license:        License name the video is licensed under.
 163     creator:        The main artist who created the video.
 164     release_date:   The date (YYYYMMDD) when the video was released.
 165     timestamp:      UNIX timestamp of the moment the video became available.
 166     upload_date:    Video upload date (YYYYMMDD).
 167                     If not explicitly set, calculated from timestamp.
 168     uploader_id:    Nickname or id of the video uploader.
 169     uploader_url:   Full URL to a personal webpage of the video uploader.
 170     location:       Physical location where the video was filmed.
 171     subtitles:      The available subtitles as a dictionary in the format
 172                     {language: subformats}. "subformats" is a list sorted from
 173                     lower to higher preference, each element is a dictionary
 174                     with the "ext" entry and one of:
 175                         * "data": The subtitles file contents
 176                         * "url": A URL pointing to the subtitles file
 177                     "ext" will be calculated from URL if missing
 178     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 179                     automatically generated captions
 180     duration:       Length of the video in seconds, as an integer or float.
 181     view_count:     How many users have watched the video on the platform.
 182     like_count:     Number of positive ratings of the video
 183     dislike_count:  Number of negative ratings of the video
 184     repost_count:   Number of reposts of the video
 185     average_rating: Average rating given by users, the scale used depends on the webpage
 186     comment_count:  Number of comments on the video
 187     comments:       A list of comments, each with one or more of the following
 188                     properties (all but one of text or html optional):
 189                         * "author" - human-readable name of the comment author
 190                         * "author_id" - user ID of the comment author
 191                         * "id" - Comment ID
 192                         * "html" - Comment as HTML
 193                         * "text" - Plain text of the comment
 194                         * "timestamp" - UNIX timestamp of comment
 195                         * "parent" - ID of the comment this one is replying to.
 196                                      Set to "root" to indicate that this is a
 197                                      comment to the original video.
 198     age_limit:      Age restriction for the video, as an integer (years)
 199     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 200                     should allow to get the same result again. (It will be set
 201                     by YoutubeDL if it's missing)
 202     categories:     A list of categories that the video falls in, for example
 203                     ["Sports", "Berlin"]
 204     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 205     is_live:        True, False, or None (=unknown). Whether this video is a
 206                     live stream that goes on instead of a fixed-length video.
 207     start_time:     Time in seconds where the reproduction should start, as
 208                     specified in the URL.
 209     end_time:       Time in seconds where the reproduction should end, as
 210                     specified in the URL.
 211
 212     The following fields should only be used when the video belongs to some logical
 213     chapter or section:
 214
 215     chapter:        Name or title of the chapter the video belongs to.
 216     chapter_number: Number of the chapter the video belongs to, as an integer.
 217     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 218
 219     The following fields should only be used when the video is an episode of some
 220     series or programme:
 221
 222     series:         Title of the series or programme the video episode belongs to.
 223     season:         Title of the season the video episode belongs to.
 224     season_number:  Number of the season the video episode belongs to, as an integer.
 225     season_id:      Id of the season the video episode belongs to, as a unicode string.
 226     episode:        Title of the video episode. Unlike mandatory video title field,
 227                     this field should denote the exact title of the video episode
 228                     without any kind of decoration.
 229     episode_number: Number of the video episode within a season, as an integer.
 230     episode_id:     Id of the video episode, as a unicode string.
 231
 232     Unless mentioned otherwise, the fields should be Unicode strings.
 233
 234     Unless mentioned otherwise, None is equivalent to absence of information.
 235
 236
 237     _type "playlist" indicates multiple videos.
 238     There must be a key "entries", which is a list, an iterable, or a PagedList
 239     object, each element of which is a valid dictionary by this specification.
 240
 241     Additionally, playlists can have "title", "description" and "id" attributes
 242     with the same semantics as videos (see above).
 243
 244
 245     _type "multi_video" indicates that there are multiple videos that
 246     form a single show, for example multiple acts of an opera or TV episode.
 247     It must have an entries key like a playlist and contain all the keys
 248     required for a video at the same time.
 249
 250
 251     _type "url" indicates that the video must be extracted from another
 252     location, possibly by a different extractor. Its only required key is:
 253     "url" - the next URL to extract.
 254     The key "ie_key" can be set to the class name (minus the trailing "IE",
 255     e.g. "Youtube") if the extractor class is known in advance.
 256     Additionally, the dictionary may have any properties of the resolved entity
 257     known in advance, for example "title" if the title of the referred video is
 258     known ahead of time.
 259
 260
 261     _type "url_transparent" entities have the same specification as "url", but
 262     indicate that the given additional information is more precise than the one
 263     associated with the resolved URL.
 264     This is useful when a site employs a video service that hosts the video and
 265     its technical metadata, but that video service does not embed a useful
 266     title, description etc.
 267
 268
 269     Subclasses of this one should re-define the _real_initialize() and
 270     _real_extract() methods and define a _VALID_URL regexp.
 271     Probably, they should also be added to the list of extractors.
 272
 273     Finally, the _WORKING attribute should be set to False for broken IEs
 274     in order to warn the users and skip the tests.
 275     """
 276
    # Set to True by initialize() once _real_initialize() has completed.
    _ready = False
    # Downloader object attached through set_downloader(); None until then.
    _downloader = None
    # Value returned by working(); set to False for broken IEs.
    _WORKING = True
 280
 281     def __init__(self, downloader=None):
 282         """Constructor. Receives an optional downloader."""
 283         self._ready = False
 284         self.set_downloader(downloader)
 285
 286     @classmethod
 287     def suitable(cls, url):
 288         """Receives a URL and returns True if suitable for this IE."""
 289
 290         # This does not use has/getattr intentionally - we want to know whether
 291         # we have cached the regexp for *this* class, whereas getattr would also
 292         # match the superclass
 293         if '_VALID_URL_RE' not in cls.__dict__:
 294             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 295         return cls._VALID_URL_RE.match(url) is not None
 296
 297     @classmethod
 298     def _match_id(cls, url):
 299         if '_VALID_URL_RE' not in cls.__dict__:
 300             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 301         m = cls._VALID_URL_RE.match(url)
 302         assert m
 303         return m.group('id')
 304
 305     @classmethod
 306     def working(cls):
 307         """Getter method for _WORKING."""
 308         return cls._WORKING
 309
 310     def initialize(self):
 311         """Initializes an instance (authentication, etc)."""
 312         if not self._ready:
 313             self._real_initialize()
 314             self._ready = True
 315
 316     def extract(self, url):
 317         """Extracts URL information and returns it in list of dicts."""
 318         try:
 319             self.initialize()
 320             return self._real_extract(url)
 321         except ExtractorError:
 322             raise
 323         except compat_http_client.IncompleteRead as e:
 324             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 325         except (KeyError, StopIteration) as e:
 326             raise ExtractorError('An extractor error has occurred.', cause=e)
 327
 328     def set_downloader(self, downloader):
 329         """Sets the downloader for this IE."""
 330         self._downloader = downloader
 331
 332     def _real_initialize(self):
 333         """Real initialization process. Redefine in subclasses."""
 334         pass
 335
 336     def _real_extract(self, url):
 337         """Real extraction process. Redefine in subclasses."""
 338         pass
 339
 340     @classmethod
 341     def ie_key(cls):
 342         """A string for getting the InfoExtractor with get_info_extractor"""
 343         return compat_str(cls.__name__[:-2])
 344
 345     @property
 346     def IE_NAME(self):
 347         return compat_str(type(self).__name__[:-2])
 348
 349     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
 350         """ Returns the response handle """
 351         if note is None:
 352             self.report_download_webpage(video_id)
 353         elif note is not False:
 354             if video_id is None:
 355                 self.to_screen('%s' % (note,))
 356             else:
 357                 self.to_screen('%s: %s' % (video_id, note))
 358         # data, headers and query params will be ignored for `Request` objects
 359         if isinstance(url_or_request, compat_str):
 360             if query:
 361                 url_or_request = update_url_query(url_or_request, query)
 362             if data or headers:
 363                 url_or_request = sanitized_Request(url_or_request, data, headers or {})
 364         try:
 365             return self._downloader.urlopen(url_or_request)
 366         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 367             if errnote is False:
 368                 return False
 369             if errnote is None:
 370                 errnote = 'Unable to download webpage'
 371
 372             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 373             if fatal:
 374                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 375             else:
 376                 self._downloader.report_warning(errmsg)
 377                 return False
 378
 379     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
 380         """ Returns a tuple (page content as string, URL handle) """
 381         # Strip hashes from the URL (#1038)
 382         if isinstance(url_or_request, (compat_str, str)):
 383             url_or_request = url_or_request.partition('#')[0]
 384
 385         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 386         if urlh is False:
 387             assert not fatal
 388             return False
 389         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 390         return (content, urlh)
 391
 392     @staticmethod
 393     def _guess_encoding_from_content(content_type, webpage_bytes):
 394         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 395         if m:
 396             encoding = m.group(1)
 397         else:
 398             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 399                           webpage_bytes[:1024])
 400             if m:
 401                 encoding = m.group(1).decode('ascii')
 402             elif webpage_bytes.startswith(b'\xff\xfe'):
 403                 encoding = 'utf-16'
 404             else:
 405                 encoding = 'utf-8'
 406
 407         return encoding
 408
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of urlh and return it decoded as a string.

        prefix, when not None, is prepended to the bytes read from urlh.
        encoding, when empty, is guessed with _guess_encoding_from_content().
        The downloader params 'dump_intermediate_pages' and 'write_pages'
        additionally print the raw bytes base64-encoded, respectively save
        them to a '.dump' file.
        Raises an expected ExtractorError when the decoded page is recognised
        as one of the known "blocked" pages checked below.
        The note, errnote and fatal parameters are not used in this body.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request is either a request object offering
            # get_full_url() or something without it, used as the URL itself.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            # Cap the base name at 240 characters; the cut-off tail is
            # replaced by '___' plus the MD5 hex digest of the full name.
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # An encoding name unknown to Python (LookupError) falls back to
        # UTF-8; undecodable bytes are replaced rather than raising.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect the Websense "blocked" page and report it as an expected error.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Detect the "URL you requested has been blocked" page (title looked
        # up in the first 512 characters only).
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content
 471
 472     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
 473         """ Returns the data of the page as a string """
 474         success = False
 475         try_count = 0
 476         while success is False:
 477             try:
 478                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 479                 success = True
 480             except compat_http_client.IncompleteRead as e:
 481                 try_count += 1
 482                 if try_count >= tries:
 483                     raise e
 484                 self._sleep(timeout, video_id)
 485         if res is False:
 486             return res
 487         else:
 488             content, _ = res
 489             return content
 490
 491     def _download_xml(self, url_or_request, video_id,
 492                       note='Downloading XML', errnote='Unable to download XML',
 493                       transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
 494         """Return the xml as an xml.etree.ElementTree.Element"""
 495         xml_string = self._download_webpage(
 496             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 497         if xml_string is False:
 498             return xml_string
 499         if transform_source:
 500             xml_string = transform_source(xml_string)
 501         return compat_etree_fromstring(xml_string.encode('utf-8'))
 502
 503     def _download_json(self, url_or_request, video_id,
 504                        note='Downloading JSON metadata',
 505                        errnote='Unable to download JSON metadata',
 506                        transform_source=None,
 507                        fatal=True, encoding=None, data=None, headers=None, query=None):
 508         json_string = self._download_webpage(
 509             url_or_request, video_id, note, errnote, fatal=fatal,
 510             encoding=encoding, data=data, headers=headers, query=query)
 511         if (not fatal) and json_string is False:
 512             return None
 513         return self._parse_json(
 514             json_string, video_id, transform_source=transform_source, fatal=fatal)
 515
 516     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 517         if transform_source:
 518             json_string = transform_source(json_string)
 519         try:
 520             return json.loads(json_string)
 521         except ValueError as ve:
 522             errmsg = '%s: Failed to parse JSON ' % video_id
 523             if fatal:
 524                 raise ExtractorError(errmsg, cause=ve)
 525             else:
 526                 self.report_warning(errmsg + str(ve))
 527
 528     def report_warning(self, msg, video_id=None):
 529         idstr = '' if video_id is None else '%s: ' % video_id
 530         self._downloader.report_warning(
 531             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 532
 533     def to_screen(self, msg):
 534         """Print msg to screen, prefixing it with '[ie_name]'"""
 535         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 536
 537     def report_extraction(self, id_or_name):
 538         """Report information extraction."""
 539         self.to_screen('%s: Extracting information' % id_or_name)
 540
 541     def report_download_webpage(self, video_id):
 542         """Report webpage download."""
 543         self.to_screen('%s: Downloading webpage' % video_id)
 544
 545     def report_age_confirmation(self):
 546         """Report attempt to confirm age."""
 547         self.to_screen('Confirming age')
 548
 549     def report_login(self):
 550         """Report attempt to log in."""
 551         self.to_screen('Logging in')
 552
 553     @staticmethod
 554     def raise_login_required(msg='This video is only available for registered users'):
 555         raise ExtractorError(
 556             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 557             expected=True)
 558
 559     @staticmethod
 560     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 561         raise ExtractorError(
 562             '%s. You might want to use --proxy to workaround.' % msg,
 563             expected=True)
 564
 565     # Methods for following #608
 566     @staticmethod
 567     def url_result(url, ie=None, video_id=None, video_title=None):
 568         """Returns a URL that points to a page that should be processed"""
 569         # TODO: ie should be the class used for getting the info
 570         video_info = {'_type': 'url',
 571                       'url': url,
 572                       'ie_key': ie}
 573         if video_id is not None:
 574             video_info['id'] = video_id
 575         if video_title is not None:
 576             video_info['title'] = video_title
 577         return video_info
 578
 579     @staticmethod
 580     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 581         """Returns a playlist"""
 582         video_info = {'_type': 'playlist',
 583                       'entries': entries}
 584         if playlist_id:
 585             video_info['id'] = playlist_id
 586         if playlist_title:
 587             video_info['title'] = playlist_title
 588         if playlist_description:
 589             video_info['description'] = playlist_description
 590         return video_info
 591
 592     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 593         """
 594         Perform a regex search on the given string, using a single or a list of
 595         patterns returning the first matching group.
 596         In case of failure return a default value or raise a WARNING or a
 597         RegexNotFoundError, depending on fatal, specifying the field name.
 598         """
 599         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 600             mobj = re.search(pattern, string, flags)
 601         else:
 602             for p in pattern:
 603                 mobj = re.search(p, string, flags)
 604                 if mobj:
 605                     break
 606
 607         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 608             _name = '\033[0;34m%s\033[0m' % name
 609         else:
 610             _name = name
 611
 612         if mobj:
 613             if group is None:
 614                 # return the first matching group
 615                 return next(g for g in mobj.groups() if g is not None)
 616             else:
 617                 return mobj.group(group)
 618         elif default is not NO_DEFAULT:
 619             return default
 620         elif fatal:
 621             raise RegexNotFoundError('Unable to extract %s' % _name)
 622         else:
 623             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 624             return None
 625
 626     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 627         """
 628         Like _search_regex, but strips HTML tags and unescapes entities.
 629         """
 630         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 631         if res:
 632             return clean_html(res).strip()
 633         else:
 634             return res
 635
 636     def _get_login_info(self):
 637         """
 638         Get the login info as (username, password)
 639         It will look in the netrc file using the _NETRC_MACHINE value
 640         If there's no info available, return (None, None)
 641         """
 642         if self._downloader is None:
 643             return (None, None)
 644
 645         username = None
 646         password = None
 647         downloader_params = self._downloader.params
 648
 649         # Attempt to use provided username and password or .netrc data
 650         if downloader_params.get('username') is not None:
 651             username = downloader_params['username']
 652             password = downloader_params['password']
 653         elif downloader_params.get('usenetrc', False):
 654             try:
 655                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 656                 if info is not None:
 657                     username = info[0]
 658                     password = info[2]
 659                 else:
 660                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 661             except (IOError, netrc.NetrcParseError) as err:
 662                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 663
 664         return (username, password)
 665
 666     def _get_tfa_info(self, note='two-factor verification code'):
 667         """
 668         Get the two-factor authentication info
 669         TODO - asking the user will be required for sms/phone verify
 670         currently just uses the command line option
 671         If there's no info available, return None
 672         """
 673         if self._downloader is None:
 674             return None
 675         downloader_params = self._downloader.params
 676
 677         if downloader_params.get('twofactor') is not None:
 678             return downloader_params['twofactor']
 679
 680         return compat_getpass('Type %s and press [Return]: ' % note)
 681
 682     # Helper functions for extracting OpenGraph info
 683     @staticmethod
 684     def _og_regexes(prop):
 685         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 686         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 687                        % {'prop': re.escape(prop)})
 688         template = r'<meta[^>]+?%s[^>]+?%s'
 689         return [
 690             template % (property_re, content_re),
 691             template % (content_re, property_re),
 692         ]
 693
 694     @staticmethod
 695     def _meta_regex(prop):
 696         return r'''(?isx)<meta
 697                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 698                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 699
 700     def _og_search_property(self, prop, html, name=None, **kargs):
 701         if name is None:
 702             name = 'OpenGraph %s' % prop
 703         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 704         if escaped is None:
 705             return None
 706         return unescapeHTML(escaped)
 707
 708     def _og_search_thumbnail(self, html, **kargs):
 709         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 710
 711     def _og_search_description(self, html, **kargs):
 712         return self._og_search_property('description', html, fatal=False, **kargs)
 713
 714     def _og_search_title(self, html, **kargs):
 715         return self._og_search_property('title', html, **kargs)
 716
 717     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 718         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 719         if secure:
 720             regexes = self._og_regexes('video:secure_url') + regexes
 721         return self._html_search_regex(regexes, html, name, **kargs)
 722
 723     def _og_search_url(self, html, **kargs):
 724         return self._og_search_property('url', html, **kargs)
 725
 726     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 727         if display_name is None:
 728             display_name = name
 729         return self._html_search_regex(
 730             self._meta_regex(name),
 731             html, display_name, fatal=fatal, group='content', **kwargs)
 732
 733     def _dc_search_uploader(self, html):
 734         return self._html_search_meta('dc.creator', html, 'uploader')
 735
 736     def _rta_search(self, html):
 737         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 738         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 739                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 740                      html):
 741             return 18
 742         return 0
 743
 744     def _media_rating_search(self, html):
 745         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 746         rating = self._html_search_meta('rating', html)
 747
 748         if not rating:
 749             return None
 750
 751         RATING_TABLE = {
 752             'safe for kids': 0,
 753             'general': 8,
 754             '14 years': 14,
 755             'mature': 17,
 756             'restricted': 19,
 757         }
 758         return RATING_TABLE.get(rating.lower())
 759
 760     def _family_friendly_search(self, html):
 761         # See http://schema.org/VideoObject
 762         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 763
 764         if not family_friendly:
 765             return None
 766
 767         RATING_TABLE = {
 768             '1': 0,
 769             'true': 0,
 770             '0': 18,
 771             'false': 18,
 772         }
 773         return RATING_TABLE.get(family_friendly.lower())
 774
 775     def _twitter_search_player(self, html):
 776         return self._html_search_meta('twitter:player', html,
 777                                       'twitter card player')
 778
 779     def _search_json_ld(self, html, video_id, **kwargs):
 780         json_ld = self._search_regex(
 781             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 782             html, 'JSON-LD', group='json_ld', **kwargs)
 783         if not json_ld:
 784             return {}
 785         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
 786
 787     def _json_ld(self, json_ld, video_id, fatal=True):
 788         if isinstance(json_ld, compat_str):
 789             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 790         if not json_ld:
 791             return {}
 792         info = {}
 793         if json_ld.get('@context') == 'http://schema.org':
 794             item_type = json_ld.get('@type')
 795             if item_type == 'TVEpisode':
 796                 info.update({
 797                     'episode': unescapeHTML(json_ld.get('name')),
 798                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 799                     'description': unescapeHTML(json_ld.get('description')),
 800                 })
 801                 part_of_season = json_ld.get('partOfSeason')
 802                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 803                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 804                 part_of_series = json_ld.get('partOfSeries')
 805                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 806                     info['series'] = unescapeHTML(part_of_series.get('name'))
 807             elif item_type == 'Article':
 808                 info.update({
 809                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 810                     'title': unescapeHTML(json_ld.get('headline')),
 811                     'description': unescapeHTML(json_ld.get('articleBody')),
 812                 })
 813         return dict((k, v) for k, v in info.items() if v is not None)
 814
 815     @staticmethod
 816     def _hidden_inputs(html):
 817         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 818         hidden_inputs = {}
 819         for input in re.findall(r'(?i)<input([^>]+)>', html):
 820             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 821                 continue
 822             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 823             if not name:
 824                 continue
 825             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 826             if not value:
 827                 continue
 828             hidden_inputs[name.group('value')] = value.group('value')
 829         return hidden_inputs
 830
 831     def _form_hidden_inputs(self, form_id, html):
 832         form = self._search_regex(
 833             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 834             html, '%s form' % form_id, group='form')
 835         return self._hidden_inputs(form)
 836
 837     def _sort_formats(self, formats, field_preference=None):
 838         if not formats:
 839             raise ExtractorError('No video formats found')
 840
 841         for f in formats:
 842             # Automatically determine tbr when missing based on abr and vbr (improves
 843             # formats sorting in some cases)
 844             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
 845                 f['tbr'] = f['abr'] + f['vbr']
 846
 847         def _formats_key(f):
 848             # TODO remove the following workaround
 849             from ..utils import determine_ext
 850             if not f.get('ext') and 'url' in f:
 851                 f['ext'] = determine_ext(f['url'])
 852
 853             if isinstance(field_preference, (list, tuple)):
 854                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 855
 856             preference = f.get('preference')
 857             if preference is None:
 858                 preference = 0
 859                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 860                     preference -= 0.5
 861
 862             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 863
 864             if f.get('vcodec') == 'none':  # audio only
 865                 if self._downloader.params.get('prefer_free_formats'):
 866                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 867                 else:
 868                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 869                 ext_preference = 0
 870                 try:
 871                     audio_ext_preference = ORDER.index(f['ext'])
 872                 except ValueError:
 873                     audio_ext_preference = -1
 874             else:
 875                 if self._downloader.params.get('prefer_free_formats'):
 876                     ORDER = ['flv', 'mp4', 'webm']
 877                 else:
 878                     ORDER = ['webm', 'flv', 'mp4']
 879                 try:
 880                     ext_preference = ORDER.index(f['ext'])
 881                 except ValueError:
 882                     ext_preference = -1
 883                 audio_ext_preference = 0
 884
 885             return (
 886                 preference,
 887                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 888                 f.get('quality') if f.get('quality') is not None else -1,
 889                 f.get('tbr') if f.get('tbr') is not None else -1,
 890                 f.get('filesize') if f.get('filesize') is not None else -1,
 891                 f.get('vbr') if f.get('vbr') is not None else -1,
 892                 f.get('height') if f.get('height') is not None else -1,
 893                 f.get('width') if f.get('width') is not None else -1,
 894                 proto_preference,
 895                 ext_preference,
 896                 f.get('abr') if f.get('abr') is not None else -1,
 897                 audio_ext_preference,
 898                 f.get('fps') if f.get('fps') is not None else -1,
 899                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 900                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 901                 f.get('format_id') if f.get('format_id') is not None else '',
 902             )
 903         formats.sort(key=_formats_key)
 904
 905     def _check_formats(self, formats, video_id):
 906         if formats:
 907             formats[:] = filter(
 908                 lambda f: self._is_valid_url(
 909                     f['url'], video_id,
 910                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 911                 formats)
 912
 913     @staticmethod
 914     def _remove_duplicate_formats(formats):
 915         format_urls = set()
 916         unique_formats = []
 917         for f in formats:
 918             if f['url'] not in format_urls:
 919                 format_urls.add(f['url'])
 920                 unique_formats.append(f)
 921         formats[:] = unique_formats
 922
 923     def _is_valid_url(self, url, video_id, item='video'):
 924         url = self._proto_relative_url(url, scheme='http:')
 925         # For now assume non HTTP(S) URLs always valid
 926         if not (url.startswith('http://') or url.startswith('https://')):
 927             return True
 928         try:
 929             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 930             return True
 931         except ExtractorError as e:
 932             if isinstance(e.cause, compat_urllib_error.URLError):
 933                 self.to_screen(
 934                     '%s: %s URL is invalid, skipping' % (video_id, item))
 935                 return False
 936             raise
 937
 938     def http_scheme(self):
 939         """ Either "http:" or "https:", depending on the user's preferences """
 940         return (
 941             'http:'
 942             if self._downloader.params.get('prefer_insecure', False)
 943             else 'https:')
 944
 945     def _proto_relative_url(self, url, scheme=None):
 946         if url is None:
 947             return url
 948         if url.startswith('//'):
 949             if scheme is None:
 950                 scheme = self.http_scheme()
 951             return scheme + url
 952         else:
 953             return url
 954
 955     def _sleep(self, timeout, video_id, msg_template=None):
 956         if msg_template is None:
 957             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 958         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 959         self.to_screen(msg)
 960         time.sleep(timeout)
 961
 962     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 963                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 964                              fatal=True):
 965         manifest = self._download_xml(
 966             manifest_url, video_id, 'Downloading f4m manifest',
 967             'Unable to download f4m manifest',
 968             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 969             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 970             transform_source=transform_source,
 971             fatal=fatal)
 972
 973         if manifest is False:
 974             return []
 975
 976         formats = []
 977         manifest_version = '1.0'
 978         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 979         if not media_nodes:
 980             manifest_version = '2.0'
 981             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 982         base_url = xpath_text(
 983             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 984             'base URL', default=None)
 985         if base_url:
 986             base_url = base_url.strip()
 987         for i, media_el in enumerate(media_nodes):
 988             if manifest_version == '2.0':
 989                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 990                 if not media_url:
 991                     continue
 992                 manifest_url = (
 993                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 994                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 995                 # If media_url is itself a f4m manifest do the recursive extraction
 996                 # since bitrates in parent manifest (this one) and media_url manifest
 997                 # may differ leading to inability to resolve the format by requested
 998                 # bitrate in f4m downloader
 999                 if determine_ext(manifest_url) == 'f4m':
1000                     formats.extend(self._extract_f4m_formats(
1001                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
1002                     continue
1003             tbr = int_or_none(media_el.attrib.get('bitrate'))
1004             formats.append({
1005                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
1006                 'url': manifest_url,
1007                 'ext': 'flv',
1008                 'tbr': tbr,
1009                 'width': int_or_none(media_el.attrib.get('width')),
1010                 'height': int_or_none(media_el.attrib.get('height')),
1011                 'preference': preference,
1012             })
1013         self._sort_formats(formats)
1014
1015         return formats
1016
1017     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1018                               entry_protocol='m3u8', preference=None,
1019                               m3u8_id=None, note=None, errnote=None,
1020                               fatal=True):
1021
1022         formats = [{
1023             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1024             'url': m3u8_url,
1025             'ext': ext,
1026             'protocol': 'm3u8',
1027             'preference': preference - 1 if preference else -1,
1028             'resolution': 'multiple',
1029             'format_note': 'Quality selection URL',
1030         }]
1031
1032         format_url = lambda u: (
1033             u
1034             if re.match(r'^https?://', u)
1035             else compat_urlparse.urljoin(m3u8_url, u))
1036
1037         res = self._download_webpage_handle(
1038             m3u8_url, video_id,
1039             note=note or 'Downloading m3u8 information',
1040             errnote=errnote or 'Failed to download m3u8 information',
1041             fatal=fatal)
1042         if res is False:
1043             return []
1044         m3u8_doc, urlh = res
1045         m3u8_url = urlh.geturl()
1046
1047         # We should try extracting formats only from master playlists [1], i.e.
1048         # playlists that describe available qualities. On the other hand media
1049         # playlists [2] should be returned as is since they contain just the media
1050         # without qualities renditions.
1051         # Fortunately, master playlist can be easily distinguished from media
1052         # playlist based on particular tags availability. As of [1, 2] master
1053         # playlist tags MUST NOT appear in a media playist and vice versa.
1054         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1055         # and MUST NOT appear in master playlist thus we can clearly detect media
1056         # playlist with this criterion.
1057         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1058         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1059         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1060         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1061             return [{
1062                 'url': m3u8_url,
1063                 'format_id': m3u8_id,
1064                 'ext': ext,
1065                 'protocol': entry_protocol,
1066                 'preference': preference,
1067             }]
1068         last_info = None
1069         last_media = None
1070         kv_rex = re.compile(
1071             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1072         for line in m3u8_doc.splitlines():
1073             if line.startswith('#EXT-X-STREAM-INF:'):
1074                 last_info = {}
1075                 for m in kv_rex.finditer(line):
1076                     v = m.group('val')
1077                     if v.startswith('"'):
1078                         v = v[1:-1]
1079                     last_info[m.group('key')] = v
1080             elif line.startswith('#EXT-X-MEDIA:'):
1081                 last_media = {}
1082                 for m in kv_rex.finditer(line):
1083                     v = m.group('val')
1084                     if v.startswith('"'):
1085                         v = v[1:-1]
1086                     last_media[m.group('key')] = v
1087             elif line.startswith('#') or not line.strip():
1088                 continue
1089             else:
1090                 if last_info is None:
1091                     formats.append({'url': format_url(line)})
1092                     continue
1093                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1094                 format_id = []
1095                 if m3u8_id:
1096                     format_id.append(m3u8_id)
1097                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1098                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1099                 f = {
1100                     'format_id': '-'.join(format_id),
1101                     'url': format_url(line.strip()),
1102                     'tbr': tbr,
1103                     'ext': ext,
1104                     'protocol': entry_protocol,
1105                     'preference': preference,
1106                 }
1107                 resolution = last_info.get('RESOLUTION')
1108                 if resolution:
1109                     width_str, height_str = resolution.split('x')
1110                     f['width'] = int(width_str)
1111                     f['height'] = int(height_str)
1112                 codecs = last_info.get('CODECS')
1113                 if codecs:
1114                     vcodec, acodec = [None] * 2
1115                     va_codecs = codecs.split(',')
1116                     if len(va_codecs) == 1:
1117                         # Audio only entries usually come with single codec and
1118                         # no resolution. For more robustness we also check it to
1119                         # be mp4 audio.
1120                         if not resolution and va_codecs[0].startswith('mp4a'):
1121                             vcodec, acodec = 'none', va_codecs[0]
1122                         else:
1123                             vcodec = va_codecs[0]
1124                     else:
1125                         vcodec, acodec = va_codecs[:2]
1126                     f.update({
1127                         'acodec': acodec,
1128                         'vcodec': vcodec,
1129                     })
1130                 if last_media is not None:
1131                     f['m3u8_media'] = last_media
1132                     last_media = None
1133                 formats.append(f)
1134                 last_info = {}
1135         self._sort_formats(formats)
1136         return formats
1137
1138     @staticmethod
1139     def _xpath_ns(path, namespace=None):
1140         if not namespace:
1141             return path
1142         out = []
1143         for c in path.split('/'):
1144             if not c or c == '.':
1145                 out.append(c)
1146             else:
1147                 out.append('{%s}%s' % (namespace, c))
1148         return '/'.join(out)
1149
1150     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1151         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1152
1153         if smil is False:
1154             assert not fatal
1155             return []
1156
1157         namespace = self._parse_smil_namespace(smil)
1158
1159         return self._parse_smil_formats(
1160             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1161
1162     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1163         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1164         if smil is False:
1165             return {}
1166         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1167
1168     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1169         return self._download_xml(
1170             smil_url, video_id, 'Downloading SMIL file',
1171             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1172
1173     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1174         namespace = self._parse_smil_namespace(smil)
1175
1176         formats = self._parse_smil_formats(
1177             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1178         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1179
1180         video_id = os.path.splitext(url_basename(smil_url))[0]
1181         title = None
1182         description = None
1183         upload_date = None
1184         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1185             name = meta.attrib.get('name')
1186             content = meta.attrib.get('content')
1187             if not name or not content:
1188                 continue
1189             if not title and name == 'title':
1190                 title = content
1191             elif not description and name in ('description', 'abstract'):
1192                 description = content
1193             elif not upload_date and name == 'date':
1194                 upload_date = unified_strdate(content)
1195
1196         thumbnails = [{
1197             'id': image.get('type'),
1198             'url': image.get('src'),
1199             'width': int_or_none(image.get('width')),
1200             'height': int_or_none(image.get('height')),
1201         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1202
1203         return {
1204             'id': video_id,
1205             'title': title or video_id,
1206             'description': description,
1207             'upload_date': upload_date,
1208             'thumbnails': thumbnails,
1209             'formats': formats,
1210             'subtitles': subtitles,
1211         }
1212
1213     def _parse_smil_namespace(self, smil):
1214         return self._search_regex(
1215             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1216
1217     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1218         base = smil_url
1219         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1220             b = meta.get('base') or meta.get('httpBase')
1221             if b:
1222                 base = b
1223                 break
1224
1225         formats = []
1226         rtmp_count = 0
1227         http_count = 0
1228         m3u8_count = 0
1229
1230         srcs = []
1231         videos = smil.findall(self._xpath_ns('.//video', namespace))
1232         for video in videos:
1233             src = video.get('src')
1234             if not src or src in srcs:
1235                 continue
1236             srcs.append(src)
1237
1238             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1239             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1240             width = int_or_none(video.get('width'))
1241             height = int_or_none(video.get('height'))
1242             proto = video.get('proto')
1243             ext = video.get('ext')
1244             src_ext = determine_ext(src)
1245             streamer = video.get('streamer') or base
1246
1247             if proto == 'rtmp' or streamer.startswith('rtmp'):
1248                 rtmp_count += 1
1249                 formats.append({
1250                     'url': streamer,
1251                     'play_path': src,
1252                     'ext': 'flv',
1253                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1254                     'tbr': bitrate,
1255                     'filesize': filesize,
1256                     'width': width,
1257                     'height': height,
1258                 })
1259                 if transform_rtmp_url:
1260                     streamer, src = transform_rtmp_url(streamer, src)
1261                     formats[-1].update({
1262                         'url': streamer,
1263                         'play_path': src,
1264                     })
1265                 continue
1266
1267             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1268             src_url = src_url.strip()
1269
1270             if proto == 'm3u8' or src_ext == 'm3u8':
1271                 m3u8_formats = self._extract_m3u8_formats(
1272                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1273                 if len(m3u8_formats) == 1:
1274                     m3u8_count += 1
1275                     m3u8_formats[0].update({
1276                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1277                         'tbr': bitrate,
1278                         'width': width,
1279                         'height': height,
1280                     })
1281                 formats.extend(m3u8_formats)
1282                 continue
1283
1284             if src_ext == 'f4m':
1285                 f4m_url = src_url
1286                 if not f4m_params:
1287                     f4m_params = {
1288                         'hdcore': '3.2.0',
1289                         'plugin': 'flowplayer-3.2.0.1',
1290                     }
1291                 f4m_url += '&' if '?' in f4m_url else '?'
1292                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1293                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1294                 continue
1295
1296             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1297                 http_count += 1
1298                 formats.append({
1299                     'url': src_url,
1300                     'ext': ext or src_ext or 'flv',
1301                     'format_id': 'http-%d' % (bitrate or http_count),
1302                     'tbr': bitrate,
1303                     'filesize': filesize,
1304                     'width': width,
1305                     'height': height,
1306                 })
1307                 continue
1308
1309         self._sort_formats(formats)
1310
1311         return formats
1312
1313     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1314         urls = []
1315         subtitles = {}
1316         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1317             src = textstream.get('src')
1318             if not src or src in urls:
1319                 continue
1320             urls.append(src)
1321             ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1322             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1323             subtitles.setdefault(lang, []).append({
1324                 'url': src,
1325                 'ext': ext,
1326             })
1327         return subtitles
1328
1329     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1330         xspf = self._download_xml(
1331             playlist_url, playlist_id, 'Downloading xpsf playlist',
1332             'Unable to download xspf manifest', fatal=fatal)
1333         if xspf is False:
1334             return []
1335         return self._parse_xspf(xspf, playlist_id)
1336
1337     def _parse_xspf(self, playlist, playlist_id):
1338         NS_MAP = {
1339             'xspf': 'http://xspf.org/ns/0/',
1340             's1': 'http://static.streamone.nl/player/ns/0',
1341         }
1342
1343         entries = []
1344         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1345             title = xpath_text(
1346                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1347             description = xpath_text(
1348                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1349             thumbnail = xpath_text(
1350                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1351             duration = float_or_none(
1352                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1353
1354             formats = [{
1355                 'url': location.text,
1356                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1357                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1358                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1359             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1360             self._sort_formats(formats)
1361
1362             entries.append({
1363                 'id': playlist_id,
1364                 'title': title,
1365                 'description': description,
1366                 'thumbnail': thumbnail,
1367                 'duration': duration,
1368                 'formats': formats,
1369             })
1370         return entries
1371
1372     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1373         res = self._download_webpage_handle(
1374             mpd_url, video_id,
1375             note=note or 'Downloading MPD manifest',
1376             errnote=errnote or 'Failed to download MPD manifest',
1377             fatal=fatal)
1378         if res is False:
1379             return []
1380         mpd, urlh = res
1381         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1382
1383         return self._parse_mpd_formats(
1384             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
1385
1386     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1387         if mpd_doc.get('type') == 'dynamic':
1388             return []
1389
1390         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1391
1392         def _add_ns(path):
1393             return self._xpath_ns(path, namespace)
1394
1395         def is_drm_protected(element):
1396             return element.find(_add_ns('ContentProtection')) is not None
1397
1398         def extract_multisegment_info(element, ms_parent_info):
1399             ms_info = ms_parent_info.copy()
1400             segment_list = element.find(_add_ns('SegmentList'))
1401             if segment_list is not None:
1402                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1403                 if segment_urls_e:
1404                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1405                 initialization = segment_list.find(_add_ns('Initialization'))
1406                 if initialization is not None:
1407                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1408             else:
1409                 segment_template = element.find(_add_ns('SegmentTemplate'))
1410                 if segment_template is not None:
1411                     start_number = segment_template.get('startNumber')
1412                     if start_number:
1413                         ms_info['start_number'] = int(start_number)
1414                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1415                     if segment_timeline is not None:
1416                         s_e = segment_timeline.findall(_add_ns('S'))
1417                         if s_e:
1418                             ms_info['total_number'] = 0
1419                             for s in s_e:
1420                                 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1421                     else:
1422                         timescale = segment_template.get('timescale')
1423                         if timescale:
1424                             ms_info['timescale'] = int(timescale)
1425                         segment_duration = segment_template.get('duration')
1426                         if segment_duration:
1427                             ms_info['segment_duration'] = int(segment_duration)
1428                     media_template = segment_template.get('media')
1429                     if media_template:
1430                         ms_info['media_template'] = media_template
1431                     initialization = segment_template.get('initialization')
1432                     if initialization:
1433                         ms_info['initialization_url'] = initialization
1434                     else:
1435                         initialization = segment_template.find(_add_ns('Initialization'))
1436                         if initialization is not None:
1437                             ms_info['initialization_url'] = initialization.attrib['sourceURL']
1438             return ms_info
1439
1440         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1441         formats = []
1442         for period in mpd_doc.findall(_add_ns('Period')):
1443             period_duration = parse_duration(period.get('duration')) or mpd_duration
1444             period_ms_info = extract_multisegment_info(period, {
1445                 'start_number': 1,
1446                 'timescale': 1,
1447             })
1448             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1449                 if is_drm_protected(adaptation_set):
1450                     continue
1451                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1452                 for representation in adaptation_set.findall(_add_ns('Representation')):
1453                     if is_drm_protected(representation):
1454                         continue
1455                     representation_attrib = adaptation_set.attrib.copy()
1456                     representation_attrib.update(representation.attrib)
1457                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1458                     mime_type = representation_attrib['mimeType']
1459                     content_type = mime_type.split('/')[0]
1460                     if content_type == 'text':
1461                         # TODO implement WebVTT downloading
1462                         pass
1463                     elif content_type == 'video' or content_type == 'audio':
1464                         base_url = ''
1465                         for element in (representation, adaptation_set, period, mpd_doc):
1466                             base_url_e = element.find(_add_ns('BaseURL'))
1467                             if base_url_e is not None:
1468                                 base_url = base_url_e.text + base_url
1469                                 if re.match(r'^https?://', base_url):
1470                                     break
1471                         if mpd_base_url and not re.match(r'^https?://', base_url):
1472                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1473                                 mpd_base_url += '/'
1474                             base_url = mpd_base_url + base_url
1475                         representation_id = representation_attrib.get('id')
1476                         lang = representation_attrib.get('lang')
1477                         url_el = representation.find(_add_ns('BaseURL'))
1478                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1479                         f = {
1480                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1481                             'url': base_url,
1482                             'ext': mimetype2ext(mime_type),
1483                             'width': int_or_none(representation_attrib.get('width')),
1484                             'height': int_or_none(representation_attrib.get('height')),
1485                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1486                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1487                             'fps': int_or_none(representation_attrib.get('frameRate')),
1488                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1489                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
1490                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1491                             'format_note': 'DASH %s' % content_type,
1492                             'filesize': filesize,
1493                         }
1494                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1495                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1496                             if 'total_number' not in representation_ms_info and 'segment_duration':
1497                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1498                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1499                             media_template = representation_ms_info['media_template']
1500                             media_template = media_template.replace('$RepresentationID$', representation_id)
1501                             media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
1502                             media_template.replace('$$', '$')
1503                             representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1504                         if 'segment_urls' in representation_ms_info:
1505                             f.update({
1506                                 'segment_urls': representation_ms_info['segment_urls'],
1507                                 'protocol': 'http_dash_segments',
1508                             })
1509                             if 'initialization_url' in representation_ms_info:
1510                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1511                                 f.update({
1512                                     'initialization_url': initialization_url,
1513                                 })
1514                                 if not f.get('url'):
1515                                     f['url'] = initialization_url
1516                         try:
1517                             existing_format = next(
1518                                 fo for fo in formats
1519                                 if fo['format_id'] == representation_id)
1520                         except StopIteration:
1521                             full_info = formats_dict.get(representation_id, {}).copy()
1522                             full_info.update(f)
1523                             formats.append(full_info)
1524                         else:
1525                             existing_format.update(f)
1526                     else:
1527                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1528         self._sort_formats(formats)
1529         return formats
1530
1531     def _live_title(self, name):
1532         """ Generate the title for a live video """
1533         now = datetime.datetime.now()
1534         now_str = now.strftime('%Y-%m-%d %H:%M')
1535         return name + ' ' + now_str
1536
1537     def _int(self, v, name, fatal=False, **kwargs):
1538         res = int_or_none(v, **kwargs)
1539         if 'get_attr' in kwargs:
1540             print(getattr(v, kwargs['get_attr']))
1541         if res is None:
1542             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1543             if fatal:
1544                 raise ExtractorError(msg)
1545             else:
1546                 self._downloader.report_warning(msg)
1547         return res
1548
1549     def _float(self, v, name, fatal=False, **kwargs):
1550         res = float_or_none(v, **kwargs)
1551         if res is None:
1552             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1553             if fatal:
1554                 raise ExtractorError(msg)
1555             else:
1556                 self._downloader.report_warning(msg)
1557         return res
1558
1559     def _set_cookie(self, domain, name, value, expire_time=None):
1560         cookie = compat_cookiejar.Cookie(
1561             0, name, value, None, None, domain, None,
1562             None, '/', True, False, expire_time, '', None, None, None)
1563         self._downloader.cookiejar.set_cookie(cookie)
1564
1565     def _get_cookies(self, url):
1566         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1567         req = sanitized_Request(url)
1568         self._downloader.cookiejar.add_cookie_header(req)
1569         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1570
1571     def get_testcases(self, include_onlymatching=False):
1572         t = getattr(self, '_TEST', None)
1573         if t:
1574             assert not hasattr(self, '_TESTS'), \
1575                 '%s has _TEST and _TESTS' % type(self).__name__
1576             tests = [t]
1577         else:
1578             tests = getattr(self, '_TESTS', [])
1579         for t in tests:
1580             if not include_onlymatching and t.get('only_matching', False):
1581                 continue
1582             t['name'] = type(self).__name__[:-len('IE')]
1583             yield t
1584
1585     def is_suitable(self, age_limit):
1586         """ Test whether the extractor is generally suitable for the given
1587         age limit (i.e. pornographic sites are not, all others usually are) """
1588
1589         any_restricted = False
1590         for tc in self.get_testcases(include_onlymatching=False):
1591             if 'playlist' in tc:
1592                 tc = tc['playlist'][0]
1593             is_restricted = age_restricted(
1594                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1595             if not is_restricted:
1596                 return True
1597             any_restricted = any_restricted or is_restricted
1598         return not any_restricted
1599
1600     def extract_subtitles(self, *args, **kwargs):
1601         if (self._downloader.params.get('writesubtitles', False) or
1602                 self._downloader.params.get('listsubtitles')):
1603             return self._get_subtitles(*args, **kwargs)
1604         return {}
1605
1606     def _get_subtitles(self, *args, **kwargs):
1607         raise NotImplementedError('This method must be implemented by subclasses')
1608
1609     @staticmethod
1610     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1611         """ Merge subtitle items for one language. Items with duplicated URLs
1612         will be dropped. """
1613         list1_urls = set([item['url'] for item in subtitle_list1])
1614         ret = list(subtitle_list1)
1615         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1616         return ret
1617
1618     @classmethod
1619     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1620         """ Merge two subtitle dictionaries, language by language. """
1621         ret = dict(subtitle_dict1)
1622         for lang in subtitle_dict2:
1623             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1624         return ret
1625
1626     def extract_automatic_captions(self, *args, **kwargs):
1627         if (self._downloader.params.get('writeautomaticsub', False) or
1628                 self._downloader.params.get('listsubtitles')):
1629             return self._get_automatic_captions(*args, **kwargs)
1630         return {}
1631
1632     def _get_automatic_captions(self, *args, **kwargs):
1633         raise NotImplementedError('This method must be implemented by subclasses')
1634
1635     def mark_watched(self, *args, **kwargs):
1636         if (self._downloader.params.get('mark_watched', False) and
1637                 (self._get_login_info()[0] is not None or
1638                     self._downloader.params.get('cookiefile') is not None)):
1639             self._mark_watched(*args, **kwargs)
1640
1641     def _mark_watched(self, *args, **kwargs):
1642         raise NotImplementedError('This method must be implemented by subclasses')
1643
1644
class SearchInfoExtractor(InfoExtractor):
    """
    Common base for extractors that handle paged search queries.
    The URLs they accept have the form _SEARCH_KEY(|all|[0-9]):{query}
    Subclasses are expected to define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex a search pseudo-URL of this extractor has to match"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search pseudo-URL for this extractor"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of results"""
        matched = re.match(self._make_valid_url(), query)
        if matched is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = matched.group('prefix', 'query')
        if prefix == '':
            # No count given: a single result
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
            if wanted > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor"""
        return self._SEARCH_KEY