youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13
  14 from ..compat import (
  15     compat_cookiejar,
  16     compat_cookies,
  17     compat_getpass,
  18     compat_http_client,
  19     compat_urllib_error,
  20     compat_urllib_parse,
  21     compat_urlparse,
  22     compat_str,
  23     compat_etree_fromstring,
  24 )
  25 from ..utils import (
  26     NO_DEFAULT,
  27     age_restricted,
  28     bug_reports_message,
  29     clean_html,
  30     compiled_regex_type,
  31     determine_ext,
  32     error_to_compat_str,
  33     ExtractorError,
  34     fix_xml_ampersands,
  35     float_or_none,
  36     int_or_none,
  37     parse_iso8601,
  38     RegexNotFoundError,
  39     sanitize_filename,
  40     sanitized_Request,
  41     unescapeHTML,
  42     unified_strdate,
  43     url_basename,
  44     xpath_text,
  45     xpath_with_ns,
  46     determine_protocol,
  47 )
  48
  49
  50 class InfoExtractor(object):
  51     """Information Extractor class.
  52
  53     Information extractors are the classes that, given a URL, extract
  54     information about the video (or videos) the URL refers to. This
  55     information includes the real video URL, the video title, author and
  56     others. The information is stored in a dictionary which is then
  57     passed to the YoutubeDL. The YoutubeDL processes this
  58     information possibly downloading the video to the file system, among
  59     other possible outcomes.
  60
  61     The type field determines the type of the result.
  62     By far the most common value (and the default if _type is missing) is
  63     "video", which indicates a single video.
  64
  65     For a video, the dictionaries must include the following fields:
  66
  67     id:             Video identifier.
  68     title:          Video title, unescaped.
  69
  70     Additionally, it must contain either a formats entry or a url one:
  71
  72     formats:        A list of dictionaries for each format available, ordered
  73                     from worst to best quality.
  74
  75                     Potential fields:
  76                     * url        Mandatory. The URL of the video file
  77                     * ext        Will be calculated from URL if missing
  78                     * format     A human-readable description of the format
  79                                  ("mp4 container with h264/opus").
  80                                  Calculated from the format_id, width, height.
  81                                  and format_note fields if missing.
  82                     * format_id  A short description of the format
  83                                  ("mp4_h264_opus" or "19").
  84                                 Technically optional, but strongly recommended.
  85                     * format_note Additional info about the format
  86                                  ("3D" or "DASH video")
  87                     * width      Width of the video, if known
  88                     * height     Height of the video, if known
  89                     * resolution Textual description of width and height
  90                     * tbr        Average bitrate of audio and video in KBit/s
  91                     * abr        Average audio bitrate in KBit/s
  92                     * acodec     Name of the audio codec in use
  93                     * asr        Audio sampling rate in Hertz
  94                     * vbr        Average video bitrate in KBit/s
  95                     * fps        Frame rate
  96                     * vcodec     Name of the video codec in use
  97                     * container  Name of the container format
  98                     * filesize   The number of bytes, if known in advance
  99                     * filesize_approx  An estimate for the number of bytes
 100                     * player_url SWF Player URL (used for rtmpdump).
 101                     * protocol   The protocol that will be used for the actual
 102                                  download, lower-case.
 103                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 104                                  "m3u8", or "m3u8_native".
 105                     * preference Order number of this format. If this field is
 106                                  present and not None, the formats get sorted
 107                                  by this field, regardless of all other values.
 108                                  -1 for default (order by other properties),
 109                                  -2 or smaller for less than default.
 110                                  < -1000 to hide the format (if there is
 111                                     another one which is strictly better)
 112                     * language   Language code, e.g. "de" or "en-US".
 113                     * language_preference  Is this in the language mentioned in
 114                                  the URL?
 115                                  10 if it's what the URL is about,
 116                                  -1 for default (don't know),
 117                                  -10 otherwise, other values reserved for now.
 118                     * quality    Order number of the video quality of this
 119                                  format, irrespective of the file format.
 120                                  -1 for default (order by other properties),
 121                                  -2 or smaller for less than default.
 122                     * source_preference  Order number for this video source
 123                                   (quality takes higher priority)
 124                                  -1 for default (order by other properties),
 125                                  -2 or smaller for less than default.
 126                     * http_headers  A dictionary of additional HTTP headers
 127                                  to add to the request.
 128                     * stretched_ratio  If given and not 1, indicates that the
 129                                  video's pixels are not square.
 130                                  width : height ratio as float.
 131                     * no_resume  The server does not support resuming the
 132                                  (HTTP or RTMP) download. Boolean.
 133
 134     url:            Final video URL.
 135     ext:            Video filename extension.
 136     format:         The video format, defaults to ext (used for --get-format)
 137     player_url:     SWF Player URL (used for rtmpdump).
 138
 139     The following fields are optional:
 140
 141     alt_title:      A secondary title of the video.
 142     display_id      An alternative identifier for the video, not necessarily
 143                     unique, but available before title. Typically, id is
 144                     something like "4234987", title "Dancing naked mole rats",
 145                     and display_id "dancing-naked-mole-rats"
 146     thumbnails:     A list of dictionaries, with the following entries:
 147                         * "id" (optional, string) - Thumbnail format ID
 148                         * "url"
 149                         * "preference" (optional, int) - quality of the image
 150                         * "width" (optional, int)
 151                         * "height" (optional, int)
 152                         * "resolution" (optional, string "{width}x{height"},
 153                                         deprecated)
 154     thumbnail:      Full URL to a video thumbnail image.
 155     description:    Full video description.
 156     uploader:       Full name of the video uploader.
 157     creator:        The main artist who created the video.
 158     release_date:   The date (YYYYMMDD) when the video was released.
 159     timestamp:      UNIX timestamp of the moment the video became available.
 160     upload_date:    Video upload date (YYYYMMDD).
 161                     If not explicitly set, calculated from timestamp.
 162     uploader_id:    Nickname or id of the video uploader.
 163     location:       Physical location where the video was filmed.
 164     subtitles:      The available subtitles as a dictionary in the format
 165                     {language: subformats}. "subformats" is a list sorted from
 166                     lower to higher preference, each element is a dictionary
 167                     with the "ext" entry and one of:
 168                         * "data": The subtitles file contents
 169                         * "url": A URL pointing to the subtitles file
 170                     "ext" will be calculated from URL if missing
 171     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 172                     automatically generated captions
 173     duration:       Length of the video in seconds, as an integer or float.
 174     view_count:     How many users have watched the video on the platform.
 175     like_count:     Number of positive ratings of the video
 176     dislike_count:  Number of negative ratings of the video
 177     repost_count:   Number of reposts of the video
 178     average_rating: Average rating give by users, the scale used depends on the webpage
 179     comment_count:  Number of comments on the video
 180     comments:       A list of comments, each with one or more of the following
 181                     properties (all but one of text or html optional):
 182                         * "author" - human-readable name of the comment author
 183                         * "author_id" - user ID of the comment author
 184                         * "id" - Comment ID
 185                         * "html" - Comment as HTML
 186                         * "text" - Plain text of the comment
 187                         * "timestamp" - UNIX timestamp of comment
 188                         * "parent" - ID of the comment this one is replying to.
 189                                      Set to "root" to indicate that this is a
 190                                      comment to the original video.
 191     age_limit:      Age restriction for the video, as an integer (years)
 192     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 193                     should allow to get the same result again. (It will be set
 194                     by YoutubeDL if it's missing)
 195     categories:     A list of categories that the video falls in, for example
 196                     ["Sports", "Berlin"]
 197     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 198     is_live:        True, False, or None (=unknown). Whether this video is a
 199                     live stream that goes on instead of a fixed-length video.
 200     start_time:     Time in seconds where the reproduction should start, as
 201                     specified in the URL.
 202     end_time:       Time in seconds where the reproduction should end, as
 203                     specified in the URL.
 204
 205     The following fields should only be used when the video belongs to some logical
 206     chapter or section:
 207
 208     chapter:        Name or title of the chapter the video belongs to.
 209     chapter_number: Number of the chapter the video belongs to, as an integer.
 210     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 211
 212     The following fields should only be used when the video is an episode of some
 213     series or programme:
 214
 215     series:         Title of the series or programme the video episode belongs to.
 216     season:         Title of the season the video episode belongs to.
 217     season_number:  Number of the season the video episode belongs to, as an integer.
 218     season_id:      Id of the season the video episode belongs to, as a unicode string.
 219     episode:        Title of the video episode. Unlike mandatory video title field,
 220                     this field should denote the exact title of the video episode
 221                     without any kind of decoration.
 222     episode_number: Number of the video episode within a season, as an integer.
 223     episode_id:     Id of the video episode, as a unicode string.
 224
 225     Unless mentioned otherwise, the fields should be Unicode strings.
 226
 227     Unless mentioned otherwise, None is equivalent to absence of information.
 228
 229
 230     _type "playlist" indicates multiple videos.
 231     There must be a key "entries", which is a list, an iterable, or a PagedList
 232     object, each element of which is a valid dictionary by this specification.
 233
 234     Additionally, playlists can have "title", "description" and "id" attributes
 235     with the same semantics as videos (see above).
 236
 237
 238     _type "multi_video" indicates that there are multiple videos that
 239     form a single show, for examples multiple acts of an opera or TV episode.
 240     It must have an entries key like a playlist and contain all the keys
 241     required for a video at the same time.
 242
 243
 244     _type "url" indicates that the video must be extracted from another
 245     location, possibly by a different extractor. Its only required key is:
 246     "url" - the next URL to extract.
 247     The key "ie_key" can be set to the class name (minus the trailing "IE",
 248     e.g. "Youtube") if the extractor class is known in advance.
 249     Additionally, the dictionary may have any properties of the resolved entity
 250     known in advance, for example "title" if the title of the referred video is
 251     known ahead of time.
 252
 253
 254     _type "url_transparent" entities have the same specification as "url", but
 255     indicate that the given additional information is more precise than the one
 256     associated with the resolved URL.
 257     This is useful when a site employs a video service that hosts the video and
 258     its technical metadata, but that video service does not embed a useful
 259     title, description etc.
 260
 261
 262     Subclasses of this one should re-define the _real_initialize() and
 263     _real_extract() methods and define a _VALID_URL regexp.
 264     Probably, they should also be added to the list of extractors.
 265
 266     Finally, the _WORKING attribute should be set to False for broken IEs
 267     in order to warn the users and skip the tests.
 268     """
 269
 270     _ready = False
 271     _downloader = None
 272     _WORKING = True
 273
 274     def __init__(self, downloader=None):
 275         """Constructor. Receives an optional downloader."""
 276         self._ready = False
 277         self.set_downloader(downloader)
 278
 279     @classmethod
 280     def suitable(cls, url):
 281         """Receives a URL and returns True if suitable for this IE."""
 282
 283         # This does not use has/getattr intentionally - we want to know whether
 284         # we have cached the regexp for *this* class, whereas getattr would also
 285         # match the superclass
 286         if '_VALID_URL_RE' not in cls.__dict__:
 287             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 288         return cls._VALID_URL_RE.match(url) is not None
 289
 290     @classmethod
 291     def _match_id(cls, url):
 292         if '_VALID_URL_RE' not in cls.__dict__:
 293             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 294         m = cls._VALID_URL_RE.match(url)
 295         assert m
 296         return m.group('id')
 297
 298     @classmethod
 299     def working(cls):
 300         """Getter method for _WORKING."""
 301         return cls._WORKING
 302
 303     def initialize(self):
 304         """Initializes an instance (authentication, etc)."""
 305         if not self._ready:
 306             self._real_initialize()
 307             self._ready = True
 308
 309     def extract(self, url):
 310         """Extracts URL information and returns it in list of dicts."""
 311         try:
 312             self.initialize()
 313             return self._real_extract(url)
 314         except ExtractorError:
 315             raise
 316         except compat_http_client.IncompleteRead as e:
 317             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 318         except (KeyError, StopIteration) as e:
 319             raise ExtractorError('An extractor error has occurred.', cause=e)
 320
 321     def set_downloader(self, downloader):
 322         """Sets the downloader for this IE."""
 323         self._downloader = downloader
 324
 325     def _real_initialize(self):
 326         """Real initialization process. Redefine in subclasses."""
 327         pass
 328
 329     def _real_extract(self, url):
 330         """Real extraction process. Redefine in subclasses."""
 331         pass
 332
 333     @classmethod
 334     def ie_key(cls):
 335         """A string for getting the InfoExtractor with get_info_extractor"""
 336         return compat_str(cls.__name__[:-2])
 337
 338     @property
 339     def IE_NAME(self):
 340         return compat_str(type(self).__name__[:-2])
 341
 342     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 343         """ Returns the response handle """
 344         if note is None:
 345             self.report_download_webpage(video_id)
 346         elif note is not False:
 347             if video_id is None:
 348                 self.to_screen('%s' % (note,))
 349             else:
 350                 self.to_screen('%s: %s' % (video_id, note))
 351         try:
 352             return self._downloader.urlopen(url_or_request)
 353         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 354             if errnote is False:
 355                 return False
 356             if errnote is None:
 357                 errnote = 'Unable to download webpage'
 358
 359             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 360             if fatal:
 361                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 362             else:
 363                 self._downloader.report_warning(errmsg)
 364                 return False
 365
 366     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 367         """ Returns a tuple (page content as string, URL handle) """
 368         # Strip hashes from the URL (#1038)
 369         if isinstance(url_or_request, (compat_str, str)):
 370             url_or_request = url_or_request.partition('#')[0]
 371
 372         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 373         if urlh is False:
 374             assert not fatal
 375             return False
 376         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 377         return (content, urlh)
 378
 379     @staticmethod
 380     def _guess_encoding_from_content(content_type, webpage_bytes):
 381         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 382         if m:
 383             encoding = m.group(1)
 384         else:
 385             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 386                           webpage_bytes[:1024])
 387             if m:
 388                 encoding = m.group(1).decode('ascii')
 389             elif webpage_bytes.startswith(b'\xff\xfe'):
 390                 encoding = 'utf-16'
 391             else:
 392                 encoding = 'utf-8'
 393
 394         return encoding
 395
 396     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 397         content_type = urlh.headers.get('Content-Type', '')
 398         webpage_bytes = urlh.read()
 399         if prefix is not None:
 400             webpage_bytes = prefix + webpage_bytes
 401         if not encoding:
 402             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 403         if self._downloader.params.get('dump_intermediate_pages', False):
 404             try:
 405                 url = url_or_request.get_full_url()
 406             except AttributeError:
 407                 url = url_or_request
 408             self.to_screen('Dumping request to ' + url)
 409             dump = base64.b64encode(webpage_bytes).decode('ascii')
 410             self._downloader.to_screen(dump)
 411         if self._downloader.params.get('write_pages', False):
 412             try:
 413                 url = url_or_request.get_full_url()
 414             except AttributeError:
 415                 url = url_or_request
 416             basen = '%s_%s' % (video_id, url)
 417             if len(basen) > 240:
 418                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 419                 basen = basen[:240 - len(h)] + h
 420             raw_filename = basen + '.dump'
 421             filename = sanitize_filename(raw_filename, restricted=True)
 422             self.to_screen('Saving request to ' + filename)
 423             # Working around MAX_PATH limitation on Windows (see
 424             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 425             if os.name == 'nt':
 426                 absfilepath = os.path.abspath(filename)
 427                 if len(absfilepath) > 259:
 428                     filename = '\\\\?\\' + absfilepath
 429             with open(filename, 'wb') as outf:
 430                 outf.write(webpage_bytes)
 431
 432         try:
 433             content = webpage_bytes.decode(encoding, 'replace')
 434         except LookupError:
 435             content = webpage_bytes.decode('utf-8', 'replace')
 436
 437         if ('<title>Access to this site is blocked</title>' in content and
 438                 'Websense' in content[:512]):
 439             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 440             blocked_iframe = self._html_search_regex(
 441                 r'<iframe src="([^"]+)"', content,
 442                 'Websense information URL', default=None)
 443             if blocked_iframe:
 444                 msg += ' Visit %s for more details' % blocked_iframe
 445             raise ExtractorError(msg, expected=True)
 446         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 447             msg = (
 448                 'Access to this webpage has been blocked by Indian censorship. '
 449                 'Use a VPN or proxy server (with --proxy) to route around it.')
 450             block_msg = self._html_search_regex(
 451                 r'</h1><p>(.*?)</p>',
 452                 content, 'block message', default=None)
 453             if block_msg:
 454                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 455             raise ExtractorError(msg, expected=True)
 456
 457         return content
 458
 459     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 460         """ Returns the data of the page as a string """
 461         success = False
 462         try_count = 0
 463         while success is False:
 464             try:
 465                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 466                 success = True
 467             except compat_http_client.IncompleteRead as e:
 468                 try_count += 1
 469                 if try_count >= tries:
 470                     raise e
 471                 self._sleep(timeout, video_id)
 472         if res is False:
 473             return res
 474         else:
 475             content, _ = res
 476             return content
 477
 478     def _download_xml(self, url_or_request, video_id,
 479                       note='Downloading XML', errnote='Unable to download XML',
 480                       transform_source=None, fatal=True, encoding=None):
 481         """Return the xml as an xml.etree.ElementTree.Element"""
 482         xml_string = self._download_webpage(
 483             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 484         if xml_string is False:
 485             return xml_string
 486         if transform_source:
 487             xml_string = transform_source(xml_string)
 488         return compat_etree_fromstring(xml_string.encode('utf-8'))
 489
 490     def _download_json(self, url_or_request, video_id,
 491                        note='Downloading JSON metadata',
 492                        errnote='Unable to download JSON metadata',
 493                        transform_source=None,
 494                        fatal=True, encoding=None):
 495         json_string = self._download_webpage(
 496             url_or_request, video_id, note, errnote, fatal=fatal,
 497             encoding=encoding)
 498         if (not fatal) and json_string is False:
 499             return None
 500         return self._parse_json(
 501             json_string, video_id, transform_source=transform_source, fatal=fatal)
 502
 503     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 504         if transform_source:
 505             json_string = transform_source(json_string)
 506         try:
 507             return json.loads(json_string)
 508         except ValueError as ve:
 509             errmsg = '%s: Failed to parse JSON ' % video_id
 510             if fatal:
 511                 raise ExtractorError(errmsg, cause=ve)
 512             else:
 513                 self.report_warning(errmsg + str(ve))
 514
 515     def report_warning(self, msg, video_id=None):
 516         idstr = '' if video_id is None else '%s: ' % video_id
 517         self._downloader.report_warning(
 518             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 519
 520     def to_screen(self, msg):
 521         """Print msg to screen, prefixing it with '[ie_name]'"""
 522         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 523
 524     def report_extraction(self, id_or_name):
 525         """Report information extraction."""
 526         self.to_screen('%s: Extracting information' % id_or_name)
 527
 528     def report_download_webpage(self, video_id):
 529         """Report webpage download."""
 530         self.to_screen('%s: Downloading webpage' % video_id)
 531
 532     def report_age_confirmation(self):
 533         """Report attempt to confirm age."""
 534         self.to_screen('Confirming age')
 535
 536     def report_login(self):
 537         """Report attempt to log in."""
 538         self.to_screen('Logging in')
 539
 540     @staticmethod
 541     def raise_login_required(msg='This video is only available for registered users'):
 542         raise ExtractorError(
 543             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 544             expected=True)
 545
 546     @staticmethod
 547     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
 548         raise ExtractorError(
 549             '%s. You might want to use --proxy to workaround.' % msg,
 550             expected=True)
 551
 552     # Methods for following #608
 553     @staticmethod
 554     def url_result(url, ie=None, video_id=None, video_title=None):
 555         """Returns a URL that points to a page that should be processed"""
 556         # TODO: ie should be the class used for getting the info
 557         video_info = {'_type': 'url',
 558                       'url': url,
 559                       'ie_key': ie}
 560         if video_id is not None:
 561             video_info['id'] = video_id
 562         if video_title is not None:
 563             video_info['title'] = video_title
 564         return video_info
 565
 566     @staticmethod
 567     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 568         """Returns a playlist"""
 569         video_info = {'_type': 'playlist',
 570                       'entries': entries}
 571         if playlist_id:
 572             video_info['id'] = playlist_id
 573         if playlist_title:
 574             video_info['title'] = playlist_title
 575         if playlist_description:
 576             video_info['description'] = playlist_description
 577         return video_info
 578
 579     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 580         """
 581         Perform a regex search on the given string, using a single or a list of
 582         patterns returning the first matching group.
 583         In case of failure return a default value or raise a WARNING or a
 584         RegexNotFoundError, depending on fatal, specifying the field name.
 585         """
 586         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 587             mobj = re.search(pattern, string, flags)
 588         else:
 589             for p in pattern:
 590                 mobj = re.search(p, string, flags)
 591                 if mobj:
 592                     break
 593
 594         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 595             _name = '\033[0;34m%s\033[0m' % name
 596         else:
 597             _name = name
 598
 599         if mobj:
 600             if group is None:
 601                 # return the first matching group
 602                 return next(g for g in mobj.groups() if g is not None)
 603             else:
 604                 return mobj.group(group)
 605         elif default is not NO_DEFAULT:
 606             return default
 607         elif fatal:
 608             raise RegexNotFoundError('Unable to extract %s' % _name)
 609         else:
 610             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 611             return None
 612
 613     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 614         """
 615         Like _search_regex, but strips HTML tags and unescapes entities.
 616         """
 617         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 618         if res:
 619             return clean_html(res).strip()
 620         else:
 621             return res
 622
 623     def _get_login_info(self):
 624         """
 625         Get the login info as (username, password)
 626         It will look in the netrc file using the _NETRC_MACHINE value
 627         If there's no info available, return (None, None)
 628         """
 629         if self._downloader is None:
 630             return (None, None)
 631
 632         username = None
 633         password = None
 634         downloader_params = self._downloader.params
 635
 636         # Attempt to use provided username and password or .netrc data
 637         if downloader_params.get('username', None) is not None:
 638             username = downloader_params['username']
 639             password = downloader_params['password']
 640         elif downloader_params.get('usenetrc', False):
 641             try:
 642                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 643                 if info is not None:
 644                     username = info[0]
 645                     password = info[2]
 646                 else:
 647                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 648             except (IOError, netrc.NetrcParseError) as err:
 649                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 650
 651         return (username, password)
 652
 653     def _get_tfa_info(self, note='two-factor verification code'):
 654         """
 655         Get the two-factor authentication info
 656         TODO - asking the user will be required for sms/phone verify
 657         currently just uses the command line option
 658         If there's no info available, return None
 659         """
 660         if self._downloader is None:
 661             return None
 662         downloader_params = self._downloader.params
 663
 664         if downloader_params.get('twofactor', None) is not None:
 665             return downloader_params['twofactor']
 666
 667         return compat_getpass('Type %s and press [Return]: ' % note)
 668
 669     # Helper functions for extracting OpenGraph info
 670     @staticmethod
 671     def _og_regexes(prop):
 672         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 673         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 674                        % {'prop': re.escape(prop)})
 675         template = r'<meta[^>]+?%s[^>]+?%s'
 676         return [
 677             template % (property_re, content_re),
 678             template % (content_re, property_re),
 679         ]
 680
 681     @staticmethod
 682     def _meta_regex(prop):
 683         return r'''(?isx)<meta
 684                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 685                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 686
 687     def _og_search_property(self, prop, html, name=None, **kargs):
 688         if name is None:
 689             name = 'OpenGraph %s' % prop
 690         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 691         if escaped is None:
 692             return None
 693         return unescapeHTML(escaped)
 694
 695     def _og_search_thumbnail(self, html, **kargs):
 696         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 697
 698     def _og_search_description(self, html, **kargs):
 699         return self._og_search_property('description', html, fatal=False, **kargs)
 700
 701     def _og_search_title(self, html, **kargs):
 702         return self._og_search_property('title', html, **kargs)
 703
 704     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 705         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 706         if secure:
 707             regexes = self._og_regexes('video:secure_url') + regexes
 708         return self._html_search_regex(regexes, html, name, **kargs)
 709
 710     def _og_search_url(self, html, **kargs):
 711         return self._og_search_property('url', html, **kargs)
 712
 713     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 714         if display_name is None:
 715             display_name = name
 716         return self._html_search_regex(
 717             self._meta_regex(name),
 718             html, display_name, fatal=fatal, group='content', **kwargs)
 719
 720     def _dc_search_uploader(self, html):
 721         return self._html_search_meta('dc.creator', html, 'uploader')
 722
 723     def _rta_search(self, html):
 724         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 725         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 726                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 727                      html):
 728             return 18
 729         return 0
 730
 731     def _media_rating_search(self, html):
 732         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 733         rating = self._html_search_meta('rating', html)
 734
 735         if not rating:
 736             return None
 737
 738         RATING_TABLE = {
 739             'safe for kids': 0,
 740             'general': 8,
 741             '14 years': 14,
 742             'mature': 17,
 743             'restricted': 19,
 744         }
 745         return RATING_TABLE.get(rating.lower(), None)
 746
 747     def _family_friendly_search(self, html):
 748         # See http://schema.org/VideoObject
 749         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 750
 751         if not family_friendly:
 752             return None
 753
 754         RATING_TABLE = {
 755             '1': 0,
 756             'true': 0,
 757             '0': 18,
 758             'false': 18,
 759         }
 760         return RATING_TABLE.get(family_friendly.lower(), None)
 761
 762     def _twitter_search_player(self, html):
 763         return self._html_search_meta('twitter:player', html,
 764                                       'twitter card player')
 765
 766     def _search_json_ld(self, html, video_id, fatal=True):
 767         json_ld = self._search_regex(
 768             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 769             html, 'JSON-LD', fatal=fatal, group='json_ld')
 770         if not json_ld:
 771             return {}
 772         return self._json_ld(json_ld, video_id, fatal=fatal)
 773
 774     def _json_ld(self, json_ld, video_id, fatal=True):
 775         if isinstance(json_ld, compat_str):
 776             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 777         if not json_ld:
 778             return {}
 779         info = {}
 780         if json_ld.get('@context') == 'http://schema.org':
 781             item_type = json_ld.get('@type')
 782             if item_type == 'TVEpisode':
 783                 info.update({
 784                     'episode': unescapeHTML(json_ld.get('name')),
 785                     'episode_number': int_or_none(json_ld.get('episodeNumber')),
 786                     'description': unescapeHTML(json_ld.get('description')),
 787                 })
 788                 part_of_season = json_ld.get('partOfSeason')
 789                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 790                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 791                 part_of_series = json_ld.get('partOfSeries')
 792                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 793                     info['series'] = unescapeHTML(part_of_series.get('name'))
 794             elif item_type == 'Article':
 795                 info.update({
 796                     'timestamp': parse_iso8601(json_ld.get('datePublished')),
 797                     'title': unescapeHTML(json_ld.get('headline')),
 798                     'description': unescapeHTML(json_ld.get('articleBody')),
 799                 })
 800         return dict((k, v) for k, v in info.items() if v is not None)
 801
 802     @staticmethod
 803     def _hidden_inputs(html):
 804         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
 805         hidden_inputs = {}
 806         for input in re.findall(r'(?i)<input([^>]+)>', html):
 807             if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
 808                 continue
 809             name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
 810             if not name:
 811                 continue
 812             value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
 813             if not value:
 814                 continue
 815             hidden_inputs[name.group('value')] = value.group('value')
 816         return hidden_inputs
 817
 818     def _form_hidden_inputs(self, form_id, html):
 819         form = self._search_regex(
 820             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 821             html, '%s form' % form_id, group='form')
 822         return self._hidden_inputs(form)
 823
 824     def _sort_formats(self, formats, field_preference=None):
 825         if not formats:
 826             raise ExtractorError('No video formats found')
 827
 828         def _formats_key(f):
 829             # TODO remove the following workaround
 830             from ..utils import determine_ext
 831             if not f.get('ext') and 'url' in f:
 832                 f['ext'] = determine_ext(f['url'])
 833
 834             if isinstance(field_preference, (list, tuple)):
 835                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 836
 837             preference = f.get('preference')
 838             if preference is None:
 839                 preference = 0
 840                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 841                     preference -= 0.5
 842
 843             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 844
 845             if f.get('vcodec') == 'none':  # audio only
 846                 if self._downloader.params.get('prefer_free_formats'):
 847                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 848                 else:
 849                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 850                 ext_preference = 0
 851                 try:
 852                     audio_ext_preference = ORDER.index(f['ext'])
 853                 except ValueError:
 854                     audio_ext_preference = -1
 855             else:
 856                 if self._downloader.params.get('prefer_free_formats'):
 857                     ORDER = ['flv', 'mp4', 'webm']
 858                 else:
 859                     ORDER = ['webm', 'flv', 'mp4']
 860                 try:
 861                     ext_preference = ORDER.index(f['ext'])
 862                 except ValueError:
 863                     ext_preference = -1
 864                 audio_ext_preference = 0
 865
 866             return (
 867                 preference,
 868                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 869                 f.get('quality') if f.get('quality') is not None else -1,
 870                 f.get('tbr') if f.get('tbr') is not None else -1,
 871                 f.get('filesize') if f.get('filesize') is not None else -1,
 872                 f.get('vbr') if f.get('vbr') is not None else -1,
 873                 f.get('height') if f.get('height') is not None else -1,
 874                 f.get('width') if f.get('width') is not None else -1,
 875                 proto_preference,
 876                 ext_preference,
 877                 f.get('abr') if f.get('abr') is not None else -1,
 878                 audio_ext_preference,
 879                 f.get('fps') if f.get('fps') is not None else -1,
 880                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 881                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 882                 f.get('format_id') if f.get('format_id') is not None else '',
 883             )
 884         formats.sort(key=_formats_key)
 885
 886     def _check_formats(self, formats, video_id):
 887         if formats:
 888             formats[:] = filter(
 889                 lambda f: self._is_valid_url(
 890                     f['url'], video_id,
 891                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 892                 formats)
 893
 894     def _is_valid_url(self, url, video_id, item='video'):
 895         url = self._proto_relative_url(url, scheme='http:')
 896         # For now assume non HTTP(S) URLs always valid
 897         if not (url.startswith('http://') or url.startswith('https://')):
 898             return True
 899         try:
 900             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 901             return True
 902         except ExtractorError as e:
 903             if isinstance(e.cause, compat_urllib_error.URLError):
 904                 self.to_screen(
 905                     '%s: %s URL is invalid, skipping' % (video_id, item))
 906                 return False
 907             raise
 908
 909     def http_scheme(self):
 910         """ Either "http:" or "https:", depending on the user's preferences """
 911         return (
 912             'http:'
 913             if self._downloader.params.get('prefer_insecure', False)
 914             else 'https:')
 915
 916     def _proto_relative_url(self, url, scheme=None):
 917         if url is None:
 918             return url
 919         if url.startswith('//'):
 920             if scheme is None:
 921                 scheme = self.http_scheme()
 922             return scheme + url
 923         else:
 924             return url
 925
 926     def _sleep(self, timeout, video_id, msg_template=None):
 927         if msg_template is None:
 928             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 929         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 930         self.to_screen(msg)
 931         time.sleep(timeout)
 932
 933     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 934                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
 935                              fatal=True):
 936         manifest = self._download_xml(
 937             manifest_url, video_id, 'Downloading f4m manifest',
 938             'Unable to download f4m manifest',
 939             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 940             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 941             transform_source=transform_source,
 942             fatal=fatal)
 943
 944         if manifest is False:
 945             return []
 946
 947         formats = []
 948         manifest_version = '1.0'
 949         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 950         if not media_nodes:
 951             manifest_version = '2.0'
 952             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 953         base_url = xpath_text(
 954             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
 955             'base URL', default=None)
 956         if base_url:
 957             base_url = base_url.strip()
 958         for i, media_el in enumerate(media_nodes):
 959             if manifest_version == '2.0':
 960                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 961                 if not media_url:
 962                     continue
 963                 manifest_url = (
 964                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 965                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
 966                 # If media_url is itself a f4m manifest do the recursive extraction
 967                 # since bitrates in parent manifest (this one) and media_url manifest
 968                 # may differ leading to inability to resolve the format by requested
 969                 # bitrate in f4m downloader
 970                 if determine_ext(manifest_url) == 'f4m':
 971                     formats.extend(self._extract_f4m_formats(
 972                         manifest_url, video_id, preference, f4m_id, fatal=fatal))
 973                     continue
 974             tbr = int_or_none(media_el.attrib.get('bitrate'))
 975             formats.append({
 976                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 977                 'url': manifest_url,
 978                 'ext': 'flv',
 979                 'tbr': tbr,
 980                 'width': int_or_none(media_el.attrib.get('width')),
 981                 'height': int_or_none(media_el.attrib.get('height')),
 982                 'preference': preference,
 983             })
 984         self._sort_formats(formats)
 985
 986         return formats
 987
 988     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 989                               entry_protocol='m3u8', preference=None,
 990                               m3u8_id=None, note=None, errnote=None,
 991                               fatal=True):
 992
 993         formats = [{
 994             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
 995             'url': m3u8_url,
 996             'ext': ext,
 997             'protocol': 'm3u8',
 998             'preference': preference - 1 if preference else -1,
 999             'resolution': 'multiple',
1000             'format_note': 'Quality selection URL',
1001         }]
1002
1003         format_url = lambda u: (
1004             u
1005             if re.match(r'^https?://', u)
1006             else compat_urlparse.urljoin(m3u8_url, u))
1007
1008         res = self._download_webpage_handle(
1009             m3u8_url, video_id,
1010             note=note or 'Downloading m3u8 information',
1011             errnote=errnote or 'Failed to download m3u8 information',
1012             fatal=fatal)
1013         if res is False:
1014             return []
1015         m3u8_doc, urlh = res
1016         m3u8_url = urlh.geturl()
1017         last_info = None
1018         last_media = None
1019         kv_rex = re.compile(
1020             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1021         for line in m3u8_doc.splitlines():
1022             if line.startswith('#EXT-X-STREAM-INF:'):
1023                 last_info = {}
1024                 for m in kv_rex.finditer(line):
1025                     v = m.group('val')
1026                     if v.startswith('"'):
1027                         v = v[1:-1]
1028                     last_info[m.group('key')] = v
1029             elif line.startswith('#EXT-X-MEDIA:'):
1030                 last_media = {}
1031                 for m in kv_rex.finditer(line):
1032                     v = m.group('val')
1033                     if v.startswith('"'):
1034                         v = v[1:-1]
1035                     last_media[m.group('key')] = v
1036             elif line.startswith('#') or not line.strip():
1037                 continue
1038             else:
1039                 if last_info is None:
1040                     formats.append({'url': format_url(line)})
1041                     continue
1042                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
1043                 format_id = []
1044                 if m3u8_id:
1045                     format_id.append(m3u8_id)
1046                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
1047                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
1048                 f = {
1049                     'format_id': '-'.join(format_id),
1050                     'url': format_url(line.strip()),
1051                     'tbr': tbr,
1052                     'ext': ext,
1053                     'protocol': entry_protocol,
1054                     'preference': preference,
1055                 }
1056                 codecs = last_info.get('CODECS')
1057                 if codecs:
1058                     # TODO: looks like video codec is not always necessarily goes first
1059                     va_codecs = codecs.split(',')
1060                     if va_codecs[0]:
1061                         f['vcodec'] = va_codecs[0].partition('.')[0]
1062                     if len(va_codecs) > 1 and va_codecs[1]:
1063                         f['acodec'] = va_codecs[1].partition('.')[0]
1064                 resolution = last_info.get('RESOLUTION')
1065                 if resolution:
1066                     width_str, height_str = resolution.split('x')
1067                     f['width'] = int(width_str)
1068                     f['height'] = int(height_str)
1069                 if last_media is not None:
1070                     f['m3u8_media'] = last_media
1071                     last_media = None
1072                 formats.append(f)
1073                 last_info = {}
1074         self._sort_formats(formats)
1075         return formats
1076
1077     @staticmethod
1078     def _xpath_ns(path, namespace=None):
1079         if not namespace:
1080             return path
1081         out = []
1082         for c in path.split('/'):
1083             if not c or c == '.':
1084                 out.append(c)
1085             else:
1086                 out.append('{%s}%s' % (namespace, c))
1087         return '/'.join(out)
1088
1089     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1090         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1091
1092         if smil is False:
1093             assert not fatal
1094             return []
1095
1096         namespace = self._parse_smil_namespace(smil)
1097
1098         return self._parse_smil_formats(
1099             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1100
1101     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1102         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1103         if smil is False:
1104             return {}
1105         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1106
1107     def _download_smil(self, smil_url, video_id, fatal=True):
1108         return self._download_xml(
1109             smil_url, video_id, 'Downloading SMIL file',
1110             'Unable to download SMIL file', fatal=fatal)
1111
1112     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1113         namespace = self._parse_smil_namespace(smil)
1114
1115         formats = self._parse_smil_formats(
1116             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1117         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1118
1119         video_id = os.path.splitext(url_basename(smil_url))[0]
1120         title = None
1121         description = None
1122         upload_date = None
1123         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1124             name = meta.attrib.get('name')
1125             content = meta.attrib.get('content')
1126             if not name or not content:
1127                 continue
1128             if not title and name == 'title':
1129                 title = content
1130             elif not description and name in ('description', 'abstract'):
1131                 description = content
1132             elif not upload_date and name == 'date':
1133                 upload_date = unified_strdate(content)
1134
1135         thumbnails = [{
1136             'id': image.get('type'),
1137             'url': image.get('src'),
1138             'width': int_or_none(image.get('width')),
1139             'height': int_or_none(image.get('height')),
1140         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1141
1142         return {
1143             'id': video_id,
1144             'title': title or video_id,
1145             'description': description,
1146             'upload_date': upload_date,
1147             'thumbnails': thumbnails,
1148             'formats': formats,
1149             'subtitles': subtitles,
1150         }
1151
1152     def _parse_smil_namespace(self, smil):
1153         return self._search_regex(
1154             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1155
1156     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1157         base = smil_url
1158         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1159             b = meta.get('base') or meta.get('httpBase')
1160             if b:
1161                 base = b
1162                 break
1163
1164         formats = []
1165         rtmp_count = 0
1166         http_count = 0
1167
1168         videos = smil.findall(self._xpath_ns('.//video', namespace))
1169         for video in videos:
1170             src = video.get('src')
1171             if not src:
1172                 continue
1173
1174             bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1175             filesize = int_or_none(video.get('size') or video.get('fileSize'))
1176             width = int_or_none(video.get('width'))
1177             height = int_or_none(video.get('height'))
1178             proto = video.get('proto')
1179             ext = video.get('ext')
1180             src_ext = determine_ext(src)
1181             streamer = video.get('streamer') or base
1182
1183             if proto == 'rtmp' or streamer.startswith('rtmp'):
1184                 rtmp_count += 1
1185                 formats.append({
1186                     'url': streamer,
1187                     'play_path': src,
1188                     'ext': 'flv',
1189                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1190                     'tbr': bitrate,
1191                     'filesize': filesize,
1192                     'width': width,
1193                     'height': height,
1194                 })
1195                 if transform_rtmp_url:
1196                     streamer, src = transform_rtmp_url(streamer, src)
1197                     formats[-1].update({
1198                         'url': streamer,
1199                         'play_path': src,
1200                     })
1201                 continue
1202
1203             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1204
1205             if proto == 'm3u8' or src_ext == 'm3u8':
1206                 formats.extend(self._extract_m3u8_formats(
1207                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False))
1208                 continue
1209
1210             if src_ext == 'f4m':
1211                 f4m_url = src_url
1212                 if not f4m_params:
1213                     f4m_params = {
1214                         'hdcore': '3.2.0',
1215                         'plugin': 'flowplayer-3.2.0.1',
1216                     }
1217                 f4m_url += '&' if '?' in f4m_url else '?'
1218                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
1219                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1220                 continue
1221
1222             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1223                 http_count += 1
1224                 formats.append({
1225                     'url': src_url,
1226                     'ext': ext or src_ext or 'flv',
1227                     'format_id': 'http-%d' % (bitrate or http_count),
1228                     'tbr': bitrate,
1229                     'filesize': filesize,
1230                     'width': width,
1231                     'height': height,
1232                 })
1233                 continue
1234
1235         self._sort_formats(formats)
1236
1237         return formats
1238
1239     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1240         subtitles = {}
1241         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1242             src = textstream.get('src')
1243             if not src:
1244                 continue
1245             ext = textstream.get('ext') or determine_ext(src)
1246             if not ext:
1247                 type_ = textstream.get('type')
1248                 SUBTITLES_TYPES = {
1249                     'text/vtt': 'vtt',
1250                     'text/srt': 'srt',
1251                     'application/smptett+xml': 'tt',
1252                 }
1253                 if type_ in SUBTITLES_TYPES:
1254                     ext = SUBTITLES_TYPES[type_]
1255             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1256             subtitles.setdefault(lang, []).append({
1257                 'url': src,
1258                 'ext': ext,
1259             })
1260         return subtitles
1261
1262     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1263         xspf = self._download_xml(
1264             playlist_url, playlist_id, 'Downloading xpsf playlist',
1265             'Unable to download xspf manifest', fatal=fatal)
1266         if xspf is False:
1267             return []
1268         return self._parse_xspf(xspf, playlist_id)
1269
1270     def _parse_xspf(self, playlist, playlist_id):
1271         NS_MAP = {
1272             'xspf': 'http://xspf.org/ns/0/',
1273             's1': 'http://static.streamone.nl/player/ns/0',
1274         }
1275
1276         entries = []
1277         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1278             title = xpath_text(
1279                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1280             description = xpath_text(
1281                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1282             thumbnail = xpath_text(
1283                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1284             duration = float_or_none(
1285                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1286
1287             formats = [{
1288                 'url': location.text,
1289                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1290                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1291                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1292             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1293             self._sort_formats(formats)
1294
1295             entries.append({
1296                 'id': playlist_id,
1297                 'title': title,
1298                 'description': description,
1299                 'thumbnail': thumbnail,
1300                 'duration': duration,
1301                 'formats': formats,
1302             })
1303         return entries
1304
1305     def _live_title(self, name):
1306         """ Generate the title for a live video """
1307         now = datetime.datetime.now()
1308         now_str = now.strftime("%Y-%m-%d %H:%M")
1309         return name + ' ' + now_str
1310
1311     def _int(self, v, name, fatal=False, **kwargs):
1312         res = int_or_none(v, **kwargs)
1313         if 'get_attr' in kwargs:
1314             print(getattr(v, kwargs['get_attr']))
1315         if res is None:
1316             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1317             if fatal:
1318                 raise ExtractorError(msg)
1319             else:
1320                 self._downloader.report_warning(msg)
1321         return res
1322
1323     def _float(self, v, name, fatal=False, **kwargs):
1324         res = float_or_none(v, **kwargs)
1325         if res is None:
1326             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1327             if fatal:
1328                 raise ExtractorError(msg)
1329             else:
1330                 self._downloader.report_warning(msg)
1331         return res
1332
1333     def _set_cookie(self, domain, name, value, expire_time=None):
1334         cookie = compat_cookiejar.Cookie(
1335             0, name, value, None, None, domain, None,
1336             None, '/', True, False, expire_time, '', None, None, None)
1337         self._downloader.cookiejar.set_cookie(cookie)
1338
1339     def _get_cookies(self, url):
1340         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
1341         req = sanitized_Request(url)
1342         self._downloader.cookiejar.add_cookie_header(req)
1343         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1344
1345     def get_testcases(self, include_onlymatching=False):
1346         t = getattr(self, '_TEST', None)
1347         if t:
1348             assert not hasattr(self, '_TESTS'), \
1349                 '%s has _TEST and _TESTS' % type(self).__name__
1350             tests = [t]
1351         else:
1352             tests = getattr(self, '_TESTS', [])
1353         for t in tests:
1354             if not include_onlymatching and t.get('only_matching', False):
1355                 continue
1356             t['name'] = type(self).__name__[:-len('IE')]
1357             yield t
1358
1359     def is_suitable(self, age_limit):
1360         """ Test whether the extractor is generally suitable for the given
1361         age limit (i.e. pornographic sites are not, all others usually are) """
1362
1363         any_restricted = False
1364         for tc in self.get_testcases(include_onlymatching=False):
1365             if 'playlist' in tc:
1366                 tc = tc['playlist'][0]
1367             is_restricted = age_restricted(
1368                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1369             if not is_restricted:
1370                 return True
1371             any_restricted = any_restricted or is_restricted
1372         return not any_restricted
1373
1374     def extract_subtitles(self, *args, **kwargs):
1375         if (self._downloader.params.get('writesubtitles', False) or
1376                 self._downloader.params.get('listsubtitles')):
1377             return self._get_subtitles(*args, **kwargs)
1378         return {}
1379
1380     def _get_subtitles(self, *args, **kwargs):
1381         raise NotImplementedError("This method must be implemented by subclasses")
1382
1383     @staticmethod
1384     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1385         """ Merge subtitle items for one language. Items with duplicated URLs
1386         will be dropped. """
1387         list1_urls = set([item['url'] for item in subtitle_list1])
1388         ret = list(subtitle_list1)
1389         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1390         return ret
1391
1392     @classmethod
1393     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
1394         """ Merge two subtitle dictionaries, language by language. """
1395         ret = dict(subtitle_dict1)
1396         for lang in subtitle_dict2:
1397             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
1398         return ret
1399
1400     def extract_automatic_captions(self, *args, **kwargs):
1401         if (self._downloader.params.get('writeautomaticsub', False) or
1402                 self._downloader.params.get('listsubtitles')):
1403             return self._get_automatic_captions(*args, **kwargs)
1404         return {}
1405
1406     def _get_automatic_captions(self, *args, **kwargs):
1407         raise NotImplementedError("This method must be implemented by subclasses")
1408
1409
1410 class SearchInfoExtractor(InfoExtractor):
1411     """
1412     Base class for paged search queries extractors.
1413     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
1414     Instances should define _SEARCH_KEY and _MAX_RESULTS.
1415     """
1416
1417     @classmethod
1418     def _make_valid_url(cls):
1419         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1420
1421     @classmethod
1422     def suitable(cls, url):
1423         return re.match(cls._make_valid_url(), url) is not None
1424
1425     def _real_extract(self, query):
1426         mobj = re.match(self._make_valid_url(), query)
1427         if mobj is None:
1428             raise ExtractorError('Invalid search query "%s"' % query)
1429
1430         prefix = mobj.group('prefix')
1431         query = mobj.group('query')
1432         if prefix == '':
1433             return self._get_n_results(query, 1)
1434         elif prefix == 'all':
1435             return self._get_n_results(query, self._MAX_RESULTS)
1436         else:
1437             n = int(prefix)
1438             if n <= 0:
1439                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
1440             elif n > self._MAX_RESULTS:
1441                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
1442                 n = self._MAX_RESULTS
1443             return self._get_n_results(query, n)
1444
1445     def _get_n_results(self, query, n):
1446         """Get a specified number of results for a query"""
1447         raise NotImplementedError("This method must be implemented by subclasses")
1448
1449     @property
1450     def SEARCH_KEY(self):
1451         return self._SEARCH_KEY