youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..utils import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21
  22     clean_html,
  23     compiled_regex_type,
  24     ExtractorError,
  25     float_or_none,
  26     int_or_none,
  27     RegexNotFoundError,
  28     sanitize_filename,
  29     unescapeHTML,
  30 )
# Sentinel meaning "no default was supplied". It is compared by identity in
# _search_regex, which lets callers pass None as a real default value.
_NO_DEFAULT = object()
  32
  33
  34 class InfoExtractor(object):
  35     """Information Extractor class.
  36
  37     Information extractors are the classes that, given a URL, extract
  38     information about the video (or videos) the URL refers to. This
  39     information includes the real video URL, the video title, author and
  40     others. The information is stored in a dictionary which is then
  41     passed to the FileDownloader. The FileDownloader processes this
  42     information possibly downloading the video to the file system, among
  43     other possible outcomes.
  44
  45     The dictionaries must include the following fields:
  46
  47     id:             Video identifier.
  48     title:          Video title, unescaped.
  49
  50     Additionally, it must contain either a formats entry or a url one:
  51
  52     formats:        A list of dictionaries for each format available, ordered
  53                     from worst to best quality.
  54
  55                     Potential fields:
  56                     * url        Mandatory. The URL of the video file
  57                     * ext        Will be calculated from url if missing
  58                     * format     A human-readable description of the format
  59                                  ("mp4 container with h264/opus").
  60                                  Calculated from the format_id, width, height,
  61                                  and format_note fields if missing.
  62                     * format_id  A short description of the format
  63                                  ("mp4_h264_opus" or "19").
  64                                 Technically optional, but strongly recommended.
  65                     * format_note Additional info about the format
  66                                  ("3D" or "DASH video")
  67                     * width      Width of the video, if known
  68                     * height     Height of the video, if known
  69                     * resolution Textual description of width and height
  70                     * tbr        Average bitrate of audio and video in KBit/s
  71                     * abr        Average audio bitrate in KBit/s
  72                     * acodec     Name of the audio codec in use
  73                     * asr        Audio sampling rate in Hertz
  74                     * vbr        Average video bitrate in KBit/s
  75                     * vcodec     Name of the video codec in use
  76                     * container  Name of the container format
  77                     * filesize   The number of bytes, if known in advance
  78                     * filesize_approx  An estimate for the number of bytes
  79                     * player_url SWF Player URL (used for rtmpdump).
  80                     * protocol   The protocol that will be used for the actual
  81                                  download, lower-case.
  82                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  83                     * preference Order number of this format. If this field is
  84                                  present and not None, the formats get sorted
  85                                  by this field, regardless of all other values.
  86                                  -1 for default (order by other properties),
  87                                  -2 or smaller for less than default.
  88                     * quality    Order number of the video quality of this
  89                                  format, irrespective of the file format.
  90                                  -1 for default (order by other properties),
  91                                  -2 or smaller for less than default.
  92                     * source_preference  Order number for this video source
  93                                   (quality takes higher priority)
  94                                  -1 for default (order by other properties),
  95                                  -2 or smaller for less than default.
  96                     * http_referer  HTTP Referer header value to set.
  97                     * http_method  HTTP method to use for the download.
  98                     * http_headers  A dictionary of additional HTTP headers
  99                                  to add to the request.
 100                     * http_post_data  Additional data to send with a POST
 101                                  request.
 102     url:            Final video URL.
 103     ext:            Video filename extension.
 104     format:         The video format, defaults to ext (used for --get-format)
 105     player_url:     SWF Player URL (used for rtmpdump).
 106
 107     The following fields are optional:
 108
 109     display_id      An alternative identifier for the video, not necessarily
 110                     unique, but available before title. Typically, id is
 111                     something like "4234987", title "Dancing naked mole rats",
 112                     and display_id "dancing-naked-mole-rats"
 113     thumbnails:     A list of dictionaries, with the following entries:
 114                         * "url"
 115                         * "width" (optional, int)
 116                         * "height" (optional, int)
  117                         * "resolution" (optional, string "{width}x{height}",
 118                                         deprecated)
 119     thumbnail:      Full URL to a video thumbnail image.
 120     description:    One-line video description.
 121     uploader:       Full name of the video uploader.
 122     timestamp:      UNIX timestamp of the moment the video became available.
 123     upload_date:    Video upload date (YYYYMMDD).
 124                     If not explicitly set, calculated from timestamp.
 125     uploader_id:    Nickname or id of the video uploader.
 126     location:       Physical location where the video was filmed.
 127     subtitles:      The subtitle file contents as a dictionary in the format
 128                     {language: subtitles}.
 129     duration:       Length of the video in seconds, as an integer.
 130     view_count:     How many users have watched the video on the platform.
 131     like_count:     Number of positive ratings of the video
 132     dislike_count:  Number of negative ratings of the video
 133     comment_count:  Number of comments on the video
 134     age_limit:      Age restriction for the video, as an integer (years)
 135     webpage_url:    The url to the video webpage, if given to youtube-dl it
 136                     should allow to get the same result again. (It will be set
 137                     by YoutubeDL if it's missing)
 138     categories:     A list of categories that the video falls in, for example
 139                     ["Sports", "Berlin"]
 140     is_live:        True, False, or None (=unknown). Whether this video is a
 141                     live stream that goes on instead of a fixed-length video.
 142
 143     Unless mentioned otherwise, the fields should be Unicode strings.
 144
 145     Unless mentioned otherwise, None is equivalent to absence of information.
 146
 147     Subclasses of this one should re-define the _real_initialize() and
 148     _real_extract() methods and define a _VALID_URL regexp.
 149     Probably, they should also be added to the list of extractors.
 150
 151     Finally, the _WORKING attribute should be set to False for broken IEs
 152     in order to warn the users and skip the tests.
 153     """
 154
 155     _ready = False
 156     _downloader = None
 157     _WORKING = True
 158
 159     def __init__(self, downloader=None):
 160         """Constructor. Receives an optional downloader."""
 161         self._ready = False
 162         self.set_downloader(downloader)
 163
 164     @classmethod
 165     def suitable(cls, url):
 166         """Receives a URL and returns True if suitable for this IE."""
 167
 168         # This does not use has/getattr intentionally - we want to know whether
 169         # we have cached the regexp for *this* class, whereas getattr would also
 170         # match the superclass
 171         if '_VALID_URL_RE' not in cls.__dict__:
 172             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 173         return cls._VALID_URL_RE.match(url) is not None
 174
 175     @classmethod
 176     def _match_id(cls, url):
 177         if '_VALID_URL_RE' not in cls.__dict__:
 178             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 179         m = cls._VALID_URL_RE.match(url)
 180         assert m
 181         return m.group('id')
 182
 183     @classmethod
 184     def working(cls):
 185         """Getter method for _WORKING."""
 186         return cls._WORKING
 187
 188     def initialize(self):
 189         """Initializes an instance (authentication, etc)."""
 190         if not self._ready:
 191             self._real_initialize()
 192             self._ready = True
 193
 194     def extract(self, url):
 195         """Extracts URL information and returns it in list of dicts."""
 196         self.initialize()
 197         return self._real_extract(url)
 198
 199     def set_downloader(self, downloader):
 200         """Sets the downloader for this IE."""
 201         self._downloader = downloader
 202
 203     def _real_initialize(self):
 204         """Real initialization process. Redefine in subclasses."""
 205         pass
 206
 207     def _real_extract(self, url):
 208         """Real extraction process. Redefine in subclasses."""
 209         pass
 210
 211     @classmethod
 212     def ie_key(cls):
 213         """A string for getting the InfoExtractor with get_info_extractor"""
 214         return cls.__name__[:-2]
 215
 216     @property
 217     def IE_NAME(self):
 218         return type(self).__name__[:-2]
 219
 220     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 221         """ Returns the response handle """
 222         if note is None:
 223             self.report_download_webpage(video_id)
 224         elif note is not False:
 225             if video_id is None:
 226                 self.to_screen('%s' % (note,))
 227             else:
 228                 self.to_screen('%s: %s' % (video_id, note))
 229         try:
 230             return self._downloader.urlopen(url_or_request)
 231         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 232             if errnote is False:
 233                 return False
 234             if errnote is None:
 235                 errnote = 'Unable to download webpage'
 236             errmsg = '%s: %s' % (errnote, compat_str(err))
 237             if fatal:
 238                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 239             else:
 240                 self._downloader.report_warning(errmsg)
 241                 return False
 242
 243     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 244         """ Returns a tuple (page content as string, URL handle) """
 245
 246         # Strip hashes from the URL (#1038)
 247         if isinstance(url_or_request, (compat_str, str)):
 248             url_or_request = url_or_request.partition('#')[0]
 249
 250         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 251         if urlh is False:
 252             assert not fatal
 253             return False
 254         content_type = urlh.headers.get('Content-Type', '')
 255         webpage_bytes = urlh.read()
 256         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 257         if m:
 258             encoding = m.group(1)
 259         else:
 260             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 261                           webpage_bytes[:1024])
 262             if m:
 263                 encoding = m.group(1).decode('ascii')
 264             elif webpage_bytes.startswith(b'\xff\xfe'):
 265                 encoding = 'utf-16'
 266             else:
 267                 encoding = 'utf-8'
 268         if self._downloader.params.get('dump_intermediate_pages', False):
 269             try:
 270                 url = url_or_request.get_full_url()
 271             except AttributeError:
 272                 url = url_or_request
 273             self.to_screen('Dumping request to ' + url)
 274             dump = base64.b64encode(webpage_bytes).decode('ascii')
 275             self._downloader.to_screen(dump)
 276         if self._downloader.params.get('write_pages', False):
 277             try:
 278                 url = url_or_request.get_full_url()
 279             except AttributeError:
 280                 url = url_or_request
 281             basen = '%s_%s' % (video_id, url)
 282             if len(basen) > 240:
 283                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 284                 basen = basen[:240 - len(h)] + h
 285             raw_filename = basen + '.dump'
 286             filename = sanitize_filename(raw_filename, restricted=True)
 287             self.to_screen('Saving request to ' + filename)
 288             # Working around MAX_PATH limitation on Windows (see
 289             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 290             if os.name == 'nt':
 291                 absfilepath = os.path.abspath(filename)
 292                 if len(absfilepath) > 259:
 293                     filename = '\\\\?\\' + absfilepath
 294             with open(filename, 'wb') as outf:
 295                 outf.write(webpage_bytes)
 296
 297         try:
 298             content = webpage_bytes.decode(encoding, 'replace')
 299         except LookupError:
 300             content = webpage_bytes.decode('utf-8', 'replace')
 301
 302         if ('<title>Access to this site is blocked</title>' in content and
 303                 'Websense' in content[:512]):
 304             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 305             blocked_iframe = self._html_search_regex(
 306                 r'<iframe src="([^"]+)"', content,
 307                 'Websense information URL', default=None)
 308             if blocked_iframe:
 309                 msg += ' Visit %s for more details' % blocked_iframe
 310             raise ExtractorError(msg, expected=True)
 311
 312         return (content, urlh)
 313
 314     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 315         """ Returns the data of the page as a string """
 316         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 317         if res is False:
 318             return res
 319         else:
 320             content, _ = res
 321             return content
 322
 323     def _download_xml(self, url_or_request, video_id,
 324                       note='Downloading XML', errnote='Unable to download XML',
 325                       transform_source=None, fatal=True):
 326         """Return the xml as an xml.etree.ElementTree.Element"""
 327         xml_string = self._download_webpage(
 328             url_or_request, video_id, note, errnote, fatal=fatal)
 329         if xml_string is False:
 330             return xml_string
 331         if transform_source:
 332             xml_string = transform_source(xml_string)
 333         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 334
 335     def _download_json(self, url_or_request, video_id,
 336                        note='Downloading JSON metadata',
 337                        errnote='Unable to download JSON metadata',
 338                        transform_source=None,
 339                        fatal=True):
 340         json_string = self._download_webpage(
 341             url_or_request, video_id, note, errnote, fatal=fatal)
 342         if (not fatal) and json_string is False:
 343             return None
 344         if transform_source:
 345             json_string = transform_source(json_string)
 346         try:
 347             return json.loads(json_string)
 348         except ValueError as ve:
 349             errmsg = '%s: Failed to parse JSON ' % video_id
 350             if fatal:
 351                 raise ExtractorError(errmsg, cause=ve)
 352             else:
 353                 self.report_warning(errmsg + str(ve))
 354
 355     def report_warning(self, msg, video_id=None):
 356         idstr = '' if video_id is None else '%s: ' % video_id
 357         self._downloader.report_warning(
 358             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 359
 360     def to_screen(self, msg):
 361         """Print msg to screen, prefixing it with '[ie_name]'"""
 362         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 363
 364     def report_extraction(self, id_or_name):
 365         """Report information extraction."""
 366         self.to_screen('%s: Extracting information' % id_or_name)
 367
 368     def report_download_webpage(self, video_id):
 369         """Report webpage download."""
 370         self.to_screen('%s: Downloading webpage' % video_id)
 371
 372     def report_age_confirmation(self):
 373         """Report attempt to confirm age."""
 374         self.to_screen('Confirming age')
 375
 376     def report_login(self):
 377         """Report attempt to log in."""
 378         self.to_screen('Logging in')
 379
 380     #Methods for following #608
 381     @staticmethod
 382     def url_result(url, ie=None, video_id=None):
 383         """Returns a url that points to a page that should be processed"""
 384         #TODO: ie should be the class used for getting the info
 385         video_info = {'_type': 'url',
 386                       'url': url,
 387                       'ie_key': ie}
 388         if video_id is not None:
 389             video_info['id'] = video_id
 390         return video_info
 391     @staticmethod
 392     def playlist_result(entries, playlist_id=None, playlist_title=None):
 393         """Returns a playlist"""
 394         video_info = {'_type': 'playlist',
 395                       'entries': entries}
 396         if playlist_id:
 397             video_info['id'] = playlist_id
 398         if playlist_title:
 399             video_info['title'] = playlist_title
 400         return video_info
 401
 402     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 403         """
 404         Perform a regex search on the given string, using a single or a list of
 405         patterns returning the first matching group.
 406         In case of failure return a default value or raise a WARNING or a
 407         RegexNotFoundError, depending on fatal, specifying the field name.
 408         """
 409         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 410             mobj = re.search(pattern, string, flags)
 411         else:
 412             for p in pattern:
 413                 mobj = re.search(p, string, flags)
 414                 if mobj:
 415                     break
 416
 417         if os.name != 'nt' and sys.stderr.isatty():
 418             _name = '\033[0;34m%s\033[0m' % name
 419         else:
 420             _name = name
 421
 422         if mobj:
 423             # return the first matching group
 424             return next(g for g in mobj.groups() if g is not None)
 425         elif default is not _NO_DEFAULT:
 426             return default
 427         elif fatal:
 428             raise RegexNotFoundError('Unable to extract %s' % _name)
 429         else:
 430             self._downloader.report_warning('unable to extract %s; '
 431                 'please report this issue on http://yt-dl.org/bug' % _name)
 432             return None
 433
 434     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 435         """
 436         Like _search_regex, but strips HTML tags and unescapes entities.
 437         """
 438         res = self._search_regex(pattern, string, name, default, fatal, flags)
 439         if res:
 440             return clean_html(res).strip()
 441         else:
 442             return res
 443
 444     def _get_login_info(self):
 445         """
 446         Get the the login info as (username, password)
 447         It will look in the netrc file using the _NETRC_MACHINE value
 448         If there's no info available, return (None, None)
 449         """
 450         if self._downloader is None:
 451             return (None, None)
 452
 453         username = None
 454         password = None
 455         downloader_params = self._downloader.params
 456
 457         # Attempt to use provided username and password or .netrc data
 458         if downloader_params.get('username', None) is not None:
 459             username = downloader_params['username']
 460             password = downloader_params['password']
 461         elif downloader_params.get('usenetrc', False):
 462             try:
 463                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 464                 if info is not None:
 465                     username = info[0]
 466                     password = info[2]
 467                 else:
 468                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 469             except (IOError, netrc.NetrcParseError) as err:
 470                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 471
 472         return (username, password)
 473
 474     def _get_tfa_info(self):
 475         """
 476         Get the two-factor authentication info
 477         TODO - asking the user will be required for sms/phone verify
 478         currently just uses the command line option
 479         If there's no info available, return None
 480         """
 481         if self._downloader is None:
 482             return None
 483         downloader_params = self._downloader.params
 484
 485         if downloader_params.get('twofactor', None) is not None:
 486             return downloader_params['twofactor']
 487
 488         return None
 489
 490     # Helper functions for extracting OpenGraph info
 491     @staticmethod
 492     def _og_regexes(prop):
 493         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 494         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 495         template = r'<meta[^>]+?%s[^>]+?%s'
 496         return [
 497             template % (property_re, content_re),
 498             template % (content_re, property_re),
 499         ]
 500
 501     def _og_search_property(self, prop, html, name=None, **kargs):
 502         if name is None:
 503             name = 'OpenGraph %s' % prop
 504         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 505         if escaped is None:
 506             return None
 507         return unescapeHTML(escaped)
 508
 509     def _og_search_thumbnail(self, html, **kargs):
 510         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 511
 512     def _og_search_description(self, html, **kargs):
 513         return self._og_search_property('description', html, fatal=False, **kargs)
 514
 515     def _og_search_title(self, html, **kargs):
 516         return self._og_search_property('title', html, **kargs)
 517
 518     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 519         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 520         if secure:
 521             regexes = self._og_regexes('video:secure_url') + regexes
 522         return self._html_search_regex(regexes, html, name, **kargs)
 523
 524     def _og_search_url(self, html, **kargs):
 525         return self._og_search_property('url', html, **kargs)
 526
 527     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 528         if display_name is None:
 529             display_name = name
 530         return self._html_search_regex(
 531             r'''(?ix)<meta
 532                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 533                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 534             html, display_name, fatal=fatal, **kwargs)
 535
 536     def _dc_search_uploader(self, html):
 537         return self._html_search_meta('dc.creator', html, 'uploader')
 538
 539     def _rta_search(self, html):
 540         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 541         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 542                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 543                      html):
 544             return 18
 545         return 0
 546
 547     def _media_rating_search(self, html):
 548         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 549         rating = self._html_search_meta('rating', html)
 550
 551         if not rating:
 552             return None
 553
 554         RATING_TABLE = {
 555             'safe for kids': 0,
 556             'general': 8,
 557             '14 years': 14,
 558             'mature': 17,
 559             'restricted': 19,
 560         }
 561         return RATING_TABLE.get(rating.lower(), None)
 562
 563     def _twitter_search_player(self, html):
 564         return self._html_search_meta('twitter:player', html,
 565             'twitter card player')
 566
 567     def _sort_formats(self, formats):
 568         if not formats:
 569             raise ExtractorError('No video formats found')
 570
 571         def _formats_key(f):
 572             # TODO remove the following workaround
 573             from ..utils import determine_ext
 574             if not f.get('ext') and 'url' in f:
 575                 f['ext'] = determine_ext(f['url'])
 576
 577             preference = f.get('preference')
 578             if preference is None:
 579                 proto = f.get('protocol')
 580                 if proto is None:
 581                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 582
 583                 preference = 0 if proto in ['http', 'https'] else -0.1
 584                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 585                     preference -= 0.5
 586
 587             if f.get('vcodec') == 'none':  # audio only
 588                 if self._downloader.params.get('prefer_free_formats'):
 589                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 590                 else:
 591                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 592                 ext_preference = 0
 593                 try:
 594                     audio_ext_preference = ORDER.index(f['ext'])
 595                 except ValueError:
 596                     audio_ext_preference = -1
 597             else:
 598                 if self._downloader.params.get('prefer_free_formats'):
 599                     ORDER = ['flv', 'mp4', 'webm']
 600                 else:
 601                     ORDER = ['webm', 'flv', 'mp4']
 602                 try:
 603                     ext_preference = ORDER.index(f['ext'])
 604                 except ValueError:
 605                     ext_preference = -1
 606                 audio_ext_preference = 0
 607
 608             return (
 609                 preference,
 610                 f.get('quality') if f.get('quality') is not None else -1,
 611                 f.get('height') if f.get('height') is not None else -1,
 612                 f.get('width') if f.get('width') is not None else -1,
 613                 ext_preference,
 614                 f.get('tbr') if f.get('tbr') is not None else -1,
 615                 f.get('vbr') if f.get('vbr') is not None else -1,
 616                 f.get('abr') if f.get('abr') is not None else -1,
 617                 audio_ext_preference,
 618                 f.get('filesize') if f.get('filesize') is not None else -1,
 619                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 620                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 621                 f.get('format_id'),
 622             )
 623         formats.sort(key=_formats_key)
 624
 625     def http_scheme(self):
 626         """ Either "http:" or "https:", depending on the user's preferences """
 627         return (
 628             'http:'
 629             if self._downloader.params.get('prefer_insecure', False)
 630             else 'https:')
 631
 632     def _proto_relative_url(self, url, scheme=None):
 633         if url is None:
 634             return url
 635         if url.startswith('//'):
 636             if scheme is None:
 637                 scheme = self.http_scheme()
 638             return scheme + url
 639         else:
 640             return url
 641
 642     def _sleep(self, timeout, video_id, msg_template=None):
 643         if msg_template is None:
 644             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 645         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 646         self.to_screen(msg)
 647         time.sleep(timeout)
 648
 649     def _extract_f4m_formats(self, manifest_url, video_id):
 650         manifest = self._download_xml(
 651             manifest_url, video_id, 'Downloading f4m manifest',
 652             'Unable to download f4m manifest')
 653
 654         formats = []
 655         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 656         for i, media_el in enumerate(media_nodes):
 657             tbr = int_or_none(media_el.attrib.get('bitrate'))
 658             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 659             formats.append({
 660                 'format_id': format_id,
 661                 'url': manifest_url,
 662                 'ext': 'flv',
 663                 'tbr': tbr,
 664                 'width': int_or_none(media_el.attrib.get('width')),
 665                 'height': int_or_none(media_el.attrib.get('height')),
 666             })
 667         self._sort_formats(formats)
 668
 669         return formats
 670
 671     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 672                               entry_protocol='m3u8', preference=None):
 673
 674         formats = [{
 675             'format_id': 'm3u8-meta',
 676             'url': m3u8_url,
 677             'ext': ext,
 678             'protocol': 'm3u8',
 679             'preference': -1,
 680             'resolution': 'multiple',
 681             'format_note': 'Quality selection URL',
 682         }]
 683
 684         format_url = lambda u: (
 685             u
 686             if re.match(r'^https?://', u)
 687             else compat_urlparse.urljoin(m3u8_url, u))
 688
 689         m3u8_doc = self._download_webpage(m3u8_url, video_id)
 690         last_info = None
 691         kv_rex = re.compile(
 692             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 693         for line in m3u8_doc.splitlines():
 694             if line.startswith('#EXT-X-STREAM-INF:'):
 695                 last_info = {}
 696                 for m in kv_rex.finditer(line):
 697                     v = m.group('val')
 698                     if v.startswith('"'):
 699                         v = v[1:-1]
 700                     last_info[m.group('key')] = v
 701             elif line.startswith('#') or not line.strip():
 702                 continue
 703             else:
 704                 if last_info is None:
 705                     formats.append({'url': format_url(line)})
 706                     continue
 707                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 708
 709                 f = {
 710                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 711                     'url': format_url(line.strip()),
 712                     'tbr': tbr,
 713                     'ext': ext,
 714                     'protocol': entry_protocol,
 715                     'preference': preference,
 716                 }
 717                 codecs = last_info.get('CODECS')
 718                 if codecs:
 719                     # TODO: looks like video codec is not always necessarily goes first
 720                     va_codecs = codecs.split(',')
 721                     if va_codecs[0]:
 722                         f['vcodec'] = va_codecs[0].partition('.')[0]
 723                     if len(va_codecs) > 1 and va_codecs[1]:
 724                         f['acodec'] = va_codecs[1].partition('.')[0]
 725                 resolution = last_info.get('RESOLUTION')
 726                 if resolution:
 727                     width_str, height_str = resolution.split('x')
 728                     f['width'] = int(width_str)
 729                     f['height'] = int(height_str)
 730                 formats.append(f)
 731                 last_info = {}
 732         self._sort_formats(formats)
 733         return formats
 734
 735     def _live_title(self, name):
 736         """ Generate the title for a live video """
 737         now = datetime.datetime.now()
 738         now_str = now.strftime("%Y-%m-%d %H:%M")
 739         return name + ' ' + now_str
 740
 741     def _int(self, v, name, fatal=False, **kwargs):
 742         res = int_or_none(v, **kwargs)
 743         if 'get_attr' in kwargs:
 744             print(getattr(v, kwargs['get_attr']))
 745         if res is None:
 746             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 747             if fatal:
 748                 raise ExtractorError(msg)
 749             else:
 750                 self._downloader.report_warning(msg)
 751         return res
 752
 753     def _float(self, v, name, fatal=False, **kwargs):
 754         res = float_or_none(v, **kwargs)
 755         if res is None:
 756             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 757             if fatal:
 758                 raise ExtractorError(msg)
 759             else:
 760                 self._downloader.report_warning(msg)
 761         return res
 762
 763
 764 class SearchInfoExtractor(InfoExtractor):
 765     """
 766     Base class for paged search queries extractors.
 767     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 768     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 769     """
 770
 771     @classmethod
 772     def _make_valid_url(cls):
 773         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 774
 775     @classmethod
 776     def suitable(cls, url):
 777         return re.match(cls._make_valid_url(), url) is not None
 778
 779     def _real_extract(self, query):
 780         mobj = re.match(self._make_valid_url(), query)
 781         if mobj is None:
 782             raise ExtractorError('Invalid search query "%s"' % query)
 783
 784         prefix = mobj.group('prefix')
 785         query = mobj.group('query')
 786         if prefix == '':
 787             return self._get_n_results(query, 1)
 788         elif prefix == 'all':
 789             return self._get_n_results(query, self._MAX_RESULTS)
 790         else:
 791             n = int(prefix)
 792             if n <= 0:
 793                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 794             elif n > self._MAX_RESULTS:
 795                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 796                 n = self._MAX_RESULTS
 797             return self._get_n_results(query, n)
 798
 799     def _get_n_results(self, query, n):
 800         """Get a specified number of results for a query"""
 801         raise NotImplementedError("This method must be implemented by subclasses")
 802
 803     @property
 804     def SEARCH_KEY(self):
 805         return self._SEARCH_KEY