1 from __future__ import unicode_literals
13 import xml.etree.ElementTree
18 compat_urllib_parse_urlparse,
# Sentinel for "no default supplied" in _search_regex()/_html_search_regex(),
# so that an explicit default of None can be distinguished (checked by
# identity: `default is not _NO_DEFAULT`).
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_referer  HTTP Referer header value to set.
                    * http_method   HTTP method to use for the download.
                    * http_headers  A dictionary of additional HTTP headers
                                    to add to the request.
                    * http_post_data  Additional data to send with a POST
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this chunk appears to have been truncated during
    # extraction — decorators (@classmethod/@staticmethod/@property), several
    # if/try/else header lines and return statements are missing, leaving
    # some methods below syntactically incomplete. Comments mark the most
    # conspicuous gaps; confirm against the complete file before editing.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    # Presumably a @classmethod upstream — decorator not visible in this chunk.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    # Presumably a @classmethod upstream — decorator not visible in this chunk.
    def _match_id(cls, url):
        # Same per-class regexp caching as suitable(); whatever is done with
        # the match object `m` (probably extracting an id group) is not
        # visible in this chunk.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)

        # Orphaned fragment: the def line of the method this docstring
        # belongs to (a _WORKING accessor) is missing from this chunk.
        """Getter method for _WORKING."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # Orphaned fragments: the def lines of an ie_key()-style classmethod
        # and an IE_NAME-style property are missing from this chunk; only
        # their bodies remain (note the otherwise-undefined `cls`).
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # Progress reporting: default message when note is None, suppressed
        # when note is False, otherwise "<note>" or "<video_id>: <note>".
        # (Several branch headers are missing from this chunk.)
            self.report_download_webpage(video_id)
        elif note is not False:
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))
        # Delegate the actual request to the downloader's opener.
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            # When fatal, re-raise as ExtractorError keeping traceback and
            # cause; otherwise only warn.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
                self._downloader.report_warning(errmsg)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Encoding detection, in order: 1) charset= in the Content-Type header…
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # …2) a <meta charset=…> tag within the first KiB of the body…
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
            # …3) a UTF-16 little-endian byte-order mark.
            elif webpage_bytes.startswith(b'\xff\xfe'):
        # --dump-intermediate-pages: base64-encode the raw bytes to the screen.
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # --write-pages: save the raw bytes to a sanitized dump file.
        if self._downloader.params.get('write_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            basen = '%s_%s' % (video_id, url)
            # Cap the filename length by replacing the tail with an md5 of
            # the full name (filesystem filename limits).
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # Decode with the detected encoding, falling back to UTF-8; 'replace'
        # avoids raising on undecodable bytes in either path.
            content = webpage_bytes.decode(encoding, 'replace')
            content = webpage_bytes.decode('utf-8', 'replace')

        # Detect Websense block pages and fail with an explanatory message.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)

    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        # A non-fatal failed download yields False instead of a string.
        if xml_string is False:
            # transform_source hook to fix up broken server XML before
            # parsing (its guarding `if` is missing from this chunk).
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if (not fatal) and json_string is False:
            # Optional hook to fix up broken server JSON before parsing.
            json_string = transform_source(json_string)
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            # When fatal, parse errors become ExtractorError; otherwise warn.
                raise ExtractorError(errmsg, cause=ve)
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        """Forward a warning to the downloader, prefixed with '[ie_name]' (and id)."""
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    # Methods for following #608

    # Presumably a @staticmethod upstream — decorator not visible in this chunk.
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        # (the remaining dict entries and the return are missing here)
        video_info = {'_type': 'url',
        if video_id is not None:
            video_info['id'] = video_id

    # Presumably a @staticmethod upstream — decorator not visible in this chunk.
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # (the remaining dict entries, guards and the return are missing here)
        video_info = {'_type': 'playlist',
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            # List of patterns: try each in turn (loop header missing here).
                mobj = re.search(p, string, flags)

        # Colorize the field name on capable terminals (ANSI blue).
        if os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name

            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            # An explicitly supplied default wins over raising/warning.
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self._downloader.report_warning('unable to extract %s; '
                'please report this issue on http://yt-dl.org/bug' % _name)

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
            return clean_html(res).strip()

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and fall through.
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)

    def _get_tfa_info(self):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']

    # Helper functions for extracting OpenGraph info

    # Presumably a @staticmethod upstream — decorator not visible in this chunk.
    def _og_regexes(prop):
        # OpenGraph meta tags may have property/content in either order,
        # hence the two templates (the surrounding `return [...]` is missing).
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),

    def _og_search_property(self, prop, html, name=None, **kargs):
        # Default human-readable name, e.g. "OpenGraph title".
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
            # Prefer og:video:secure_url when secure is requested.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if display_name is None:
        # Matches <meta> tags keyed by itemprop/name/property (the raw-string
        # pattern's opening line is missing from this chunk).
        return self._html_search_regex(
                (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal, **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # (the closing of this call and the returns are missing here)
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # RATING_TABLE itself is defined in lines missing from this chunk.
        return RATING_TABLE.get(rating.lower(), None)

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
            'twitter card player')

    def _sort_formats(self, formats):
            raise ExtractorError('No video formats found')

        # Key function for the final sort; its `def _formats_key(f):` header
        # and parts of its body are missing from this chunk.
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                # Plain http(s) downloads are slightly preferred over other
                # protocols.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    audio_ext_preference = -1
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                    ORDER = ['webm', 'flv', 'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0

            # Sort-key tuple entries, worst-to-best significance; -1 stands
            # for "unknown" so known values always rank above missing ones.
            # (The `return (` header and some entries are missing here.)
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
        formats.sort(key=_formats_key)

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        # Part of a conditional expression; surrounding lines missing here.
            if self._downloader.params.get('prefer_insecure', False)

    def _proto_relative_url(self, url, scheme=None):
        # Resolve protocol-relative ("//host/...") URLs against the preferred
        # scheme (several guard/return lines are missing from this chunk).
        if url.startswith('//'):
                scheme = self.http_scheme()

    def _sleep(self, timeout, video_id, msg_template=None):
        # Announce and perform a rate-limiting pause (the to_screen/sleep
        # calls are missing from this chunk).
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}

    def _extract_f4m_formats(self, manifest_url, video_id):
        # Parse an Adobe HDS (f4m) manifest into youtube-dl format dicts.
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest')

        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            # Use the bitrate in the id when available, else the index.
            format_id = 'f4m-%d' % (i if tbr is None else tbr)
                'format_id': format_id,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
        self._sort_formats(formats)

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        # Seed format: the master playlist itself as a meta entry
        # (surrounding list/dict lines are missing from this chunk).
            'format_id': 'm3u8-meta',
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',

        # Resolve relative segment/variant URLs against the playlist URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(m3u8_url, video_id)
        # KEY=VALUE attribute pairs on #EXT-X-STREAM-INF lines; values may be
        # quoted (the `re.compile(` opening line is missing here).
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Remember this variant's attributes for the URL line below.
                for m in kv_rex.finditer(line):
                    if v.startswith('"'):
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # A variant URL line.
                if last_info is None:
                    formats.append({'url': format_url(line)})
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'protocol': entry_protocol,
                    'preference': preference,
                codecs = last_info.get('CODECS')
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
        self._sort_formats(formats)

    def _live_title(self, name):
        """ Generate the title for a live video """
        now = datetime.datetime.now()
        now_str = now.strftime("%Y-%m-%d %H:%M")
        return name + ' ' + now_str

    def _int(self, v, name, fatal=False, **kwargs):
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): bare print() here looks like leftover debug
            # output — it should not ship; flag for removal.
            print(getattr(v, kwargs['get_attr']))
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            # fatal decides between raising and warning on parse failure.
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)

    def _float(self, v, name, fatal=False, **kwargs):
        res = float_or_none(v, **kwargs)
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            # fatal decides between raising and warning on parse failure.
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    # NOTE(review): decorators (@classmethod/@property) and several branch
    # headers appear to be missing from this chunk; confirm against the
    # complete file.

    # Presumably a @classmethod upstream — decorator not visible here.
    def _make_valid_url(cls):
        # "<key>:<query>", "<key><N>:<query>" or "<key>all:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    # Presumably a @classmethod upstream — decorator not visible here.
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        # Dispatch on the numeric/"all" prefix to decide how many results to
        # fetch (several branch headers are missing from this chunk).
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            # Empty prefix: a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            # Numeric prefix: validate and clamp to the extractor's maximum.
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    # Presumably a @property upstream — decorator not visible here.
    def SEARCH_KEY(self):
        return self._SEARCH_KEY