]> gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube_dl/extractor/common.py
f740ddad1176599f56e0204a3bec67c1151183b0
[youtube-dl.git] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import ssl
14 import sys
15 import time
16 import math
17
18 from ..compat import (
19     compat_cookiejar_Cookie,
20     compat_cookies,
21     compat_etree_Element,
22     compat_etree_fromstring,
23     compat_getpass,
24     compat_integer_types,
25     compat_http_client,
26     compat_os_name,
27     compat_str,
28     compat_urllib_error,
29     compat_urllib_parse_unquote,
30     compat_urllib_parse_urlencode,
31     compat_urllib_request,
32     compat_urlparse,
33     compat_xml_parse_error,
34 )
35 from ..downloader.f4m import (
36     get_base_url,
37     remove_encrypted_media,
38 )
39 from ..utils import (
40     NO_DEFAULT,
41     age_restricted,
42     base_url,
43     bug_reports_message,
44     clean_html,
45     compiled_regex_type,
46     determine_ext,
47     determine_protocol,
48     dict_get,
49     error_to_compat_str,
50     ExtractorError,
51     extract_attributes,
52     fix_xml_ampersands,
53     float_or_none,
54     GeoRestrictedError,
55     GeoUtils,
56     int_or_none,
57     js_to_json,
58     JSON_LD_RE,
59     mimetype2ext,
60     orderedSet,
61     parse_bitrate,
62     parse_codecs,
63     parse_duration,
64     parse_iso8601,
65     parse_m3u8_attributes,
66     parse_resolution,
67     RegexNotFoundError,
68     sanitized_Request,
69     sanitize_filename,
70     str_or_none,
71     strip_or_none,
72     unescapeHTML,
73     unified_strdate,
74     unified_timestamp,
75     update_Request,
76     update_url_query,
77     urljoin,
78     url_basename,
79     url_or_none,
80     xpath_element,
81     xpath_text,
82     xpath_with_ns,
83 )
84
85
86 class InfoExtractor(object):
87     """Information Extractor class.
88
89     Information extractors are the classes that, given a URL, extract
90     information about the video (or videos) the URL refers to. This
91     information includes the real video URL, the video title, author and
92     others. The information is stored in a dictionary which is then
93     passed to the YoutubeDL. The YoutubeDL processes this
94     information possibly downloading the video to the file system, among
95     other possible outcomes.
96
97     The type field determines the type of the result.
98     By far the most common value (and the default if _type is missing) is
99     "video", which indicates a single video.
100
101     For a video, the dictionaries must include the following fields:
102
103     id:             Video identifier.
104     title:          Video title, unescaped.
105
106     Additionally, it must contain either a formats entry or a url one:
107
108     formats:        A list of dictionaries for each format available, ordered
109                     from worst to best quality.
110
111                     Potential fields:
112                     * url        The mandatory URL representing the media:
113                                    for plain file media - HTTP URL of this file,
114                                    for RTMP - RTMP URL,
115                                    for HLS - URL of the M3U8 media playlist,
116                                    for HDS - URL of the F4M manifest,
117                                    for DASH
118                                      - HTTP URL to plain file media (in case of
119                                        unfragmented media)
120                                      - URL of the MPD manifest or base URL
121                                        representing the media if MPD manifest
122                                        is parsed from a string (in case of
123                                        fragmented media)
124                                    for MSS - URL of the ISM manifest.
125                     * manifest_url
126                                  The URL of the manifest file in case of
127                                  fragmented media:
128                                    for HLS - URL of the M3U8 master playlist,
129                                    for HDS - URL of the F4M manifest,
130                                    for DASH - URL of the MPD manifest,
131                                    for MSS - URL of the ISM manifest.
132                     * ext        Will be calculated from URL if missing
133                     * format     A human-readable description of the format
134                                  ("mp4 container with h264/opus").
135                                  Calculated from the format_id, width, height,
136                                  and format_note fields if missing.
137                     * format_id  A short description of the format
138                                  ("mp4_h264_opus" or "19").
139                                 Technically optional, but strongly recommended.
140                     * format_note Additional info about the format
141                                  ("3D" or "DASH video")
142                     * width      Width of the video, if known
143                     * height     Height of the video, if known
144                     * resolution Textual description of width and height
145                     * tbr        Average bitrate of audio and video in KBit/s
146                     * abr        Average audio bitrate in KBit/s
147                     * acodec     Name of the audio codec in use
148                     * asr        Audio sampling rate in Hertz
149                     * vbr        Average video bitrate in KBit/s
150                     * fps        Frame rate
151                     * vcodec     Name of the video codec in use
152                     * container  Name of the container format
153                     * filesize   The number of bytes, if known in advance
154                     * filesize_approx  An estimate for the number of bytes
155                     * player_url SWF Player URL (used for rtmpdump).
156                     * protocol   The protocol that will be used for the actual
157                                  download, lower-case.
158                                  "http", "https", "rtsp", "rtmp", "rtmpe",
159                                  "m3u8", "m3u8_native" or "http_dash_segments".
160                     * fragment_base_url
161                                  Base URL for fragments. Each fragment's path
162                                  value (if present) will be relative to
163                                  this URL.
164                     * fragments  A list of fragments of a fragmented media.
165                                  Each fragment entry must contain either an url
166                                  or a path. If an url is present it should be
167                                  considered by a client. Otherwise both path and
168                                  fragment_base_url must be present. Here is
169                                  the list of all potential fields:
170                                  * "url" - fragment's URL
171                                  * "path" - fragment's path relative to
172                                             fragment_base_url
173                                  * "duration" (optional, int or float)
174                                  * "filesize" (optional, int)
175                     * preference Order number of this format. If this field is
176                                  present and not None, the formats get sorted
177                                  by this field, regardless of all other values.
178                                  -1 for default (order by other properties),
179                                  -2 or smaller for less than default.
180                                  < -1000 to hide the format (if there is
181                                     another one which is strictly better)
182                     * language   Language code, e.g. "de" or "en-US".
183                     * language_preference  Is this in the language mentioned in
184                                  the URL?
185                                  10 if it's what the URL is about,
186                                  -1 for default (don't know),
187                                  -10 otherwise, other values reserved for now.
188                     * quality    Order number of the video quality of this
189                                  format, irrespective of the file format.
190                                  -1 for default (order by other properties),
191                                  -2 or smaller for less than default.
192                     * source_preference  Order number for this video source
193                                   (quality takes higher priority)
194                                  -1 for default (order by other properties),
195                                  -2 or smaller for less than default.
196                     * http_headers  A dictionary of additional HTTP headers
197                                  to add to the request.
198                     * stretched_ratio  If given and not 1, indicates that the
199                                  video's pixels are not square.
200                                  width : height ratio as float.
201                     * no_resume  The server does not support resuming the
202                                  (HTTP or RTMP) download. Boolean.
203                     * downloader_options  A dictionary of downloader options as
204                                  described in FileDownloader
205
206     url:            Final video URL.
207     ext:            Video filename extension.
208     format:         The video format, defaults to ext (used for --get-format)
209     player_url:     SWF Player URL (used for rtmpdump).
210
211     The following fields are optional:
212
213     alt_title:      A secondary title of the video.
214     display_id      An alternative identifier for the video, not necessarily
215                     unique, but available before title. Typically, id is
216                     something like "4234987", title "Dancing naked mole rats",
217                     and display_id "dancing-naked-mole-rats"
218     thumbnails:     A list of dictionaries, with the following entries:
219                         * "id" (optional, string) - Thumbnail format ID
220                         * "url"
221                         * "preference" (optional, int) - quality of the image
222                         * "width" (optional, int)
223                         * "height" (optional, int)
224                         * "resolution" (optional, string "{width}x{height}",
225                                         deprecated)
226                         * "filesize" (optional, int)
227     thumbnail:      Full URL to a video thumbnail image.
228     description:    Full video description.
229     uploader:       Full name of the video uploader.
230     license:        License name the video is licensed under.
231     creator:        The creator of the video.
232     release_date:   The date (YYYYMMDD) when the video was released.
233     timestamp:      UNIX timestamp of the moment the video became available.
234     upload_date:    Video upload date (YYYYMMDD).
235                     If not explicitly set, calculated from timestamp.
236     uploader_id:    Nickname or id of the video uploader.
237     uploader_url:   Full URL to a personal webpage of the video uploader.
238     channel:        Full name of the channel the video is uploaded on.
239                     Note that channel fields may or may not repeat uploader
240                     fields. This depends on a particular extractor.
241     channel_id:     Id of the channel.
242     channel_url:    Full URL to a channel webpage.
243     location:       Physical location where the video was filmed.
244     subtitles:      The available subtitles as a dictionary in the format
245                     {tag: subformats}. "tag" is usually a language code, and
246                     "subformats" is a list sorted from lower to higher
247                     preference, each element is a dictionary with the "ext"
248                     entry and one of:
249                         * "data": The subtitles file contents
250                         * "url": A URL pointing to the subtitles file
251                     "ext" will be calculated from URL if missing
252     automatic_captions: Like 'subtitles', used by the YoutubeIE for
253                     automatically generated captions
254     duration:       Length of the video in seconds, as an integer or float.
255     view_count:     How many users have watched the video on the platform.
256     like_count:     Number of positive ratings of the video
257     dislike_count:  Number of negative ratings of the video
258     repost_count:   Number of reposts of the video
259     average_rating: Average rating given by users, the scale used depends on the webpage
260     comment_count:  Number of comments on the video
261     comments:       A list of comments, each with one or more of the following
262                     properties (all but one of text or html optional):
263                         * "author" - human-readable name of the comment author
264                         * "author_id" - user ID of the comment author
265                         * "id" - Comment ID
266                         * "html" - Comment as HTML
267                         * "text" - Plain text of the comment
268                         * "timestamp" - UNIX timestamp of comment
269                         * "parent" - ID of the comment this one is replying to.
270                                      Set to "root" to indicate that this is a
271                                      comment to the original video.
272     age_limit:      Age restriction for the video, as an integer (years)
273     webpage_url:    The URL to the video webpage, if given to youtube-dl it
274                     should allow to get the same result again. (It will be set
275                     by YoutubeDL if it's missing)
276     categories:     A list of categories that the video falls in, for example
277                     ["Sports", "Berlin"]
278     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
279     is_live:        True, False, or None (=unknown). Whether this video is a
280                     live stream that goes on instead of a fixed-length video.
281     start_time:     Time in seconds where the reproduction should start, as
282                     specified in the URL.
283     end_time:       Time in seconds where the reproduction should end, as
284                     specified in the URL.
285     chapters:       A list of dictionaries, with the following entries:
286                         * "start_time" - The start time of the chapter in seconds
287                         * "end_time" - The end time of the chapter in seconds
288                         * "title" (optional, string)
289
290     The following fields should only be used when the video belongs to some logical
291     chapter or section:
292
293     chapter:        Name or title of the chapter the video belongs to.
294     chapter_number: Number of the chapter the video belongs to, as an integer.
295     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
296
297     The following fields should only be used when the video is an episode of some
298     series, programme or podcast:
299
300     series:         Title of the series or programme the video episode belongs to.
301     season:         Title of the season the video episode belongs to.
302     season_number:  Number of the season the video episode belongs to, as an integer.
303     season_id:      Id of the season the video episode belongs to, as a unicode string.
304     episode:        Title of the video episode. Unlike mandatory video title field,
305                     this field should denote the exact title of the video episode
306                     without any kind of decoration.
307     episode_number: Number of the video episode within a season, as an integer.
308     episode_id:     Id of the video episode, as a unicode string.
309
310     The following fields should only be used when the media is a track or a part of
311     a music album:
312
313     track:          Title of the track.
314     track_number:   Number of the track within an album or a disc, as an integer.
315     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
316                     as a unicode string.
317     artist:         Artist(s) of the track.
318     genre:          Genre(s) of the track.
319     album:          Title of the album the track belongs to.
320     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
321     album_artist:   List of all artists appeared on the album (e.g.
322                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
323                     and compilations).
324     disc_number:    Number of the disc or other physical medium the track belongs to,
325                     as an integer.
326     release_year:   Year (YYYY) when the album was released.
327
328     Unless mentioned otherwise, the fields should be Unicode strings.
329
330     Unless mentioned otherwise, None is equivalent to absence of information.
331
332
333     _type "playlist" indicates multiple videos.
334     There must be a key "entries", which is a list, an iterable, or a PagedList
335     object, each element of which is a valid dictionary by this specification.
336
337     Additionally, playlists can have "id", "title", "description", "uploader",
338     "uploader_id", "uploader_url" attributes with the same semantics as videos
339     (see above).
340
341
342     _type "multi_video" indicates that there are multiple videos that
343     form a single show, for example multiple acts of an opera or TV episode.
344     It must have an entries key like a playlist and contain all the keys
345     required for a video at the same time.
346
347
348     _type "url" indicates that the video must be extracted from another
349     location, possibly by a different extractor. Its only required key is:
350     "url" - the next URL to extract.
351     The key "ie_key" can be set to the class name (minus the trailing "IE",
352     e.g. "Youtube") if the extractor class is known in advance.
353     Additionally, the dictionary may have any properties of the resolved entity
354     known in advance, for example "title" if the title of the referred video is
355     known ahead of time.
356
357
358     _type "url_transparent" entities have the same specification as "url", but
359     indicate that the given additional information is more precise than the one
360     associated with the resolved URL.
361     This is useful when a site employs a video service that hosts the video and
362     its technical metadata, but that video service does not embed a useful
363     title, description etc.
364
365
366     Subclasses of this one should re-define the _real_initialize() and
367     _real_extract() methods and define a _VALID_URL regexp.
368     Probably, they should also be added to the list of extractors.
369
370     _GEO_BYPASS attribute may be set to False in order to disable
371     geo restriction bypass mechanisms for a particular extractor.
372     Though it won't disable explicit geo restriction bypass based on
373     country code provided with geo_bypass_country.
374
375     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
376     countries for this extractor. One of these countries will be used by
377     geo restriction bypass mechanism right away in order to bypass
378     geo restriction, of course, if the mechanism is not disabled.
379
380     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
381     IP blocks in CIDR notation for this extractor. One of these IP blocks
382     will be used by geo restriction bypass mechanism similarly
383     to _GEO_COUNTRIES.
384
385     Finally, the _WORKING attribute should be set to False for broken IEs
386     in order to warn the users and skip the tests.
387     """
388
389     _ready = False
390     _downloader = None
391     _x_forwarded_for_ip = None
392     _GEO_BYPASS = True
393     _GEO_COUNTRIES = None
394     _GEO_IP_BLOCKS = None
395     _WORKING = True
396
397     def __init__(self, downloader=None):
398         """Constructor. Receives an optional downloader."""
399         self._ready = False
400         self._x_forwarded_for_ip = None
401         self.set_downloader(downloader)
402
403     @classmethod
404     def suitable(cls, url):
405         """Receives a URL and returns True if suitable for this IE."""
406
407         # This does not use has/getattr intentionally - we want to know whether
408         # we have cached the regexp for *this* class, whereas getattr would also
409         # match the superclass
410         if '_VALID_URL_RE' not in cls.__dict__:
411             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
412         return cls._VALID_URL_RE.match(url) is not None
413
414     @classmethod
415     def _match_id(cls, url):
416         if '_VALID_URL_RE' not in cls.__dict__:
417             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
418         m = cls._VALID_URL_RE.match(url)
419         assert m
420         return compat_str(m.group('id'))
421
422     @classmethod
423     def working(cls):
424         """Getter method for _WORKING."""
425         return cls._WORKING
426
427     def initialize(self):
428         """Initializes an instance (authentication, etc)."""
429         self._initialize_geo_bypass({
430             'countries': self._GEO_COUNTRIES,
431             'ip_blocks': self._GEO_IP_BLOCKS,
432         })
433         if not self._ready:
434             self._real_initialize()
435             self._ready = True
436
437     def _initialize_geo_bypass(self, geo_bypass_context):
438         """
439         Initialize geo restriction bypass mechanism.
440
441         This method is used to initialize geo bypass mechanism based on faking
442         X-Forwarded-For HTTP header. A random country from provided country list
443         is selected and a random IP belonging to this country is generated. This
444         IP will be passed as X-Forwarded-For HTTP header in all subsequent
445         HTTP requests.
446
447         This method will be used for initial geo bypass mechanism initialization
448         during the instance initialization with _GEO_COUNTRIES and
449         _GEO_IP_BLOCKS.
450
451         You may also manually call it from extractor's code if geo bypass
452         information is not available beforehand (e.g. obtained during
453         extraction) or due to some other reason. In this case you should pass
454         this information in geo bypass context passed as first argument. It may
455         contain following fields:
456
457         countries:  List of geo unrestricted countries (similar
458                     to _GEO_COUNTRIES)
459         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
460                     (similar to _GEO_IP_BLOCKS)
461
462         """
463         if not self._x_forwarded_for_ip:
464
465             # Geo bypass mechanism is explicitly disabled by user
466             if not self._downloader.params.get('geo_bypass', True):
467                 return
468
469             if not geo_bypass_context:
470                 geo_bypass_context = {}
471
472             # Backward compatibility: previously _initialize_geo_bypass
473             # expected a list of countries, some 3rd party code may still use
474             # it this way
475             if isinstance(geo_bypass_context, (list, tuple)):
476                 geo_bypass_context = {
477                     'countries': geo_bypass_context,
478                 }
479
480             # The whole point of geo bypass mechanism is to fake IP
481             # as X-Forwarded-For HTTP header based on some IP block or
482             # country code.
483
484             # Path 1: bypassing based on IP block in CIDR notation
485
486             # Explicit IP block specified by user, use it right away
487             # regardless of whether extractor is geo bypassable or not
488             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
489
490             # Otherwise use random IP block from geo bypass context but only
491             # if extractor is known as geo bypassable
492             if not ip_block:
493                 ip_blocks = geo_bypass_context.get('ip_blocks')
494                 if self._GEO_BYPASS and ip_blocks:
495                     ip_block = random.choice(ip_blocks)
496
497             if ip_block:
498                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
499                 if self._downloader.params.get('verbose', False):
500                     self._downloader.to_screen(
501                         '[debug] Using fake IP %s as X-Forwarded-For.'
502                         % self._x_forwarded_for_ip)
503                 return
504
505             # Path 2: bypassing based on country code
506
507             # Explicit country code specified by user, use it right away
508             # regardless of whether extractor is geo bypassable or not
509             country = self._downloader.params.get('geo_bypass_country', None)
510
511             # Otherwise use random country code from geo bypass context but
512             # only if extractor is known as geo bypassable
513             if not country:
514                 countries = geo_bypass_context.get('countries')
515                 if self._GEO_BYPASS and countries:
516                     country = random.choice(countries)
517
518             if country:
519                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
520                 if self._downloader.params.get('verbose', False):
521                     self._downloader.to_screen(
522                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
523                         % (self._x_forwarded_for_ip, country.upper()))
524
525     def extract(self, url):
526         """Extracts URL information and returns it in list of dicts."""
527         try:
528             for _ in range(2):
529                 try:
530                     self.initialize()
531                     ie_result = self._real_extract(url)
532                     if self._x_forwarded_for_ip:
533                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
534                     return ie_result
535                 except GeoRestrictedError as e:
536                     if self.__maybe_fake_ip_and_retry(e.countries):
537                         continue
538                     raise
539         except ExtractorError:
540             raise
541         except compat_http_client.IncompleteRead as e:
542             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
543         except (KeyError, StopIteration) as e:
544             raise ExtractorError('An extractor error has occurred.', cause=e)
545
546     def __maybe_fake_ip_and_retry(self, countries):
547         if (not self._downloader.params.get('geo_bypass_country', None)
548                 and self._GEO_BYPASS
549                 and self._downloader.params.get('geo_bypass', True)
550                 and not self._x_forwarded_for_ip
551                 and countries):
552             country_code = random.choice(countries)
553             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
554             if self._x_forwarded_for_ip:
555                 self.report_warning(
556                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
557                     % (self._x_forwarded_for_ip, country_code.upper()))
558                 return True
559         return False
560
561     def set_downloader(self, downloader):
562         """Sets the downloader for this IE."""
563         self._downloader = downloader
564
565     def _real_initialize(self):
566         """Real initialization process. Redefine in subclasses."""
567         pass
568
569     def _real_extract(self, url):
570         """Real extraction process. Redefine in subclasses."""
571         pass
572
573     @classmethod
574     def ie_key(cls):
575         """A string for getting the InfoExtractor with get_info_extractor"""
576         return compat_str(cls.__name__[:-2])
577
578     @property
579     def IE_NAME(self):
580         return compat_str(type(self).__name__[:-2])
581
582     @staticmethod
583     def __can_accept_status_code(err, expected_status):
584         assert isinstance(err, compat_urllib_error.HTTPError)
585         if expected_status is None:
586             return False
587         if isinstance(expected_status, compat_integer_types):
588             return err.code == expected_status
589         elif isinstance(expected_status, (list, tuple)):
590             return err.code in expected_status
591         elif callable(expected_status):
592             return expected_status(err.code) is True
593         else:
594             assert False
595
596     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
597         """
598         Return the response handle.
599
600         See _download_webpage docstring for arguments specification.
601         """
602         if note is None:
603             self.report_download_webpage(video_id)
604         elif note is not False:
605             if video_id is None:
606                 self.to_screen('%s' % (note,))
607             else:
608                 self.to_screen('%s: %s' % (video_id, note))
609
610         # Some sites check X-Forwarded-For HTTP header in order to figure out
611         # the origin of the client behind proxy. This allows bypassing geo
612         # restriction by faking this header's value to IP that belongs to some
613         # geo unrestricted country. We will do so once we encounter any
614         # geo restriction error.
615         if self._x_forwarded_for_ip:
616             if 'X-Forwarded-For' not in headers:
617                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
618
619         if isinstance(url_or_request, compat_urllib_request.Request):
620             url_or_request = update_Request(
621                 url_or_request, data=data, headers=headers, query=query)
622         else:
623             if query:
624                 url_or_request = update_url_query(url_or_request, query)
625             if data is not None or headers:
626                 url_or_request = sanitized_Request(url_or_request, data, headers)
627         exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
628         if hasattr(ssl, 'CertificateError'):
629             exceptions.append(ssl.CertificateError)
630         try:
631             return self._downloader.urlopen(url_or_request)
632         except tuple(exceptions) as err:
633             if isinstance(err, compat_urllib_error.HTTPError):
634                 if self.__can_accept_status_code(err, expected_status):
635                     # Retain reference to error to prevent file object from
636                     # being closed before it can be read. Works around the
637                     # effects of <https://bugs.python.org/issue15002>
638                     # introduced in Python 3.4.1.
639                     err.fp._error = err
640                     return err.fp
641
642             if errnote is False:
643                 return False
644             if errnote is None:
645                 errnote = 'Unable to download webpage'
646
647             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
648             if fatal:
649                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
650             else:
651                 self._downloader.report_warning(errmsg)
652                 return False
653
654     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
655         """
656         Return a tuple (page content as string, URL handle).
657
658         See _download_webpage docstring for arguments specification.
659         """
660         # Strip hashes from the URL (#1038)
661         if isinstance(url_or_request, (compat_str, str)):
662             url_or_request = url_or_request.partition('#')[0]
663
664         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
665         if urlh is False:
666             assert not fatal
667             return False
668         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
669         return (content, urlh)
670
671     @staticmethod
672     def _guess_encoding_from_content(content_type, webpage_bytes):
673         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
674         if m:
675             encoding = m.group(1)
676         else:
677             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
678                           webpage_bytes[:1024])
679             if m:
680                 encoding = m.group(1).decode('ascii')
681             elif webpage_bytes.startswith(b'\xff\xfe'):
682                 encoding = 'utf-16'
683             else:
684                 encoding = 'utf-8'
685
686         return encoding
687
    def __check_blocked(self, content):
        """
        Raise ExtractorError when `content` is a known censorship or
        filtering interstitial (Websense, Indian censorship page, Russian
        blocklist.rkn.gov.ru) instead of the requested document.
        """
        # Known block pages carry their fingerprint near the top of the
        # document; 512 characters cover the checks done on `first_block`
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            # The Websense page links to an explanation inside an <iframe>
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            # Try to include the official block reason shown on the page
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
715
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read the response body from `urlh` and return it decoded to a string.

        prefix -- optional bytes prepended to the body before decoding
        encoding -- character encoding; guessed from the response when falsy

        Honors the dump_intermediate_pages and write_pages options and
        raises ExtractorError when the content is a known block page.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps arbitrary (possibly binary) bodies screen-safe
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the filename within common filesystem limits by
                # replacing the overlong tail with a hash (stays unique)
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown/unsupported encoding name - fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
752
753     def _download_webpage(
754             self, url_or_request, video_id, note=None, errnote=None,
755             fatal=True, tries=1, timeout=5, encoding=None, data=None,
756             headers={}, query={}, expected_status=None):
757         """
758         Return the data of the page as a string.
759
760         Arguments:
761         url_or_request -- plain text URL as a string or
762             a compat_urllib_request.Requestobject
763         video_id -- Video/playlist/item identifier (string)
764
765         Keyword arguments:
766         note -- note printed before downloading (string)
767         errnote -- note printed in case of an error (string)
768         fatal -- flag denoting whether error should be considered fatal,
769             i.e. whether it should cause ExtractionError to be raised,
770             otherwise a warning will be reported and extraction continued
771         tries -- number of tries
772         timeout -- sleep interval between tries
773         encoding -- encoding for a page content decoding, guessed automatically
774             when not explicitly specified
775         data -- POST data (bytes)
776         headers -- HTTP headers (dict)
777         query -- URL query (dict)
778         expected_status -- allows to accept failed HTTP requests (non 2xx
779             status code) by explicitly specifying a set of accepted status
780             codes. Can be any of the following entities:
781                 - an integer type specifying an exact failed status code to
782                   accept
783                 - a list or a tuple of integer types specifying a list of
784                   failed status codes to accept
785                 - a callable accepting an actual failed status code and
786                   returning True if it should be accepted
787             Note that this argument does not affect success status codes (2xx)
788             which are always accepted.
789         """
790
791         success = False
792         try_count = 0
793         while success is False:
794             try:
795                 res = self._download_webpage_handle(
796                     url_or_request, video_id, note, errnote, fatal,
797                     encoding=encoding, data=data, headers=headers, query=query,
798                     expected_status=expected_status)
799                 success = True
800             except compat_http_client.IncompleteRead as e:
801                 try_count += 1
802                 if try_count >= tries:
803                     raise e
804                 self._sleep(timeout, video_id)
805         if res is False:
806             return res
807         else:
808             content, _ = res
809             return content
810
811     def _download_xml_handle(
812             self, url_or_request, video_id, note='Downloading XML',
813             errnote='Unable to download XML', transform_source=None,
814             fatal=True, encoding=None, data=None, headers={}, query={},
815             expected_status=None):
816         """
817         Return a tuple (xml as an compat_etree_Element, URL handle).
818
819         See _download_webpage docstring for arguments specification.
820         """
821         res = self._download_webpage_handle(
822             url_or_request, video_id, note, errnote, fatal=fatal,
823             encoding=encoding, data=data, headers=headers, query=query,
824             expected_status=expected_status)
825         if res is False:
826             return res
827         xml_string, urlh = res
828         return self._parse_xml(
829             xml_string, video_id, transform_source=transform_source,
830             fatal=fatal), urlh
831
832     def _download_xml(
833             self, url_or_request, video_id,
834             note='Downloading XML', errnote='Unable to download XML',
835             transform_source=None, fatal=True, encoding=None,
836             data=None, headers={}, query={}, expected_status=None):
837         """
838         Return the xml as an compat_etree_Element.
839
840         See _download_webpage docstring for arguments specification.
841         """
842         res = self._download_xml_handle(
843             url_or_request, video_id, note=note, errnote=errnote,
844             transform_source=transform_source, fatal=fatal, encoding=encoding,
845             data=data, headers=headers, query=query,
846             expected_status=expected_status)
847         return res if res is False else res[0]
848
849     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
850         if transform_source:
851             xml_string = transform_source(xml_string)
852         try:
853             return compat_etree_fromstring(xml_string.encode('utf-8'))
854         except compat_xml_parse_error as ve:
855             errmsg = '%s: Failed to parse XML ' % video_id
856             if fatal:
857                 raise ExtractorError(errmsg, cause=ve)
858             else:
859                 self.report_warning(errmsg + str(ve))
860
861     def _download_json_handle(
862             self, url_or_request, video_id, note='Downloading JSON metadata',
863             errnote='Unable to download JSON metadata', transform_source=None,
864             fatal=True, encoding=None, data=None, headers={}, query={},
865             expected_status=None):
866         """
867         Return a tuple (JSON object, URL handle).
868
869         See _download_webpage docstring for arguments specification.
870         """
871         res = self._download_webpage_handle(
872             url_or_request, video_id, note, errnote, fatal=fatal,
873             encoding=encoding, data=data, headers=headers, query=query,
874             expected_status=expected_status)
875         if res is False:
876             return res
877         json_string, urlh = res
878         return self._parse_json(
879             json_string, video_id, transform_source=transform_source,
880             fatal=fatal), urlh
881
882     def _download_json(
883             self, url_or_request, video_id, note='Downloading JSON metadata',
884             errnote='Unable to download JSON metadata', transform_source=None,
885             fatal=True, encoding=None, data=None, headers={}, query={},
886             expected_status=None):
887         """
888         Return the JSON object as a dict.
889
890         See _download_webpage docstring for arguments specification.
891         """
892         res = self._download_json_handle(
893             url_or_request, video_id, note=note, errnote=errnote,
894             transform_source=transform_source, fatal=fatal, encoding=encoding,
895             data=data, headers=headers, query=query,
896             expected_status=expected_status)
897         return res if res is False else res[0]
898
899     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
900         if transform_source:
901             json_string = transform_source(json_string)
902         try:
903             return json.loads(json_string)
904         except ValueError as ve:
905             errmsg = '%s: Failed to parse JSON ' % video_id
906             if fatal:
907                 raise ExtractorError(errmsg, cause=ve)
908             else:
909                 self.report_warning(errmsg + str(ve))
910
911     def report_warning(self, msg, video_id=None):
912         idstr = '' if video_id is None else '%s: ' % video_id
913         self._downloader.report_warning(
914             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
915
916     def to_screen(self, msg):
917         """Print msg to screen, prefixing it with '[ie_name]'"""
918         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
919
920     def report_extraction(self, id_or_name):
921         """Report information extraction."""
922         self.to_screen('%s: Extracting information' % id_or_name)
923
924     def report_download_webpage(self, video_id):
925         """Report webpage download."""
926         self.to_screen('%s: Downloading webpage' % video_id)
927
    def report_age_confirmation(self):
        """Report an attempt to confirm the user's age."""
        self.to_screen('Confirming age')
931
    def report_login(self):
        """Report an attempt to log in."""
        self.to_screen('Logging in')
935
936     @staticmethod
937     def raise_login_required(msg='This video is only available for registered users'):
938         raise ExtractorError(
939             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
940             expected=True)
941
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        """Abort extraction for geo-restricted content."""
        # NOTE(review): `countries` is forwarded to GeoRestrictedError;
        # presumably the country codes from which the content is reachable
        # (used by the geo-bypass machinery) - confirm against GeoUtils usage
        raise GeoRestrictedError(msg, countries=countries)
945
946     # Methods for following #608
947     @staticmethod
948     def url_result(url, ie=None, video_id=None, video_title=None):
949         """Returns a URL that points to a page that should be processed"""
950         # TODO: ie should be the class used for getting the info
951         video_info = {'_type': 'url',
952                       'url': url,
953                       'ie_key': ie}
954         if video_id is not None:
955             video_info['id'] = video_id
956         if video_title is not None:
957             video_info['title'] = video_title
958         return video_info
959
960     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
961         urls = orderedSet(
962             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
963             for m in matches)
964         return self.playlist_result(
965             urls, playlist_id=playlist_id, playlist_title=playlist_title)
966
967     @staticmethod
968     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
969         """Returns a playlist"""
970         video_info = {'_type': 'playlist',
971                       'entries': entries}
972         if playlist_id:
973             video_info['id'] = playlist_id
974         if playlist_title:
975             video_info['title'] = playlist_title
976         if playlist_description:
977             video_info['description'] = playlist_description
978         return video_info
979
980     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
981         """
982         Perform a regex search on the given string, using a single or a list of
983         patterns returning the first matching group.
984         In case of failure return a default value or raise a WARNING or a
985         RegexNotFoundError, depending on fatal, specifying the field name.
986         """
987         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
988             mobj = re.search(pattern, string, flags)
989         else:
990             for p in pattern:
991                 mobj = re.search(p, string, flags)
992                 if mobj:
993                     break
994
995         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
996             _name = '\033[0;34m%s\033[0m' % name
997         else:
998             _name = name
999
1000         if mobj:
1001             if group is None:
1002                 # return the first matching group
1003                 return next(g for g in mobj.groups() if g is not None)
1004             else:
1005                 return mobj.group(group)
1006         elif default is not NO_DEFAULT:
1007             return default
1008         elif fatal:
1009             raise RegexNotFoundError('Unable to extract %s' % _name)
1010         else:
1011             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1012             return None
1013
1014     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1015         """
1016         Like _search_regex, but strips HTML tags and unescapes entities.
1017         """
1018         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1019         if res:
1020             return clean_html(res).strip()
1021         else:
1022             return res
1023
1024     def _get_netrc_login_info(self, netrc_machine=None):
1025         username = None
1026         password = None
1027         netrc_machine = netrc_machine or self._NETRC_MACHINE
1028
1029         if self._downloader.params.get('usenetrc', False):
1030             try:
1031                 info = netrc.netrc().authenticators(netrc_machine)
1032                 if info is not None:
1033                     username = info[0]
1034                     password = info[2]
1035                 else:
1036                     raise netrc.NetrcParseError(
1037                         'No authenticators for %s' % netrc_machine)
1038             except (IOError, netrc.NetrcParseError) as err:
1039                 self._downloader.report_warning(
1040                     'parsing .netrc: %s' % error_to_compat_str(err))
1041
1042         return username, password
1043
1044     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1045         """
1046         Get the login info as (username, password)
1047         First look for the manually specified credentials using username_option
1048         and password_option as keys in params dictionary. If no such credentials
1049         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1050         value.
1051         If there's no info available, return (None, None)
1052         """
1053         if self._downloader is None:
1054             return (None, None)
1055
1056         downloader_params = self._downloader.params
1057
1058         # Attempt to use provided username and password or .netrc data
1059         if downloader_params.get(username_option) is not None:
1060             username = downloader_params[username_option]
1061             password = downloader_params[password_option]
1062         else:
1063             username, password = self._get_netrc_login_info(netrc_machine)
1064
1065         return username, password
1066
1067     def _get_tfa_info(self, note='two-factor verification code'):
1068         """
1069         Get the two-factor authentication info
1070         TODO - asking the user will be required for sms/phone verify
1071         currently just uses the command line option
1072         If there's no info available, return None
1073         """
1074         if self._downloader is None:
1075             return None
1076         downloader_params = self._downloader.params
1077
1078         if downloader_params.get('twofactor') is not None:
1079             return downloader_params['twofactor']
1080
1081         return compat_getpass('Type %s and press [Return]: ' % note)
1082
1083     # Helper functions for extracting OpenGraph info
1084     @staticmethod
1085     def _og_regexes(prop):
1086         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1087         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1088                        % {'prop': re.escape(prop)})
1089         template = r'<meta[^>]+?%s[^>]+?%s'
1090         return [
1091             template % (property_re, content_re),
1092             template % (content_re, property_re),
1093         ]
1094
    @staticmethod
    def _meta_regex(prop):
        """Return a regex capturing (as group 'content') the content
        attribute of a <meta> tag whose itemprop/name/property/id/http-equiv
        attribute equals `prop`."""
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1100
1101     def _og_search_property(self, prop, html, name=None, **kargs):
1102         if not isinstance(prop, (list, tuple)):
1103             prop = [prop]
1104         if name is None:
1105             name = 'OpenGraph %s' % prop[0]
1106         og_regexes = []
1107         for p in prop:
1108             og_regexes.extend(self._og_regexes(p))
1109         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1110         if escaped is None:
1111             return None
1112         return unescapeHTML(escaped)
1113
1114     def _og_search_thumbnail(self, html, **kargs):
1115         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1116
1117     def _og_search_description(self, html, **kargs):
1118         return self._og_search_property('description', html, fatal=False, **kargs)
1119
1120     def _og_search_title(self, html, **kargs):
1121         return self._og_search_property('title', html, **kargs)
1122
1123     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1124         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1125         if secure:
1126             regexes = self._og_regexes('video:secure_url') + regexes
1127         return self._html_search_regex(regexes, html, name, **kargs)
1128
1129     def _og_search_url(self, html, **kargs):
1130         return self._og_search_property('url', html, **kargs)
1131
1132     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1133         if not isinstance(name, (list, tuple)):
1134             name = [name]
1135         if display_name is None:
1136             display_name = name[0]
1137         return self._html_search_regex(
1138             [self._meta_regex(n) for n in name],
1139             html, display_name, fatal=fatal, group='content', **kwargs)
1140
1141     def _dc_search_uploader(self, html):
1142         return self._html_search_meta('dc.creator', html, 'uploader')
1143
1144     def _rta_search(self, html):
1145         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1146         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1147                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1148                      html):
1149             return 18
1150         return 0
1151
1152     def _media_rating_search(self, html):
1153         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1154         rating = self._html_search_meta('rating', html)
1155
1156         if not rating:
1157             return None
1158
1159         RATING_TABLE = {
1160             'safe for kids': 0,
1161             'general': 8,
1162             '14 years': 14,
1163             'mature': 17,
1164             'restricted': 19,
1165         }
1166         return RATING_TABLE.get(rating.lower())
1167
1168     def _family_friendly_search(self, html):
1169         # See http://schema.org/VideoObject
1170         family_friendly = self._html_search_meta(
1171             'isFamilyFriendly', html, default=None)
1172
1173         if not family_friendly:
1174             return None
1175
1176         RATING_TABLE = {
1177             '1': 0,
1178             'true': 0,
1179             '0': 18,
1180             'false': 18,
1181         }
1182         return RATING_TABLE.get(family_friendly.lower())
1183
1184     def _twitter_search_player(self, html):
1185         return self._html_search_meta('twitter:player', html,
1186                                       'twitter card player')
1187
1188     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1189         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1190         default = kwargs.get('default', NO_DEFAULT)
1191         # JSON-LD may be malformed and thus `fatal` should be respected.
1192         # At the same time `default` may be passed that assumes `fatal=False`
1193         # for _search_regex. Let's simulate the same behavior here as well.
1194         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1195         json_ld = []
1196         for mobj in json_ld_list:
1197             json_ld_item = self._parse_json(
1198                 mobj.group('json_ld'), video_id, fatal=fatal)
1199             if not json_ld_item:
1200                 continue
1201             if isinstance(json_ld_item, dict):
1202                 json_ld.append(json_ld_item)
1203             elif isinstance(json_ld_item, (list, tuple)):
1204                 json_ld.extend(json_ld_item)
1205         if json_ld:
1206             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1207         if json_ld:
1208             return json_ld
1209         if default is not NO_DEFAULT:
1210             return default
1211         elif fatal:
1212             raise RegexNotFoundError('Unable to extract JSON-LD')
1213         else:
1214             self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1215             return {}
1216
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Convert already-extracted JSON-LD data (a string, dict or list of
        dicts) into an info dict. Only items carrying an @context key are
        considered; when expected_type is given, items of other @type are
        skipped. Returns a dict with all None values filtered out.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        # Normalize to a list of items
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org InteractionCounter types to info dict count fields
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_statistic(e):
            # Fill *_count fields in `info` from e['interactionStatistic'];
            # the first occurrence of each count kind wins
            interaction_statistic = e.get('interactionStatistic')
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = is_e.get('interactionType')
                if not isinstance(interaction_type, compat_str):
                    continue
                interaction_count = int_or_none(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # interactionType may be a full schema.org URL - only its
                # last path segment names the action
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Merge the fields of a schema.org VideoObject into `info`
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # Only proper JSON-LD items declare an @context
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # Fall back to the episode name as the title
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Without an expected_type keep scanning further items;
                    # otherwise the requested type was found - stop
                    if expected_type is None:
                        continue
                    else:
                        break
                # Some items embed the video as a nested VideoObject
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)
1329
1330     @staticmethod
1331     def _hidden_inputs(html):
1332         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1333         hidden_inputs = {}
1334         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1335             attrs = extract_attributes(input)
1336             if not input:
1337                 continue
1338             if attrs.get('type') not in ('hidden', 'submit'):
1339                 continue
1340             name = attrs.get('name') or attrs.get('id')
1341             value = attrs.get('value')
1342             if name and value is not None:
1343                 hidden_inputs[name] = value
1344         return hidden_inputs
1345
    def _form_hidden_inputs(self, form_id, html):
        """Return the hidden input fields of the <form> whose id attribute
        equals form_id. The form is located via _search_regex, which handles
        the missing-form error path.
        """
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)
1351
    def _sort_formats(self, formats, field_preference=None):
        """Sort `formats` in place, ascending by quality key (best last).

        formats          -- list of format dicts; mutated: sorted, and some
                            missing fields (tbr, ext) may be filled in
        field_preference -- optional list/tuple of format-dict keys; when
                            given, formats are ordered by those fields only,
                            in the given order of significance

        Raises ExtractorError when `formats` is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied field ordering takes precedence over all the
            # heuristics below; missing format_id sorts as '', other missing
            # fields sort as -1.
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple is compared lexicographically: earlier entries dominate.
            # Missing numeric fields sort as -1 (worst), missing format_id as ''.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1427
1428     def _check_formats(self, formats, video_id):
1429         if formats:
1430             formats[:] = filter(
1431                 lambda f: self._is_valid_url(
1432                     f['url'], video_id,
1433                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1434                 formats)
1435
1436     @staticmethod
1437     def _remove_duplicate_formats(formats):
1438         format_urls = set()
1439         unique_formats = []
1440         for f in formats:
1441             if f['url'] not in format_urls:
1442                 format_urls.add(f['url'])
1443                 unique_formats.append(f)
1444         formats[:] = unique_formats
1445
1446     def _is_valid_url(self, url, video_id, item='video', headers={}):
1447         url = self._proto_relative_url(url, scheme='http:')
1448         # For now assume non HTTP(S) URLs always valid
1449         if not (url.startswith('http://') or url.startswith('https://')):
1450             return True
1451         try:
1452             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1453             return True
1454         except ExtractorError:
1455             self.to_screen(
1456                 '%s: %s URL is invalid, skipping' % (video_id, item))
1457             return False
1458
1459     def http_scheme(self):
1460         """ Either "http:" or "https:", depending on the user's preferences """
1461         return (
1462             'http:'
1463             if self._downloader.params.get('prefer_insecure', False)
1464             else 'https:')
1465
1466     def _proto_relative_url(self, url, scheme=None):
1467         if url is None:
1468             return url
1469         if url.startswith('//'):
1470             if scheme is None:
1471                 scheme = self.http_scheme()
1472             return scheme + url
1473         else:
1474             return url
1475
1476     def _sleep(self, timeout, video_id, msg_template=None):
1477         if msg_template is None:
1478             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1479         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1480         self.to_screen(msg)
1481         time.sleep(timeout)
1482
1483     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1484                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1485                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1486         manifest = self._download_xml(
1487             manifest_url, video_id, 'Downloading f4m manifest',
1488             'Unable to download f4m manifest',
1489             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1490             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1491             transform_source=transform_source,
1492             fatal=fatal, data=data, headers=headers, query=query)
1493
1494         if manifest is False:
1495             return []
1496
1497         return self._parse_f4m_formats(
1498             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1499             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1500
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m (HDS) manifest XML element into a list of format dicts.

        manifest     -- the parsed XML element (compat_etree_Element)
        manifest_url -- URL the manifest was fetched from; used to resolve
                        relative media URLs
        Returns [] for DRM-protected manifests and for non-Element input
        when fatal is disabled.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio-only mimeType marks every media entry as having no video
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1601
1602     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1603         return {
1604             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1605             'url': m3u8_url,
1606             'ext': ext,
1607             'protocol': 'm3u8',
1608             'preference': preference - 100 if preference else -100,
1609             'resolution': 'multiple',
1610             'format_note': 'Quality selection URL',
1611         }
1612
1613     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1614                               entry_protocol='m3u8', preference=None,
1615                               m3u8_id=None, note=None, errnote=None,
1616                               fatal=True, live=False, data=None, headers={},
1617                               query={}):
1618         res = self._download_webpage_handle(
1619             m3u8_url, video_id,
1620             note=note or 'Downloading m3u8 information',
1621             errnote=errnote or 'Failed to download m3u8 information',
1622             fatal=fatal, data=data, headers=headers, query=query)
1623
1624         if res is False:
1625             return []
1626
1627         m3u8_doc, urlh = res
1628         m3u8_url = urlh.geturl()
1629
1630         return self._parse_m3u8_formats(
1631             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1632             preference=preference, m3u8_id=m3u8_id, live=live)
1633
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an HLS playlist document into a list of format dicts.

        m3u8_doc -- the playlist text; m3u8_url -- its final URL, used to
        resolve relative media URLs. Returns [] for DRM-protected playlists.
        A media playlist yields a single format; a master playlist yields one
        format per variant stream plus per-rendition audio/video formats.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # groups maps GROUP-ID -> list of EXT-X-MEDIA attribute dicts;
        # last_stream_inf holds the attributes of the most recent
        # EXT-X-STREAM-INF tag, consumed by the variant URI line that follows.
        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Record an EXT-X-MEDIA rendition in `groups` and, for audio/video
            # renditions that carry their own URI, emit a format for it.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any non-tag, non-blank line is the variant URI belonging to
                # the preceding EXT-X-STREAM-INF tag.
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)

                # for DailyMotion
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

                last_stream_inf = {}
        return formats
1811
1812     @staticmethod
1813     def _xpath_ns(path, namespace=None):
1814         if not namespace:
1815             return path
1816         out = []
1817         for c in path.split('/'):
1818             if not c or c == '.':
1819                 out.append(c)
1820             else:
1821                 out.append('{%s}%s' % (namespace, c))
1822         return '/'.join(out)
1823
1824     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1825         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1826
1827         if smil is False:
1828             assert not fatal
1829             return []
1830
1831         namespace = self._parse_smil_namespace(smil)
1832
1833         return self._parse_smil_formats(
1834             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1835
1836     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1837         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1838         if smil is False:
1839             return {}
1840         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1841
    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        """Download and XML-parse a SMIL document (callers check for a False
        result, returned when fatal is disabled and the download fails).
        """
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1846
1847     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1848         namespace = self._parse_smil_namespace(smil)
1849
1850         formats = self._parse_smil_formats(
1851             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1852         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1853
1854         video_id = os.path.splitext(url_basename(smil_url))[0]
1855         title = None
1856         description = None
1857         upload_date = None
1858         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1859             name = meta.attrib.get('name')
1860             content = meta.attrib.get('content')
1861             if not name or not content:
1862                 continue
1863             if not title and name == 'title':
1864                 title = content
1865             elif not description and name in ('description', 'abstract'):
1866                 description = content
1867             elif not upload_date and name == 'date':
1868                 upload_date = unified_strdate(content)
1869
1870         thumbnails = [{
1871             'id': image.get('type'),
1872             'url': image.get('src'),
1873             'width': int_or_none(image.get('width')),
1874             'height': int_or_none(image.get('height')),
1875         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1876
1877         return {
1878             'id': video_id,
1879             'title': title or video_id,
1880             'description': description,
1881             'upload_date': upload_date,
1882             'thumbnails': thumbnails,
1883             'formats': formats,
1884             'subtitles': subtitles,
1885         }
1886
    def _parse_smil_namespace(self, smil):
        """Extract the XML namespace from the root <smil> tag, or None."""
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1890
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from the <video>/<audio> elements of a parsed
        SMIL document.

        transform_rtmp_url -- optional callable (streamer, play_path) ->
                              (streamer, play_path) applied to RTMP entries
        """
        # Resolve relative src attributes against an explicit base from
        # <head>/<meta>, falling back to the SMIL URL itself.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # srcs tracks already-seen src values to skip duplicates
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry m3u8 result carries no quality metadata of
                # its own, so fill it in from this medium's attributes.
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
1985
1986     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1987         urls = []
1988         subtitles = {}
1989         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1990             src = textstream.get('src')
1991             if not src or src in urls:
1992                 continue
1993             urls.append(src)
1994             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1995             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1996             subtitles.setdefault(lang, []).append({
1997                 'url': src,
1998                 'ext': ext,
1999             })
2000         return subtitles
2001
2002     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2003         xspf = self._download_xml(
2004             xspf_url, playlist_id, 'Downloading xpsf playlist',
2005             'Unable to download xspf manifest', fatal=fatal)
2006         if xspf is False:
2007             return []
2008         return self._parse_xspf(
2009             xspf, playlist_id, xspf_url=xspf_url,
2010             xspf_base_url=base_url(xspf_url))
2011
2012     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2013         NS_MAP = {
2014             'xspf': 'http://xspf.org/ns/0/',
2015             's1': 'http://static.streamone.nl/player/ns/0',
2016         }
2017
2018         entries = []
2019         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2020             title = xpath_text(
2021                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2022             description = xpath_text(
2023                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2024             thumbnail = xpath_text(
2025                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2026             duration = float_or_none(
2027                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2028
2029             formats = []
2030             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2031                 format_url = urljoin(xspf_base_url, location.text)
2032                 if not format_url:
2033                     continue
2034                 formats.append({
2035                     'url': format_url,
2036                     'manifest_url': xspf_url,
2037                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2038                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2039                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2040                 })
2041             self._sort_formats(formats)
2042
2043             entries.append({
2044                 'id': playlist_id,
2045                 'title': title,
2046                 'description': description,
2047                 'thumbnail': thumbnail,
2048                 'duration': duration,
2049                 'formats': formats,
2050             })
2051         return entries
2052
2053     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}):
2054         res = self._download_xml_handle(
2055             mpd_url, video_id,
2056             note=note or 'Downloading MPD manifest',
2057             errnote=errnote or 'Failed to download MPD manifest',
2058             fatal=fatal, data=data, headers=headers, query=query)
2059         if res is False:
2060             return []
2061         mpd_doc, urlh = res
2062         if mpd_doc is None:
2063             return []
2064         mpd_base_url = base_url(urlh.geturl())
2065
2066         return self._parse_mpd_formats(
2067             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
2068             formats_dict=formats_dict, mpd_url=mpd_url)
2069
2070     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
2071         """
2072         Parse formats from MPD manifest.
2073         References:
2074          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2075             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2076          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2077         """
2078         if mpd_doc.get('type') == 'dynamic':
2079             return []
2080
2081         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2082
2083         def _add_ns(path):
2084             return self._xpath_ns(path, namespace)
2085
2086         def is_drm_protected(element):
2087             return element.find(_add_ns('ContentProtection')) is not None
2088
2089         def extract_multisegment_info(element, ms_parent_info):
2090             ms_info = ms_parent_info.copy()
2091
2092             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2093             # common attributes and elements.  We will only extract relevant
2094             # for us.
2095             def extract_common(source):
2096                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2097                 if segment_timeline is not None:
2098                     s_e = segment_timeline.findall(_add_ns('S'))
2099                     if s_e:
2100                         ms_info['total_number'] = 0
2101                         ms_info['s'] = []
2102                         for s in s_e:
2103                             r = int(s.get('r', 0))
2104                             ms_info['total_number'] += 1 + r
2105                             ms_info['s'].append({
2106                                 't': int(s.get('t', 0)),
2107                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2108                                 'd': int(s.attrib['d']),
2109                                 'r': r,
2110                             })
2111                 start_number = source.get('startNumber')
2112                 if start_number:
2113                     ms_info['start_number'] = int(start_number)
2114                 timescale = source.get('timescale')
2115                 if timescale:
2116                     ms_info['timescale'] = int(timescale)
2117                 segment_duration = source.get('duration')
2118                 if segment_duration:
2119                     ms_info['segment_duration'] = float(segment_duration)
2120
2121             def extract_Initialization(source):
2122                 initialization = source.find(_add_ns('Initialization'))
2123                 if initialization is not None:
2124                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2125
2126             segment_list = element.find(_add_ns('SegmentList'))
2127             if segment_list is not None:
2128                 extract_common(segment_list)
2129                 extract_Initialization(segment_list)
2130                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2131                 if segment_urls_e:
2132                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2133             else:
2134                 segment_template = element.find(_add_ns('SegmentTemplate'))
2135                 if segment_template is not None:
2136                     extract_common(segment_template)
2137                     media = segment_template.get('media')
2138                     if media:
2139                         ms_info['media'] = media
2140                     initialization = segment_template.get('initialization')
2141                     if initialization:
2142                         ms_info['initialization'] = initialization
2143                     else:
2144                         extract_Initialization(segment_template)
2145             return ms_info
2146
2147         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2148         formats = []
2149         for period in mpd_doc.findall(_add_ns('Period')):
2150             period_duration = parse_duration(period.get('duration')) or mpd_duration
2151             period_ms_info = extract_multisegment_info(period, {
2152                 'start_number': 1,
2153                 'timescale': 1,
2154             })
2155             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2156                 if is_drm_protected(adaptation_set):
2157                     continue
2158                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2159                 for representation in adaptation_set.findall(_add_ns('Representation')):
2160                     if is_drm_protected(representation):
2161                         continue
2162                     representation_attrib = adaptation_set.attrib.copy()
2163                     representation_attrib.update(representation.attrib)
2164                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2165                     mime_type = representation_attrib['mimeType']
2166                     content_type = mime_type.split('/')[0]
2167                     if content_type == 'text':
2168                         # TODO implement WebVTT downloading
2169                         pass
2170                     elif content_type in ('video', 'audio'):
2171                         base_url = ''
2172                         for element in (representation, adaptation_set, period, mpd_doc):
2173                             base_url_e = element.find(_add_ns('BaseURL'))
2174                             if base_url_e is not None:
2175                                 base_url = base_url_e.text + base_url
2176                                 if re.match(r'^https?://', base_url):
2177                                     break
2178                         if mpd_base_url and not re.match(r'^https?://', base_url):
2179                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2180                                 mpd_base_url += '/'
2181                             base_url = mpd_base_url + base_url
2182                         representation_id = representation_attrib.get('id')
2183                         lang = representation_attrib.get('lang')
2184                         url_el = representation.find(_add_ns('BaseURL'))
2185                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2186                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2187                         f = {
2188                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2189                             'manifest_url': mpd_url,
2190                             'ext': mimetype2ext(mime_type),
2191                             'width': int_or_none(representation_attrib.get('width')),
2192                             'height': int_or_none(representation_attrib.get('height')),
2193                             'tbr': float_or_none(bandwidth, 1000),
2194                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2195                             'fps': int_or_none(representation_attrib.get('frameRate')),
2196                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2197                             'format_note': 'DASH %s' % content_type,
2198                             'filesize': filesize,
2199                             'container': mimetype2ext(mime_type) + '_dash',
2200                         }
2201                         f.update(parse_codecs(representation_attrib.get('codecs')))
2202                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2203
2204                         def prepare_template(template_name, identifiers):
2205                             tmpl = representation_ms_info[template_name]
2206                             # First of, % characters outside $...$ templates
2207                             # must be escaped by doubling for proper processing
2208                             # by % operator string formatting used further (see
2209                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2210                             t = ''
2211                             in_template = False
2212                             for c in tmpl:
2213                                 t += c
2214                                 if c == '$':
2215                                     in_template = not in_template
2216                                 elif c == '%' and not in_template:
2217                                     t += c
2218                             # Next, $...$ templates are translated to their
2219                             # %(...) counterparts to be used with % operator
2220                             t = t.replace('$RepresentationID$', representation_id)
2221                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2222                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2223                             t.replace('$$', '$')
2224                             return t
2225
2226                         # @initialization is a regular template like @media one
2227                         # so it should be handled just the same way (see
2228                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2229                         if 'initialization' in representation_ms_info:
2230                             initialization_template = prepare_template(
2231                                 'initialization',
2232                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2233                                 # $Time$ shall not be included for @initialization thus
2234                                 # only $Bandwidth$ remains
2235                                 ('Bandwidth', ))
2236                             representation_ms_info['initialization_url'] = initialization_template % {
2237                                 'Bandwidth': bandwidth,
2238                             }
2239
2240                         def location_key(location):
2241                             return 'url' if re.match(r'^https?://', location) else 'path'
2242
2243                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2244
2245                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2246                             media_location_key = location_key(media_template)
2247
2248                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2249                             # can't be used at the same time
2250                             if '%(Number' in media_template and 's' not in representation_ms_info:
2251                                 segment_duration = None
2252                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2253                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2254                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2255                                 representation_ms_info['fragments'] = [{
2256                                     media_location_key: media_template % {
2257                                         'Number': segment_number,
2258                                         'Bandwidth': bandwidth,
2259                                     },
2260                                     'duration': segment_duration,
2261                                 } for segment_number in range(
2262                                     representation_ms_info['start_number'],
2263                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2264                             else:
2265                                 # $Number*$ or $Time$ in media template with S list available
2266                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2267                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2268                                 representation_ms_info['fragments'] = []
2269                                 segment_time = 0
2270                                 segment_d = None
2271                                 segment_number = representation_ms_info['start_number']
2272
2273                                 def add_segment_url():
2274                                     segment_url = media_template % {
2275                                         'Time': segment_time,
2276                                         'Bandwidth': bandwidth,
2277                                         'Number': segment_number,
2278                                     }
2279                                     representation_ms_info['fragments'].append({
2280                                         media_location_key: segment_url,
2281                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2282                                     })
2283
2284                                 for num, s in enumerate(representation_ms_info['s']):
2285                                     segment_time = s.get('t') or segment_time
2286                                     segment_d = s['d']
2287                                     add_segment_url()
2288                                     segment_number += 1
2289                                     for r in range(s.get('r', 0)):
2290                                         segment_time += segment_d
2291                                         add_segment_url()
2292                                         segment_number += 1
2293                                     segment_time += segment_d
2294                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2295                             # No media template
2296                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2297                             # or any YouTube dashsegments video
2298                             fragments = []
2299                             segment_index = 0
2300                             timescale = representation_ms_info['timescale']
2301                             for s in representation_ms_info['s']:
2302                                 duration = float_or_none(s['d'], timescale)
2303                                 for r in range(s.get('r', 0) + 1):
2304                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2305                                     fragments.append({
2306                                         location_key(segment_uri): segment_uri,
2307                                         'duration': duration,
2308                                     })
2309                                     segment_index += 1
2310                             representation_ms_info['fragments'] = fragments
2311                         elif 'segment_urls' in representation_ms_info:
2312                             # Segment URLs with no SegmentTimeline
2313                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2314                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2315                             fragments = []
2316                             segment_duration = float_or_none(
2317                                 representation_ms_info['segment_duration'],
2318                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2319                             for segment_url in representation_ms_info['segment_urls']:
2320                                 fragment = {
2321                                     location_key(segment_url): segment_url,
2322                                 }
2323                                 if segment_duration:
2324                                     fragment['duration'] = segment_duration
2325                                 fragments.append(fragment)
2326                             representation_ms_info['fragments'] = fragments
2327                         # If there is a fragments key available then we correctly recognized fragmented media.
2328                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2329                         # assumption is not necessarily correct since we may simply have no support for
2330                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2331                         if 'fragments' in representation_ms_info:
2332                             f.update({
2333                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2334                                 'url': mpd_url or base_url,
2335                                 'fragment_base_url': base_url,
2336                                 'fragments': [],
2337                                 'protocol': 'http_dash_segments',
2338                             })
2339                             if 'initialization_url' in representation_ms_info:
2340                                 initialization_url = representation_ms_info['initialization_url']
2341                                 if not f.get('url'):
2342                                     f['url'] = initialization_url
2343                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2344                             f['fragments'].extend(representation_ms_info['fragments'])
2345                         else:
2346                             # Assuming direct URL to unfragmented media.
2347                             f['url'] = base_url
2348
2349                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2350                         # is not necessarily unique within a Period thus formats with
2351                         # the same `format_id` are quite possible. There are numerous examples
2352                         # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
2353                         # https://github.com/ytdl-org/youtube-dl/issues/13919)
2354                         full_info = formats_dict.get(representation_id, {}).copy()
2355                         full_info.update(f)
2356                         formats.append(full_info)
2357                     else:
2358                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2359         return formats
2360
2361     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2362         res = self._download_xml_handle(
2363             ism_url, video_id,
2364             note=note or 'Downloading ISM manifest',
2365             errnote=errnote or 'Failed to download ISM manifest',
2366             fatal=fatal, data=data, headers=headers, query=query)
2367         if res is False:
2368             return []
2369         ism_doc, urlh = res
2370         if ism_doc is None:
2371             return []
2372
2373         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2374
2375     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2376         """
2377         Parse formats from ISM manifest.
2378         References:
2379          1. [MS-SSTR]: Smooth Streaming Protocol,
2380             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2381         """
2382         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2383             return []
2384
2385         duration = int(ism_doc.attrib['Duration'])
2386         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2387
2388         formats = []
2389         for stream in ism_doc.findall('StreamIndex'):
2390             stream_type = stream.get('Type')
2391             if stream_type not in ('video', 'audio'):
2392                 continue
2393             url_pattern = stream.attrib['Url']
2394             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2395             stream_name = stream.get('Name')
2396             for track in stream.findall('QualityLevel'):
2397                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2398                 # TODO: add support for WVC1 and WMAP
2399                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2400                     self.report_warning('%s is not a supported codec' % fourcc)
2401                     continue
2402                 tbr = int(track.attrib['Bitrate']) // 1000
2403                 # [1] does not mention Width and Height attributes. However,
2404                 # they're often present while MaxWidth and MaxHeight are
2405                 # missing, so should be used as fallbacks
2406                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2407                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2408                 sampling_rate = int_or_none(track.get('SamplingRate'))
2409
2410                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2411                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2412
2413                 fragments = []
2414                 fragment_ctx = {
2415                     'time': 0,
2416                 }
2417                 stream_fragments = stream.findall('c')
2418                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2419                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2420                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2421                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2422                     if not fragment_ctx['duration']:
2423                         try:
2424                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2425                         except IndexError:
2426                             next_fragment_time = duration
2427                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2428                     for _ in range(fragment_repeat):
2429                         fragments.append({
2430                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2431                             'duration': fragment_ctx['duration'] / stream_timescale,
2432                         })
2433                         fragment_ctx['time'] += fragment_ctx['duration']
2434
2435                 format_id = []
2436                 if ism_id:
2437                     format_id.append(ism_id)
2438                 if stream_name:
2439                     format_id.append(stream_name)
2440                 format_id.append(compat_str(tbr))
2441
2442                 formats.append({
2443                     'format_id': '-'.join(format_id),
2444                     'url': ism_url,
2445                     'manifest_url': ism_url,
2446                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2447                     'width': width,
2448                     'height': height,
2449                     'tbr': tbr,
2450                     'asr': sampling_rate,
2451                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2452                     'acodec': 'none' if stream_type == 'video' else fourcc,
2453                     'protocol': 'ism',
2454                     'fragments': fragments,
2455                     '_download_params': {
2456                         'duration': duration,
2457                         'timescale': stream_timescale,
2458                         'width': width or 0,
2459                         'height': height or 0,
2460                         'fourcc': fourcc,
2461                         'codec_private_data': track.get('CodecPrivateData'),
2462                         'sampling_rate': sampling_rate,
2463                         'channels': int_or_none(track.get('Channels', 2)),
2464                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2465                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2466                     },
2467                 })
2468         return formats
2469
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract media entries from HTML5 <video>/<audio> tags (and their
        AMP equivalents) found in webpage.

        Returns a list of dicts, each with 'formats', 'subtitles' and
        'thumbnail' (from the poster attribute, may be None).
        """
        def absolute_url(item_url):
            # Resolve item_url relative to the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Derive ext/codec hints from a type="..." attribute value
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Expand a single src URL into formats: m3u8/mpd manifests yield
            # multiple formats, anything else a single plain-URL format.
            # NOTE: type_info default is only read, never mutated.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # Nested <source> tags carry alternative renditions
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to a resolution embedded in a label,
                            # e.g. "1280x720"
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # formats[0] values (e.g. url) take precedence over f
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # <track> tags provide subtitles/captions
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2590
2591     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2592         formats = []
2593         hdcore_sign = 'hdcore=3.7.0'
2594         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2595         hds_host = hosts.get('hds')
2596         if hds_host:
2597             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2598         if 'hdcore=' not in f4m_url:
2599             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2600         f4m_formats = self._extract_f4m_formats(
2601             f4m_url, video_id, f4m_id='hds', fatal=False)
2602         for entry in f4m_formats:
2603             entry.update({'extra_param_to_segment_url': hdcore_sign})
2604         formats.extend(f4m_formats)
2605         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2606         hls_host = hosts.get('hls')
2607         if hls_host:
2608             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2609         formats.extend(self._extract_m3u8_formats(
2610             m3u8_url, video_id, 'mp4', 'm3u8_native',
2611             m3u8_id='hls', fatal=False))
2612         return formats
2613
2614     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2615         query = compat_urlparse.urlparse(url).query
2616         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2617         mobj = re.search(
2618             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2619         url_base = mobj.group('url')
2620         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2621         formats = []
2622
2623         def manifest_url(manifest):
2624             m_url = '%s/%s' % (http_base_url, manifest)
2625             if query:
2626                 m_url += '?%s' % query
2627             return m_url
2628
2629         if 'm3u8' not in skip_protocols:
2630             formats.extend(self._extract_m3u8_formats(
2631                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2632                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2633         if 'f4m' not in skip_protocols:
2634             formats.extend(self._extract_f4m_formats(
2635                 manifest_url('manifest.f4m'),
2636                 video_id, f4m_id='hds', fatal=False))
2637         if 'dash' not in skip_protocols:
2638             formats.extend(self._extract_mpd_formats(
2639                 manifest_url('manifest.mpd'),
2640                 video_id, mpd_id='dash', fatal=False))
2641         if re.search(r'(?:/smil:|\.smil)', url_base):
2642             if 'smil' not in skip_protocols:
2643                 rtmp_formats = self._extract_smil_formats(
2644                     manifest_url('jwplayer.smil'),
2645                     video_id, fatal=False)
2646                 for rtmp_format in rtmp_formats:
2647                     rtsp_format = rtmp_format.copy()
2648                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2649                     del rtsp_format['play_path']
2650                     del rtsp_format['ext']
2651                     rtsp_format.update({
2652                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2653                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2654                         'protocol': 'rtsp',
2655                     })
2656                     formats.extend([rtmp_format, rtsp_format])
2657         else:
2658             for protocol in ('rtmp', 'rtsp'):
2659                 if protocol not in skip_protocols:
2660                     formats.append({
2661                         'url': '%s:%s' % (protocol, url_base),
2662                         'format_id': protocol,
2663                         'protocol': protocol,
2664                     })
2665         return formats
2666
2667     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2668         mobj = re.search(
2669             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2670             webpage)
2671         if mobj:
2672             try:
2673                 jwplayer_data = self._parse_json(mobj.group('options'),
2674                                                  video_id=video_id,
2675                                                  transform_source=transform_source)
2676             except ExtractorError:
2677                 pass
2678             else:
2679                 if isinstance(jwplayer_data, dict):
2680                     return jwplayer_data
2681
2682     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2683         jwplayer_data = self._find_jwplayer_data(
2684             webpage, video_id, transform_source=js_to_json)
2685         return self._parse_jwplayer_data(
2686             jwplayer_data, video_id, *args, **kwargs)
2687
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer configuration dict into an info dict.

        Returns a single entry when the playlist has exactly one item,
        otherwise a playlist result wrapping all entries.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    # Only subtitle-like tracks are of interest here
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A lone YouTube URL is delegated via url_transparent
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
2755
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build a list of format dicts from a jwplayer 'sources' list,
        deduplicating by URL and expanding manifest (m3u8/mpd/smil) sources."""
        urls = []  # source URLs already handled, to skip duplicates
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        # Split into RTMP base URL and play path
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
2819
2820     def _live_title(self, name):
2821         """ Generate the title for a live video """
2822         now = datetime.datetime.now()
2823         now_str = now.strftime('%Y-%m-%d %H:%M')
2824         return name + ' ' + now_str
2825
2826     def _int(self, v, name, fatal=False, **kwargs):
2827         res = int_or_none(v, **kwargs)
2828         if 'get_attr' in kwargs:
2829             print(getattr(v, kwargs['get_attr']))
2830         if res is None:
2831             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2832             if fatal:
2833                 raise ExtractorError(msg)
2834             else:
2835                 self._downloader.report_warning(msg)
2836         return res
2837
2838     def _float(self, v, name, fatal=False, **kwargs):
2839         res = float_or_none(v, **kwargs)
2840         if res is None:
2841             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2842             if fatal:
2843                 raise ExtractorError(msg)
2844             else:
2845                 self._downloader.report_warning(msg)
2846         return res
2847
2848     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2849                     path='/', secure=False, discard=False, rest={}, **kwargs):
2850         cookie = compat_cookiejar_Cookie(
2851             0, name, value, port, port is not None, domain, True,
2852             domain.startswith('.'), path, True, secure, expire_time,
2853             discard, None, None, rest)
2854         self._downloader.cookiejar.set_cookie(cookie)
2855
2856     def _get_cookies(self, url):
2857         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2858         req = sanitized_Request(url)
2859         self._downloader.cookiejar.add_cookie_header(req)
2860         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2861
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than the second (new) one. However, as of RFC6265 the newer cookie is
        what actually ends up in the cookie store.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            # Round-trip through bytes so the value decodes consistently on
            # both Python 2 and Python 3
            if sys.version_info[0] >= 3:
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            # The first regex match in the joined header value corresponds to
            # the first Set-Cookie occurrence of this cookie name
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                self._set_cookie(domain, cookie, value)
                break
2888
2889     def get_testcases(self, include_onlymatching=False):
2890         t = getattr(self, '_TEST', None)
2891         if t:
2892             assert not hasattr(self, '_TESTS'), \
2893                 '%s has _TEST and _TESTS' % type(self).__name__
2894             tests = [t]
2895         else:
2896             tests = getattr(self, '_TESTS', [])
2897         for t in tests:
2898             if not include_onlymatching and t.get('only_matching', False):
2899                 continue
2900             t['name'] = type(self).__name__[:-len('IE')]
2901             yield t
2902
2903     def is_suitable(self, age_limit):
2904         """ Test whether the extractor is generally suitable for the given
2905         age limit (i.e. pornographic sites are not, all others usually are) """
2906
2907         any_restricted = False
2908         for tc in self.get_testcases(include_onlymatching=False):
2909             if tc.get('playlist', []):
2910                 tc = tc['playlist'][0]
2911             is_restricted = age_restricted(
2912                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2913             if not is_restricted:
2914                 return True
2915             any_restricted = any_restricted or is_restricted
2916         return not any_restricted
2917
2918     def extract_subtitles(self, *args, **kwargs):
2919         if (self._downloader.params.get('writesubtitles', False)
2920                 or self._downloader.params.get('listsubtitles')):
2921             return self._get_subtitles(*args, **kwargs)
2922         return {}
2923
2924     def _get_subtitles(self, *args, **kwargs):
2925         raise NotImplementedError('This method must be implemented by subclasses')
2926
2927     @staticmethod
2928     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2929         """ Merge subtitle items for one language. Items with duplicated URLs
2930         will be dropped. """
2931         list1_urls = set([item['url'] for item in subtitle_list1])
2932         ret = list(subtitle_list1)
2933         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2934         return ret
2935
2936     @classmethod
2937     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2938         """ Merge two subtitle dictionaries, language by language. """
2939         ret = dict(subtitle_dict1)
2940         for lang in subtitle_dict2:
2941             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2942         return ret
2943
2944     def extract_automatic_captions(self, *args, **kwargs):
2945         if (self._downloader.params.get('writeautomaticsub', False)
2946                 or self._downloader.params.get('listsubtitles')):
2947             return self._get_automatic_captions(*args, **kwargs)
2948         return {}
2949
2950     def _get_automatic_captions(self, *args, **kwargs):
2951         raise NotImplementedError('This method must be implemented by subclasses')
2952
2953     def mark_watched(self, *args, **kwargs):
2954         if (self._downloader.params.get('mark_watched', False)
2955                 and (self._get_login_info()[0] is not None
2956                      or self._downloader.params.get('cookiefile') is not None)):
2957             self._mark_watched(*args, **kwargs)
2958
2959     def _mark_watched(self, *args, **kwargs):
2960         raise NotImplementedError('This method must be implemented by subclasses')
2961
2962     def geo_verification_headers(self):
2963         headers = {}
2964         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2965         if geo_verification_proxy:
2966             headers['Ytdl-request-proxy'] = geo_verification_proxy
2967         return headers
2968
2969     def _generic_id(self, url):
2970         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2971
2972     def _generic_title(self, url):
2973         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2974
2975
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (single result), a positive number, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare search key: fetch a single result
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp to the extractor's maximum, with a warning
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY