3 from __future__ import unicode_literals
13 from .common import InfoExtractor, SearchInfoExtractor
14 from ..jsinterp import JSInterpreter
15 from ..swfinterp import SWFInterpreter
16 from ..compat import (
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
22 compat_urllib_parse_urlparse,
23 compat_urllib_request,
31 get_element_by_attribute,
# Base class shared by the YouTube extractors: holds login endpoints and the
# credential machinery used by _login()/_real_initialize().
# NOTE(review): this dump bakes original line numbers into each line, strips
# indentation and omits many lines; all code below is kept byte-identical.
46 class YoutubeBaseInfoExtractor(InfoExtractor):
47 """Provide base functions for Youtube extractors"""
# Google account sign-in endpoint fetched and POSTed to by _login().
48 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
# Endpoint the two-factor (TOTP) code form is submitted to.
49 _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
# Machine name looked up in ~/.netrc for stored credentials.
50 _NETRC_MACHINE = 'youtube'
51 # If True it will raise an error if no login info is provided
52 _LOGIN_REQUIRED = False
# Force the YouTube UI language to English via the PREF cookie so later page
# scrapes see stable English markup.
# NOTE(review): fragmentary dump - the line issuing the cookie call
# (presumably self._set_cookie(...)) is missing between lines 54 and 56;
# confirm against the full file.
54 def _set_language(self):
56 '.youtube.com', 'PREF', 'f1=50000000&hl=en',
57 # YouTube sets the expire time to about two months
58 expire_time=time.time() + 2 * 30 * 24 * 3600)
# Map a list of video ids to url_result() entries dispatched to the Youtube
# extractor. NOTE(review): fragmentary dump - the surrounding list/return
# wrapper lines are missing; only the per-id expression is visible.
60 def _ids_to_results(self, ids):
62 self.url_result(vid_id, 'Youtube', video_id=vid_id)
# _login(): attempt Google-account sign-in, including the two-factor (TOTP)
# flow. NOTE(review): the `def _login(self):` line and the docstring opener
# are missing from this dump; the lines below are docstring and body
# fragments, kept byte-identical.
67 Attempt to log in to YouTube.
68 True is returned if successful or skipped.
69 False is returned if login failed.
71 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
73 (username, password) = self._get_login_info()
74 # No authentication to be performed
# NOTE(review): the `if username is None:` guard / early-return lines are
# missing here - the _LOGIN_REQUIRED raise presumably sits inside it.
76 if self._LOGIN_REQUIRED:
77 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
# Fetch the login page non-fatally so a network failure yields False
# instead of aborting extraction.
80 login_page = self._download_webpage(
81 self._LOGIN_URL, None,
82 note='Downloading login page',
83 errnote='unable to fetch login page', fatal=False)
84 if login_page is False:
# GALX is a CSRF-style token scraped from the login form and echoed back.
87 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
88 login_page, 'Login GALX parameter')
# Fragment of the first login form dict (login_form_strs); many fields
# (Email, Passwd, GALX, ...) are missing from this dump.
92 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
97 'PersistentCookie': 'yes',
99 'bgresponse': 'js_disabled',
100 'checkConnection': '',
101 'checkedDomains': 'youtube',
108 'service': 'youtube',
113 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
115 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
116 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
118 req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
119 login_results = self._download_webpage(
121 note='Logging in', errnote='unable to log in', fatal=False)
122 if login_results is False:
# Detect the "application-specific password" error page.
125 if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
126 raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
# --- Two-factor flow: triggered when the second-factor form is present ---
129 # TODO add SMS and phone call support - these require making a request and then prompting the user
131 if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
132 tfa_code = self._get_tfa_info()
# NOTE(review): the `if tfa_code is None:` guard around these warnings is
# missing from this dump.
135 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
136 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
139 # Unlike the first login form, secTok and timeStmp are both required for the TFA form
141 match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
143 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
144 secTok = match.group(1)
145 match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
147 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
148 timeStmp = match.group(1)
# Fragment of the TFA form dict (tfa_form_strs); opening/closing lines
# and some fields are missing from this dump.
151 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
153 'smsUserPin': tfa_code,
154 'smsVerifyPin': 'Verify',
156 'PersistentCookie': 'yes',
157 'checkConnection': '',
158 'checkedDomains': 'youtube',
161 'timeStmp': timeStmp,
162 'service': 'youtube',
# Same UTF-8-before-urlencode dance as for the first form (Python 2 quirk).
165 tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
166 tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
168 tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
169 tfa_results = self._download_webpage(
171 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
173 if tfa_results is False:
# Second-factor form still present -> the code was rejected/expired.
176 if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
177 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
179 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
180 self._downloader.report_warning('unable to log in - did the page structure change?')
182 if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
183 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
# Non-TFA path: the login form reappearing means the credentials were bad.
186 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
187 self._downloader.report_warning('unable to log in: bad username or password')
# One-time setup hook: runs _login() before any extraction.
# NOTE(review): fragmentary dump - the early `return`, the
# `self._set_language()` call and the failure handling after `_login()` are
# missing between/after the visible lines.
191 def _real_initialize(self):
192 if self._downloader is None:
195 if not self._login():
# YoutubeIE: the main YouTube single-video extractor.
# NOTE(review): this dump bakes original line numbers into each line and
# omits many lines (e.g. the `_formats = {` and `_TESTS = [` openers and
# various dict/test entries); everything below is kept byte-identical.
199 class YoutubeIE(YoutubeBaseInfoExtractor):
200 IE_DESC = 'YouTube.com'
# Verbose regex matching watch/embed/short/alias URLs; the 11-char video id
# is captured by the `([0-9A-Za-z_-]{11})` group near the end.
201 _VALID_URL = r"""(?x)^
203 (?:https?://|//) # http(s):// or protocol-independent URL
204 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
205 (?:www\.)?deturl\.com/www\.youtube\.com/|
206 (?:www\.)?pwnyoutube\.com/|
207 (?:www\.)?yourepeat\.com/|
208 tube\.majestyc\.net/|
209 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
210 (?:.*?\#/)? # handle anchor (#/) redirect urls
211 (?: # the various things that can precede the ID:
212 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
213 |(?: # or the v= param in all its forms
214 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
215 (?:\?|\#!?) # the params delimiter ? or # or #!
216 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
220 |youtu\.be/ # just youtu.be/xxxx
221 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
223 )? # all until now is optional -> you can pass the naked ID
224 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
225 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
226 (?(1).+)? # if we found the ID, everything can follow
# Regex extracting the `next_url` query parameter from redirect URLs.
228 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
# itag -> static format metadata (fragment of the `_formats` table; the
# opening `_formats = {` line is missing from this dump).
230 '5': {'ext': 'flv', 'width': 400, 'height': 240},
231 '6': {'ext': 'flv', 'width': 450, 'height': 270},
232 '13': {'ext': '3gp'},
233 '17': {'ext': '3gp', 'width': 176, 'height': 144},
234 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
235 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
236 '34': {'ext': 'flv', 'width': 640, 'height': 360},
237 '35': {'ext': 'flv', 'width': 854, 'height': 480},
238 '36': {'ext': '3gp', 'width': 320, 'height': 240},
239 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
240 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
241 '43': {'ext': 'webm', 'width': 640, 'height': 360},
242 '44': {'ext': 'webm', 'width': 854, 'height': 480},
243 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
244 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
245 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
246 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
# 3D formats (negative preference demotes them in format selection).
250 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
251 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
252 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
253 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
254 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
255 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
256 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
258 # Apple HTTP Live Streaming
259 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
260 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
261 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
262 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
263 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
264 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
# NOTE(review): itag 151 height 72 looks suspicious (other sources list 72p
# for this tiny live format) - verify before "fixing".
265 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
# DASH mp4 video-only formats.
268 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
269 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
270 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
271 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
272 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
273 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
274 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
275 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
276 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
277 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
278 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
# DASH m4a audio-only formats.
281 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
282 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
283 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
# DASH webm video-only formats (vp8/vp9).
286 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
287 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
288 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
289 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
290 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
291 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
292 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
293 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
294 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
295 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
296 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
297 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
298 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
299 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
300 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
301 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
302 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
303 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
304 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
305 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
306 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
# DASH webm audio-only formats.
309 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
310 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
312 # Dash webm audio with opus inside
313 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
314 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
315 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
# Pseudo-format marking RTMP streams.
318 '_rtmp': {'protocol': 'rtmp'},
# Test fixtures (fragment of `_TESTS`; the opening `_TESTS = [` and many
# dict-delimiter lines are missing from this dump).
324 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
328 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
329 'uploader': 'Philipp Hagemeister',
330 'uploader_id': 'phihag',
331 'upload_date': '20121002',
332 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
333 'categories': ['Science & Technology'],
334 'tags': ['youtube-dl'],
336 'dislike_count': int,
342 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
343 'note': 'Test generic use_cipher_signature video (#897)',
347 'upload_date': '20120506',
348 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
349 'description': 'md5:782e8651347686cba06e58f71ab51773',
350 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
351 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
352 'iconic ep', 'iconic', 'love', 'it'],
353 'uploader': 'Icona Pop',
354 'uploader_id': 'IconaPop',
358 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
359 'note': 'Test VEVO video with age protection (#956)',
363 'upload_date': '20130703',
364 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
365 'description': 'md5:64249768eec3bc4276236606ea996373',
366 'uploader': 'justintimberlakeVEVO',
367 'uploader_id': 'justintimberlakeVEVO',
371 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
372 'note': 'Embed-only video (#1746)',
376 'upload_date': '20120608',
377 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
378 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
379 'uploader': 'SET India',
380 'uploader_id': 'setindia'
384 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
385 'note': '256k DASH audio (format 141) via DASH manifest',
389 'upload_date': '20121002',
390 'uploader_id': '8KVIDEO',
392 'uploader': '8KVIDEO',
393 'title': 'UHDTV TEST 8K VIDEO.mp4'
396 'youtube_include_dash_manifest': True,
400 # DASH manifest with encrypted signature
402 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
406 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
407 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
408 'uploader': 'AfrojackVEVO',
409 'uploader_id': 'AfrojackVEVO',
410 'upload_date': '20131011',
413 'youtube_include_dash_manifest': True,
417 # JS player signature function name containing $
419 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
423 'title': 'Taylor Swift - Shake It Off',
424 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
425 'uploader': 'TaylorSwiftVEVO',
426 'uploader_id': 'TaylorSwiftVEVO',
427 'upload_date': '20140818',
430 'youtube_include_dash_manifest': True,
436 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
440 'upload_date': '20100909',
441 'uploader': 'The Amazing Atheist',
442 'uploader_id': 'TheAmazingAtheist',
443 'title': 'Burning Everyone\'s Koran',
444 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
447 # Normal age-gate video (No vevo, embed allowed)
449 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
453 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
454 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
455 'uploader': 'The Witcher',
456 'uploader_id': 'WitcherGame',
457 'upload_date': '20140605',
460 # Age-gate video with encrypted signature
462 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
466 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
467 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
468 'uploader': 'LloydVEVO',
469 'uploader_id': 'LloydVEVO',
470 'upload_date': '20110629',
473 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
475 'url': '__2ABJjxzNo',
479 'upload_date': '20100430',
480 'uploader_id': 'deadmau5',
481 'description': 'md5:12c56784b8032162bb936a5f76d55360',
482 'uploader': 'deadmau5',
483 'title': 'Deadmau5 - Some Chords (HD)',
485 'expected_warnings': [
486 'DASH manifest missing',
489 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
491 'url': 'lqQg6PlCWgI',
495 'upload_date': '20120731',
496 'uploader_id': 'olympic',
497 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
498 'uploader': 'Olympics',
499 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
502 'skip_download': 'requires avconv',
507 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
511 'stretched_ratio': 16 / 9.,
512 'upload_date': '20110310',
513 'uploader_id': 'AllenMeow',
514 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
516 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
519 # url_encoded_fmt_stream_map is empty string
521 'url': 'qEJwOuvDf7I',
525 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
527 'upload_date': '20150404',
528 'uploader_id': 'spbelect',
529 'uploader': 'Наблюдатели Петербурга',
532 'skip_download': 'requires avconv',
535 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
537 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
541 'title': 'md5:7b81415841e02ecd4313668cde88737a',
542 'description': 'md5:116377fd2963b81ec4ce64b542173306',
543 'upload_date': '20150625',
544 'uploader_id': 'dorappi2000',
545 'uploader': 'dorappi2000',
546 'formats': 'mincount:33',
549 # DASH manifest with segment_list
551 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
552 'md5': '8ce563a1d667b599d21064e982ab9e31',
556 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
557 'uploader': 'Airtek',
558 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
559 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
560 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
563 'youtube_include_dash_manifest': True,
564 'format': '135', # bestvideo
568 # Multifeed videos (multiple cameras), URL is for Main Camera
569 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
572 'title': 'teamPGP: Rocket League Noob Stream',
573 'description': 'md5:dc7872fb300e143831327f1bae3af010',
579 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
580 'description': 'md5:dc7872fb300e143831327f1bae3af010',
581 'upload_date': '20150721',
582 'uploader': 'Beer Games Beer',
583 'uploader_id': 'beergamesbeer',
589 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
590 'description': 'md5:dc7872fb300e143831327f1bae3af010',
591 'upload_date': '20150721',
592 'uploader': 'Beer Games Beer',
593 'uploader_id': 'beergamesbeer',
599 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
600 'description': 'md5:dc7872fb300e143831327f1bae3af010',
601 'upload_date': '20150721',
602 'uploader': 'Beer Games Beer',
603 'uploader_id': 'beergamesbeer',
609 'title': 'teamPGP: Rocket League Noob Stream (zim)',
610 'description': 'md5:dc7872fb300e143831327f1bae3af010',
611 'upload_date': '20150721',
612 'uploader': 'Beer Games Beer',
613 'uploader_id': 'beergamesbeer',
617 'skip_download': True,
def __init__(self, *args, **kwargs):
    """Create the extractor and set up an empty per-instance cache of
    signature-decryption functions, keyed by (player_url, signature id)."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    self._player_cache = dict()
def report_video_info_webpage_download(self, video_id):
    """Announce on screen that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce on screen that video information extraction has started."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce on screen that the requested format is not available.

    (The original docstring said "Report extracted video URL", which was a
    copy-paste leftover; the message below is what is actually reported.)
    """
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Announce on screen that the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)
def _signature_cache_id(self, example_sig):
    """Return a compact cache id for a signature: the dot-joined lengths of
    its dot-separated parts (e.g. 'abc.defg' -> '3.4')."""
    part_lengths = [len(part) for part in example_sig.split('.')]
    return '.'.join(compat_str(length) for length in part_lengths)
# Build (or load from the on-disk 'youtube-sigfuncs' cache) the function that
# decrypts an encrypted signature for the given player.
# NOTE(review): fragmentary dump - the re.match() assignment producing id_m,
# the download-note expression wrapper, the swf `code = urlh.read()` line and
# the final `return res` are missing.
646 def _extract_signature_function(self, video_id, player_url, example_sig):
648 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
651 raise ExtractorError('Cannot identify player %r' % player_url)
652 player_type = id_m.group('ext')
653 player_id = id_m.group('id')
655 # Read from filesystem cache
# Cache key includes the signature "shape" so different signature layouts
# get separate entries; basename assert guards against path traversal.
656 func_id = '%s_%s_%s' % (
657 player_type, player_id, self._signature_cache_id(example_sig))
658 assert os.path.basename(func_id) == func_id
660 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
661 if cache_spec is not None:
# Cached spec is a list of input indices: the decryption is replayed as a
# pure character permutation/selection.
662 return lambda s: ''.join(s[i] for i in cache_spec)
665 'Downloading player %s' % player_url
666 if self._downloader.params.get('verbose') else
667 'Downloading %s player %s' % (player_type, player_id)
669 if player_type == 'js':
670 code = self._download_webpage(
671 player_url, video_id,
673 errnote='Download of %s failed' % player_url)
674 res = self._parse_sig_js(code)
675 elif player_type == 'swf':
676 urlh = self._request_webpage(
677 player_url, video_id,
679 errnote='Download of %s failed' % player_url)
681 res = self._parse_sig_swf(code)
683 assert False, 'Invalid player type %r' % player_type
# Probe the extracted function with chr(0)..chr(n-1) so each output char's
# ordinal reveals which input index it came from.
685 test_string = ''.join(map(compat_chr, range(len(example_sig))))
686 cache_res = res(test_string)
687 cache_spec = [ord(c) for c in cache_res]
689 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
# Debug helper (--youtube-print-sig-code): render the probed signature
# permutation as human-readable Python slice expressions.
# NOTE(review): fragmentary dump - most of gen_sig_code's loop body
# (step detection, the final-element handling) is missing.
692 def _print_sig_code(self, func, example_sig):
693 def gen_sig_code(idxs):
# _genslice: format a contiguous index run as a Python slice expression.
694 def _genslice(start, end, step):
695 starts = '' if start == 0 else str(start)
696 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
697 steps = '' if step == 1 else (':%d' % step)
698 return 's[%s%s%s]' % (starts, ends, steps)
701 # Quelch pyflakes warnings - start will be set when step is set
702 start = '(Never used)'
703 for i, prev in zip(idxs[1:], idxs[:-1]):
707 yield _genslice(start, prev, step)
710 if i - prev in [-1, 1]:
719 yield _genslice(start, i, step)
# Probe func the same way _extract_signature_function does, then join the
# slice pieces into a copy-pasteable `if ...: return ...` snippet.
721 test_string = ''.join(map(compat_chr, range(len(example_sig))))
722 cache_res = func(test_string)
723 cache_spec = [ord(c) for c in cache_res]
724 expr_code = ' + '.join(gen_sig_code(cache_spec))
725 signature_id_tuple = '(%s)' % (
726 ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
727 code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
728 ' return %s\n') % (signature_id_tuple, expr_code)
729 self.to_screen('Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode):
    """Locate and extract the signature-decryption routine from the HTML5
    player's JavaScript source.

    Returns a callable mapping an encrypted signature string to the
    decrypted signature.
    """
    func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')
    interpreter = JSInterpreter(jscode)
    sig_function = interpreter.extract_function(func_name)

    def decrypt(s):
        return sig_function([s])

    return decrypt
def _parse_sig_swf(self, file_contents):
    """Extract the signature decipher routine from a Flash (SWF) player.

    Returns a callable mapping an encrypted signature string to the
    decrypted signature.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def decrypt(s):
        return decipher([s])

    return decrypt
# NOTE(review): fragmentary dump - the `try:` opener, the closing paren of
# the _extract_signature_function call and the `return func(s)` line are
# missing between the visible lines.
747 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
748 """Turn the encrypted s field into a working signature"""
750 if player_url is None:
751 raise ExtractorError('Cannot decrypt signature without player_url')
# Protocol-relative player URLs are normalized to https.
753 if player_url.startswith('//'):
754 player_url = 'https:' + player_url
# Memoize per (player_url, signature shape) in self._player_cache.
756 player_id = (player_url, self._signature_cache_id(s))
757 if player_id not in self._player_cache:
758 func = self._extract_signature_function(
759 video_id, player_url, s
761 self._player_cache[player_id] = func
762 func = self._player_cache[player_id]
763 if self._downloader.params.get('youtube_print_sig_code'):
764 self._print_sig_code(func, s)
# Any failure is wrapped with a traceback so users can report it.
766 except Exception as e:
767 tb = traceback.format_exc()
768 raise ExtractorError(
769 'Signature extraction failed: ' + tb, cause=e)
# Fetch the list of manually-created subtitle tracks via the timedtext API
# and build a {lang: [format dicts]} mapping.
# NOTE(review): fragmentary dump - the `try:` opener, the early return after
# the warning, parts of the params dict and the final return are missing.
771 def _get_subtitles(self, video_id, webpage):
773 subs_doc = self._download_xml(
774 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
775 video_id, note=False)
776 except ExtractorError as err:
# Best-effort: a failed listing only warns, it does not abort extraction.
777 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
781 for track in subs_doc.findall('track'):
782 lang = track.attrib['lang_code']
# First track per language wins; later duplicates are skipped.
783 if lang in sub_lang_list:
786 for ext in ['sbv', 'vtt', 'srt']:
787 params = compat_urllib_parse.urlencode({
791 'name': track.attrib['name'].encode('utf-8'),
794 'url': 'https://www.youtube.com/api/timedtext?' + params,
797 sub_lang_list[lang] = sub_formats
798 if not sub_lang_list:
799 self._downloader.report_warning('video doesn\'t have subtitles')
# NOTE(review): fragmentary dump - the `try:` opener, several early returns,
# parts of the urlencode dicts and the final return are missing.
803 def _get_automatic_captions(self, video_id, webpage):
804 """We need the webpage for getting the captions url, pass it as an
805 argument to speed up the process."""
806 self.to_screen('%s: Looking for automatic captions' % video_id)
# The ttsurl/timestamp needed for the caption API live in ytplayer.config,
# embedded as JSON in the watch page.
807 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
808 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
810 self._downloader.report_warning(err_msg)
812 player_config = json.loads(mobj.group(1))
814 args = player_config['args']
815 caption_url = args['ttsurl']
816 timestamp = args['timestamp']
817 # We get the available subtitles
818 list_params = compat_urllib_parse.urlencode({
823 list_url = caption_url + '&' + list_params
824 caption_list = self._download_xml(list_url, video_id)
# The 'track' node describes the original (ASR) language; 'target' nodes
# are the languages it can be auto-translated into.
825 original_lang_node = caption_list.find('track')
826 if original_lang_node is None:
827 self._downloader.report_warning('Video doesn\'t have automatic captions')
829 original_lang = original_lang_node.attrib['lang_code']
830 caption_kind = original_lang_node.attrib.get('kind', '')
833 for lang_node in caption_list.findall('target'):
834 sub_lang = lang_node.attrib['lang_code']
836 for ext in ['sbv', 'vtt', 'srt']:
837 params = compat_urllib_parse.urlencode({
838 'lang': original_lang,
842 'kind': caption_kind,
845 'url': caption_url + '&' + params,
848 sub_lang_list[sub_lang] = sub_formats
850 # An extractor error can be raise by the download process if there are
851 # no automatic captions but there are subtitles
852 except (KeyError, ExtractorError):
853 self._downloader.report_warning(err_msg)
# Extract the 11-character video id from a URL using _VALID_URL (group 2 is
# the id group of that pattern).
# NOTE(review): fragmentary dump - the @classmethod decorator, the
# `if mobj is None:` guard and the `return video_id` line are missing.
857 def extract_id(cls, url):
858 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
860 raise ExtractorError('Invalid URL: %s' % url)
861 video_id = mobj.group(2)
# Download an m3u8 manifest and build an {itag: stream_url} map from its
# non-comment lines.
# NOTE(review): fragmentary dump - the `url_map = {}` initialization, the
# second argument line of the filter() call and the final return are missing.
864 def _extract_from_m3u8(self, manifest_url, video_id):
867 def _get_urls(_manifest):
868 lines = _manifest.split('\n')
# Keep only non-empty lines that are not '#' comments/tags.
869 urls = filter(lambda l: l and not l.startswith('#'),
872 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
873 formats_urls = _get_urls(manifest)
874 for format_url in formats_urls:
# The itag is encoded in the stream URL path as '.../itag/<n>/...'.
875 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
876 url_map[itag] = format_url
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document for *video_id*."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
# Download and parse a DASH MPD manifest into youtube-dl format dicts,
# decrypting any encrypted signature embedded in the manifest URL first.
# NOTE(review): fragmentary dump - the `s = mobj.group(1)` line, the fatal=
# kwarg line, `formats = []` init, several guards, the `f = {`/`f.update(`
# wrappers around the format dict, and the final `return formats` are missing.
883 def _parse_dash_manifest(
884 self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
# The manifest URL may carry '/s/<sig>' which must be decrypted and
# re-embedded as '/signature/<sig>'.
885 def decrypt_sig(mobj):
887 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
888 return '/signature/%s' % dec_s
889 dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
890 dash_doc = self._download_xml(
891 dash_manifest_url, video_id,
892 note='Downloading DASH manifest',
893 errnote='Could not download DASH manifest',
896 if dash_doc is False:
# Walk every AdaptationSet/Representation in the MPD namespace.
900 for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
901 mime_type = a.attrib.get('mimeType')
902 for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
903 url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
906 if mime_type == 'text/vtt':
907 # TODO implement WebVTT downloading
909 elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
910 segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
911 format_id = r.attrib['id']
912 video_url = url_el.text
# contentLength attribute uses a YouTube-specific XML namespace.
913 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
915 'format_id': format_id,
917 'width': int_or_none(r.attrib.get('width')),
918 'height': int_or_none(r.attrib.get('height')),
# bandwidth is in bit/s; divided by 1000 to get kbit/s (tbr).
919 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
920 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
921 'filesize': filesize,
922 'fps': int_or_none(r.attrib.get('frameRate')),
# Segmented representations become http_dash_segments formats.
924 if segment_list is not None:
926 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
927 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
928 'protocol': 'http_dash_segments',
# Merge with an already-collected format of the same itag if present;
# otherwise enrich from the static _formats table.
931 existing_format = next(
933 if fo['format_id'] == format_id)
934 except StopIteration:
935 full_info = self._formats.get(format_id, {}).copy()
937 codecs = r.attrib.get('codecs')
# For audio-only/video-only entries the codecs attribute names the one
# present stream.
939 if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
940 full_info['vcodec'] = codecs
941 elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
942 full_info['acodec'] = codecs
943 formats.append(full_info)
945 existing_format.update(f)
947 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
950 def _real_extract(self, url):
# Main video extraction entry point: downloads the watch page, resolves the
# stream maps / DASH manifests, decrypts signatures if needed and assembles
# the final info dict.
# NOTE(review): this chunk is an elided view - several original source lines
# are missing between the numbered lines below; comments describe only what
# is visible here.
951 url, smuggled_data = unsmuggle_url(url, {})
# Honour --prefer-insecure by building plain-http URLs where possible.
954 'http' if self._downloader.params.get('prefer_insecure', False)
# Pull start/end times from both the URL fragment and the query string
# (e.g. ?t=1m30s, #start=10&end=20).
959 parsed_url = compat_urllib_parse_urlparse(url)
960 for component in [parsed_url.fragment, parsed_url.query]:
961 query = compat_parse_qs(component)
962 if start_time is None and 't' in query:
963 start_time = parse_duration(query['t'][0])
964 if start_time is None and 'start' in query:
965 start_time = parse_duration(query['start'][0])
966 if end_time is None and 'end' in query:
967 end_time = parse_duration(query['end'][0])
969 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
970 mobj = re.search(self._NEXT_URL_RE, url)
972 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
973 video_id = self.extract_id(url)
# Rebuild a canonical watch URL; bpctr=9999999999 bypasses the
# content-rating interstitial.
976 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
977 video_webpage = self._download_webpage(url, video_id)
979 # Attempt to extract SWF player URL
980 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
982 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Collect every distinct DASH manifest URL encountered; their itags are
# unioned further below.
988 def add_dash_mpd(video_info):
989 dash_mpd = video_info.get('dashmpd')
990 if dash_mpd and dash_mpd[0] not in dash_mpds:
991 dash_mpds.append(dash_mpd[0])
# Age-gated videos: fetch video info through the embed player, which can be
# viewed without logging in.
996 if re.search(r'player-age-gate-content">', video_webpage) is not None:
998 # We simulate the access to the video from www.youtube.com/v/{video_id}
999 # this can be viewed without login into Youtube
1000 url = proto + '://www.youtube.com/embed/%s' % video_id
1001 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1002 data = compat_urllib_parse.urlencode({
1003 'video_id': video_id,
1004 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1005 'sts': self._search_regex(
1006 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1008 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1009 video_info_webpage = self._download_webpage(
1010 video_info_url, video_id,
1011 note='Refetching age-gated info webpage',
1012 errnote='unable to download video info webpage')
1013 video_info = compat_parse_qs(video_info_webpage)
1014 add_dash_mpd(video_info)
1018 # Try looking directly into the video webpage
1019 mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
1021 json_code = uppercase_escape(mobj.group(1))
1022 ytplayer_config = json.loads(json_code)
1023 args = ytplayer_config['args']
1024 if args.get('url_encoded_fmt_stream_map'):
1025 # Convert to the same format returned by compat_parse_qs
1026 video_info = dict((k, [v]) for k, v in args.items())
1027 add_dash_mpd(video_info)
# NOTE(review): 'livestream' is compared to the string '1' but
# 'live_playback' to the integer 1 - presumably intentional (different
# source encodings), but worth confirming.
1028 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1030 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1031 # We also try looking in get_video_info since it may contain different dashmpd
1032 # URL that points to a DASH manifest with possibly different itag set (some itags
1033 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1034 # manifest pointed by get_video_info's dashmpd).
1035 # The general idea is to take a union of itags of both DASH manifests (for example
1036 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1037 self.report_video_info_webpage_download(video_id)
1038 for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
1040 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1041 % (proto, video_id, el_type))
1042 video_info_webpage = self._download_webpage(
1044 video_id, note=False,
1045 errnote='unable to download video info webpage')
1046 get_video_info = compat_parse_qs(video_info_webpage)
# Ciphered-signature responses carry dashmpd URLs we cannot use directly.
1047 if get_video_info.get('use_cipher_signature') != ['True']:
1048 add_dash_mpd(get_video_info)
1050 video_info = get_video_info
1051 if 'token' in get_video_info:
# Without a 'token' the video is unplayable - surface YouTube's stated
# reason (including geo-restriction details) when available.
1053 if 'token' not in video_info:
1054 if 'reason' in video_info:
1055 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1056 regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
1058 raise ExtractorError('YouTube said: This video is available in %s only' % (
1059 ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
1061 raise ExtractorError(
1062 'YouTube said: %s' % video_info['reason'][0],
1063 expected=True, video_id=video_id)
1065 raise ExtractorError(
1066 '"token" parameter not in video info for unknown reason',
# -- title --
1070 if 'title' in video_info:
1071 video_title = video_info['title'][0]
1073 self._downloader.report_warning('Unable to extract video title')
# -- description: prefer the full page element, fall back to the meta tag --
1077 video_description = get_element_by_id("eow-description", video_webpage)
1078 if video_description:
# Rewrite YouTube redirect links to their plain targets before cleaning
# the HTML.
1079 video_description = re.sub(r'''(?x)
1081 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1083 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1084 class="yt-uix-redirect-link"\s*>
1087 ''', r'\1', video_description)
1088 video_description = clean_html(video_description)
1090 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1092 video_description = unescapeHTML(fd_mobj.group(1))
1094 video_description = ''
# Multi-camera live events: expose each feed as a separate playlist entry
# unless --no-playlist (or smuggled force_singlefeed) was given.
1096 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1097 if not self._downloader.params.get('noplaylist'):
1100 multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
1101 for feed in multifeed_metadata_list.split(','):
1102 feed_data = compat_parse_qs(feed)
1104 '_type': 'url_transparent',
1105 'ie_key': 'Youtube',
1107 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1108 {'force_singlefeed': True}),
1109 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1111 feed_ids.append(feed_data['id'][0])
1113 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1114 % (', '.join(feed_ids), video_id))
1115 return self.playlist_result(entries, video_id, video_title, video_description)
1116 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
# -- view count --
1118 if 'view_count' in video_info:
1119 view_count = int(video_info['view_count'][0])
1123 # Check for "rental" videos
1124 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1125 raise ExtractorError('"rental" videos not supported')
1127 # Start extracting information
1128 self.report_information_extraction(video_id)
# -- uploader name / id --
1131 if 'author' not in video_info:
1132 raise ExtractorError('Unable to extract uploader name')
1133 video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
1136 video_uploader_id = None
1137 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1138 if mobj is not None:
1139 video_uploader_id = mobj.group(1)
1141 self._downloader.report_warning('unable to extract uploader nickname')
1144 # We try first to get a high quality image:
1145 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1146 video_webpage, re.DOTALL)
1147 if m_thumb is not None:
1148 video_thumbnail = m_thumb.group(1)
1149 elif 'thumbnail_url' not in video_info:
1150 self._downloader.report_warning('unable to extract video thumbnail')
1151 video_thumbnail = None
1152 else: # don't panic if we can't find it
1153 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
# -- upload date: meta tag first, then visible page text --
1156 upload_date = self._html_search_meta(
1157 'datePublished', video_webpage, 'upload date', default=None)
1159 upload_date = self._search_regex(
1160 [r'(?s)id="eow-date.*?>(.*?)</span>',
1161 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
1162 video_webpage, 'upload date', default=None)
# Normalize separators/whitespace before handing off to unified_strdate.
1164 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1165 upload_date = unified_strdate(upload_date)
# -- category (single-element list, or None when absent) --
1167 m_cat_container = self._search_regex(
1168 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1169 video_webpage, 'categories', default=None)
1171 category = self._html_search_regex(
1172 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1174 video_categories = None if category is None else [category]
1176 video_categories = None
# -- tags from og:video:tag meta properties --
1179 unescapeHTML(m.group('content'))
1180 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
# Scrape like/dislike button counters from the watch page markup;
# returns None when the counter is hidden or missing.
1182 def _extract_count(count_name):
1183 return str_to_int(self._search_regex(
1184 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1185 % re.escape(count_name),
1186 video_webpage, count_name, default=None))
1188 like_count = _extract_count('like')
1189 dislike_count = _extract_count('dislike')
# -- subtitles / automatic captions --
1192 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1193 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
# -- duration (seconds) --
1195 if 'length_seconds' not in video_info:
1196 self._downloader.report_warning('unable to extract video duration')
1197 video_duration = None
1199 video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
# -- annotations (only fetched when --write-annotations was given) --
1202 video_annotations = None
1203 if self._downloader.params.get('writeannotations', False):
1204 video_annotations = self._extract_annotations(video_id)
# Turn an itag -> URL map into youtube-dl format dicts, merging in the
# static per-itag metadata from self._formats.
1206 def _map_to_format_list(urlmap):
1208 for itag, video_real_url in urlmap.items():
1211 'url': video_real_url,
1212 'player_url': player_url,
1214 if itag in self._formats:
1215 dct.update(self._formats[itag])
# -- formats: rtmp / encoded stream maps (with signature decryption) / HLS --
1219 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1220 self.report_rtmp_download()
1222 'format_id': '_rtmp',
1224 'url': video_info['conn'][0],
1225 'player_url': player_url,
1227 elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
1228 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1229 if 'rtmpe%3Dyes' in encoded_url_map:
1230 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1232 for url_data_str in encoded_url_map.split(','):
1233 url_data = compat_parse_qs(url_data_str)
1234 if 'itag' not in url_data or 'url' not in url_data:
1236 format_id = url_data['itag'][0]
1237 url = url_data['url'][0]
# Plain signature ('sig') is appended directly; encrypted signature ('s')
# requires locating the JS/SWF player and running the decryption routine.
1239 if 'sig' in url_data:
1240 url += '&signature=' + url_data['sig'][0]
1241 elif 's' in url_data:
1242 encrypted_sig = url_data['s'][0]
1243 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1245 jsplayer_url_json = self._search_regex(
1247 embed_webpage if age_gate else video_webpage,
1248 'JS player URL (1)', default=None)
1249 if not jsplayer_url_json and not age_gate:
1250 # We need the embed website after all
1251 if embed_webpage is None:
1252 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1253 embed_webpage = self._download_webpage(
1254 embed_url, video_id, 'Downloading embed webpage')
1255 jsplayer_url_json = self._search_regex(
1256 ASSETS_RE, embed_webpage, 'JS player URL')
1258 player_url = json.loads(jsplayer_url_json)
1259 if player_url is None:
1260 player_url_json = self._search_regex(
1261 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1262 video_webpage, 'age gate player URL')
1263 player_url = json.loads(player_url_json)
# In verbose mode, report which player (flash or html5, plus version) the
# signature belongs to, to ease debugging of decryption failures.
1265 if self._downloader.params.get('verbose'):
1266 if player_url is None:
1267 player_version = 'unknown'
1268 player_desc = 'unknown'
1270 if player_url.endswith('swf'):
1271 player_version = self._search_regex(
1272 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1273 'flash player', fatal=False)
1274 player_desc = 'flash player %s' % player_version
1276 player_version = self._search_regex(
1277 r'html5player-([^/]+?)(?:/html5player)?\.js',
1279 'html5 player', fatal=False)
1280 player_desc = 'html5 player %s' % player_version
1282 parts_sizes = self._signature_cache_id(encrypted_sig)
1283 self.to_screen('{%s} signature length %s, %s' %
1284 (format_id, parts_sizes, player_desc))
1286 signature = self._decrypt_signature(
1287 encrypted_sig, video_id, player_url, age_gate)
1288 url += '&signature=' + signature
1289 if 'ratebypass' not in url:
1290 url += '&ratebypass=yes'
1291 url_map[format_id] = url
1292 formats = _map_to_format_list(url_map)
1293 elif video_info.get('hlsvp'):
1294 manifest_url = video_info['hlsvp'][0]
1295 url_map = self._extract_from_m3u8(manifest_url, video_id)
1296 formats = _map_to_format_list(url_map)
1298 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1300 # Look for the DASH manifest
1301 if self._downloader.params.get('youtube_include_dash_manifest', True):
1302 dash_mpd_fatal = True
1303 for dash_manifest_url in dash_mpds:
1306 for df in self._parse_dash_manifest(
1307 video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
1308 # Do not overwrite DASH format found in some previous DASH manifest
1309 if df['format_id'] not in dash_formats:
1310 dash_formats[df['format_id']] = df
1311 # Additional DASH manifests may end up in HTTP Error 403 therefore
1312 # allow them to fail without bug report message if we already have
1313 # some DASH manifest succeeded. This is temporary workaround to reduce
1314 # burst of bug reports until we figure out the reason and whether it
1315 # can be fixed at all.
1316 dash_mpd_fatal = False
1317 except (ExtractorError, KeyError) as e:
1318 self.report_warning(
1319 'Skipping DASH manifest: %r' % e, video_id)
1321 # Remove the formats we found through non-DASH, they
1322 # contain less info and it can be wrong, because we use
1323 # fixed values (for example the resolution). See
1324 # https://github.com/rg3/youtube-dl/issues/5774 for an
1326 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
1327 formats.extend(dash_formats.values())
1329 # Check for malformed aspect ratio
1330 stretched_m = re.search(
1331 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
# Tag video formats (not audio-only) with the corrected aspect ratio.
1334 ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
1336 if f.get('vcodec') != 'none':
1337 f['stretched_ratio'] = ratio
1339 self._sort_formats(formats)
# Assemble and return the final info dict.
1343 'uploader': video_uploader,
1344 'uploader_id': video_uploader_id,
1345 'upload_date': upload_date,
1346 'title': video_title,
1347 'thumbnail': video_thumbnail,
1348 'description': video_description,
1349 'categories': video_categories,
1351 'subtitles': video_subtitles,
1352 'automatic_captions': automatic_captions,
1353 'duration': video_duration,
1354 'age_limit': 18 if age_gate else 0,
1355 'annotations': video_annotations,
1356 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1357 'view_count': view_count,
1358 'like_count': like_count,
1359 'dislike_count': dislike_count,
1360 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
1363 'start_time': start_time,
1364 'end_time': end_time,
1368 class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
# Extractor for YouTube playlists. Mixes (RD.../UL... ids) are generated on
# the fly from a seed video and get a dedicated extraction path.
1369 IE_DESC = 'YouTube.com playlists'
1370 _VALID_URL = r"""(?x)(?:
1375 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
1376 \? (?:.*?&)*? (?:p|a|list)=
1380 (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
1381 # Top tracks, they can also include dots
1386 ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
1388 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
1389 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
1390 IE_NAME = 'youtube:playlist'
1392 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1394 'title': 'ytdl test PL',
1395 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1397 'playlist_count': 3,
1399 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1401 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1402 'title': 'YDL_Empty_List',
1404 'playlist_count': 0,
1406 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1407 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1409 'title': '29C3: Not my department',
1410 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1412 'playlist_count': 95,
1414 'note': 'issue #673',
1415 'url': 'PLBB231211A4F62143',
1417 'title': '[OLD]Team Fortress 2 (Class-based LP)',
1418 'id': 'PLBB231211A4F62143',
1420 'playlist_mincount': 26,
1422 'note': 'Large playlist',
1423 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1425 'title': 'Uploads from Cauchemar',
1426 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
1428 'playlist_mincount': 799,
1430 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1432 'title': 'YDL_safe_search',
1433 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1435 'playlist_count': 2,
1438 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1439 'playlist_count': 4,
1442 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1445 'note': 'Embedded SWF player',
1446 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1447 'playlist_count': 4,
1450 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
1453 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1454 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1456 'title': 'Uploads from Interstellar Movie',
1457 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
# NOTE(review): 'playlist_mincout' below looks like a typo for
# 'playlist_mincount' - as written the test framework would silently
# ignore this expectation.
1459 'playlist_mincout': 21,
# Log in (if credentials are available) before extracting, since private
# playlists require authentication.
1462 def _real_initialize(self):
1465 def _extract_mix(self, playlist_id):
1466 # The mixes are generated from a single video
1467 # the id of the playlist is just 'RD' + video_id
1468 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
1469 webpage = self._download_webpage(
1470 url, playlist_id, 'Downloading Youtube mix')
# Try several title markups, newest first.
1471 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1473 search_title('playlist-title') or
1474 search_title('title long-title') or
1475 search_title('title'))
1476 title = clean_html(title_span)
# Collect the mix's video ids from the sidebar links, preserving order.
1477 ids = orderedSet(re.findall(
1478 r'''(?xs)data-video-username=".*?".*?
1479 href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
1481 url_results = self._ids_to_results(ids)
1483 return self.playlist_result(url_results, playlist_id, title)
# Regular (non-mix) playlists: page through the playlist view, following the
# "Load more" widget until it disappears or returns no new content.
1485 def _extract_playlist(self, playlist_id):
1486 url = self._TEMPLATE_URL % playlist_id
1487 page = self._download_webpage(url, playlist_id)
# Surface YouTube alert boxes (private/nonexistent playlist, bad params).
1489 for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
1490 match = match.strip()
1491 # Check if the playlist exists or is private
1492 if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
1493 raise ExtractorError(
1494 'The playlist doesn\'t exist or is private, use --username or '
1495 '--netrc to access it.',
1497 elif re.match(r'[^<]*Invalid parameters[^<]*', match):
1498 raise ExtractorError(
1499 'Invalid parameters. Maybe URL is incorrect.',
1501 elif re.match(r'[^<]*Choose your language[^<]*', match):
1504 self.report_warning('Youtube gives an alert message: ' + match)
1506 # Extract the video ids from the playlist pages
1508 more_widget_html = content_html = page
1509 for page_num in itertools.count(1):
1510 matches = re.finditer(self._VIDEO_RE, content_html)
1511 # We remove the duplicates and the link with index 0
1512 # (it's not the first video of the playlist)
1513 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
1514 for vid_id in new_ids:
1515 yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
1517 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1521 more = self._download_json(
1522 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
1523 'Downloading page #%s' % page_num,
1524 transform_source=uppercase_escape)
1525 content_html = more['content_html']
1526 if not content_html.strip():
1527 # Some webpages show a "Load more" button but they don't
1530 more_widget_html = more['load_more_widget_html']
1532 playlist_title = self._html_search_regex(
1533 r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
1536 return self.playlist_result(_entries(), playlist_id, playlist_title)
1538 def _real_extract(self, url):
1539 # Extract playlist id
1540 mobj = re.match(self._VALID_URL, url)
1542 raise ExtractorError('Invalid URL: %s' % url)
1543 playlist_id = mobj.group(1) or mobj.group(2)
1545 # Check if it's a video-specific URL
1546 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1547 if 'v' in query_dict:
1548 video_id = query_dict['v'][0]
1549 if self._downloader.params.get('noplaylist'):
1550 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1551 return self.url_result(video_id, 'Youtube', video_id=video_id)
1553 self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
# Mixes (RD...) and "uploads like this" lists (UL...) need the mix path.
1555 if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
1556 # Mixes require a custom extraction process
1557 return self._extract_mix(playlist_id)
1559 return self._extract_playlist(playlist_id)
1562 class YoutubeChannelIE(InfoExtractor):
# Extractor for YouTube channel pages. Prefers redirecting to the channel's
# auto-generated uploads playlist (UU + channel id suffix), because paging
# through /videos is capped by YouTube; falls back to per-page extraction.
1563 IE_DESC = 'YouTube.com channels'
1564 _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
1565 _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
1566 IE_NAME = 'youtube:channel'
1568 'note': 'paginated channel',
1569 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
1570 'playlist_mincount': 91,
1572 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
# Parse (video_id, title) pairs out of channel-page HTML, de-duplicating ids
# and back-filling a missing title if a later occurrence provides one.
# NOTE(review): the decorator line is elided from this view - presumably a
# @staticmethod, since there is no self parameter; confirm in full source.
1577 def extract_videos_from_page(page):
1580 for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
1581 video_id = mobj.group('id')
1582 video_title = unescapeHTML(mobj.group('title'))
1584 idx = ids_in_page.index(video_id)
1585 if video_title and not titles_in_page[idx]:
1586 titles_in_page[idx] = video_title
1588 ids_in_page.append(video_id)
1589 titles_in_page.append(video_title)
1590 return zip(ids_in_page, titles_in_page)
1592 def _real_extract(self, url):
1593 channel_id = self._match_id(url)
1595 url = self._TEMPLATE_URL % channel_id
1597 # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
1598 # Workaround by extracting as a playlist if managed to obtain channel playlist URL
1599 # otherwise fallback on channel by page extraction
1600 channel_page = self._download_webpage(
1601 url + '?view=57', channel_id,
1602 'Downloading channel page', fatal=False)
1603 channel_playlist_id = self._html_search_meta(
1604 'channelId', channel_page, 'channel id', default=None)
1605 if not channel_playlist_id:
1606 channel_playlist_id = self._search_regex(
1607 r'data-channel-external-id="([^"]+)"',
1608 channel_page, 'channel id', default=None)
# UC<suffix> channel ids map to UU<suffix> "uploads" playlist ids.
1609 if channel_playlist_id and channel_playlist_id.startswith('UC'):
1610 playlist_id = 'UU' + channel_playlist_id[2:]
1611 return self.url_result(
1612 compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
# Fallback: page through the channel's /videos listing directly.
1614 channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
1615 autogenerated = re.search(r'''(?x)
1617 channel-header-autogenerated-label|
1618 yt-channel-title-autogenerated
1619 )[^"]*"''', channel_page) is not None
1622 # The videos are contained in a single page
1623 # the ajax pages can't be used, they are empty
1626 video_id, 'Youtube', video_id=video_id,
1627 video_title=video_title)
1628 for video_id, video_title in self.extract_videos_from_page(channel_page)]
1629 return self.playlist_result(entries, channel_id)
# Non-autogenerated channels: follow the "Load more" widget page by page.
1632 more_widget_html = content_html = channel_page
1633 for pagenum in itertools.count(1):
1635 for video_id, video_title in self.extract_videos_from_page(content_html):
1636 yield self.url_result(
1637 video_id, 'Youtube', video_id=video_id,
1638 video_title=video_title)
1641 r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
1646 more = self._download_json(
1647 'https://youtube.com/%s' % mobj.group('more'), channel_id,
1648 'Downloading page #%s' % (pagenum + 1),
1649 transform_source=uppercase_escape)
1650 content_html = more['content_html']
1651 more_widget_html = more['load_more_widget_html']
1653 return self.playlist_result(_entries(), channel_id)
1656 class YoutubeUserIE(YoutubeChannelIE):
# Extractor for user pages and the "ytuser:" keyword; inherits the whole
# channel extraction logic and only swaps the template URL.
1657 IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
1658 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
1659 _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
1660 IE_NAME = 'youtube:user'
1663 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
1664 'playlist_mincount': 320,
1666 'title': 'TheLinuxFoundation',
1669 'url': 'ytuser:phihag',
1670 'only_matching': True,
# NOTE(review): the @classmethod decorator for suitable() is elided from
# this view; it takes cls, so it is presumably a classmethod.
1674 def suitable(cls, url):
1675 # Don't return True if the url can be extracted with other youtube
1676 # extractors: this regex is too permissive and it would match.
1677 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1678 if any(ie.suitable(url) for ie in other_ies):
1681 return super(YoutubeUserIE, cls).suitable(url)
1684 class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
# Search extractor for the "ytsearch" keyword; pages through the AJAX
# search-results endpoint until enough videos are collected.
1685 IE_DESC = 'YouTube.com searches'
1686 # there doesn't appear to be a real limit, for example if you search for
1687 # 'python' you get more than 8.000.000 results
1688 _MAX_RESULTS = float('inf')
1689 IE_NAME = 'youtube:search'
1690 _SEARCH_KEY = 'ytsearch'
# Extra query parameters merged into each results request; subclasses
# (e.g. the by-date variant) override this.
1691 _EXTRA_QUERY_ARGS = {}
1694 def _get_n_results(self, query, n):
1695 """Get a specified number of results for a query"""
1700 for pagenum in itertools.count(1):
1702 'search_query': query.encode('utf-8'),
1706 url_query.update(self._EXTRA_QUERY_ARGS)
1707 result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
1708 data = self._download_json(
1709 result_url, video_id='query "%s"' % query,
1710 note='Downloading page %s' % pagenum,
1711 errnote='Unable to download API page')
1712 html_content = data[1]['body']['content']
# The results page embeds a "search-message" element when nothing matched.
1714 if 'class="search-message' in html_content:
1715 raise ExtractorError(
1716 '[youtube] No video results', expected=True)
1718 new_videos = self._ids_to_results(orderedSet(re.findall(
1719 r'href="/watch\?v=(.{11})', html_content)))
1720 videos += new_videos
# Stop when a page yields nothing new or we have gathered enough results.
1721 if not new_videos or len(videos) > limit:
1726 return self.playlist_result(videos, query)
1729 class YoutubeSearchDateIE(YoutubeSearchIE):
# "ytsearchdate" variant: identical to YoutubeSearchIE except results are
# ordered newest-first via the search_sort query argument.
1730 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
1731 _SEARCH_KEY = 'ytsearchdate'
1732 IE_DESC = 'YouTube.com searches, newest videos first'
1733 _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
1736 class YoutubeSearchURLIE(InfoExtractor):
# Extractor for already-formed /results?search_query=... URLs: scrapes the
# first results page and returns it as a playlist.
1737 IE_DESC = 'YouTube.com search URLs'
1738 IE_NAME = 'youtube:search_url'
1739 _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
1741 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
1742 'playlist_mincount': 5,
1744 'title': 'youtube-dl test video',
1748 def _real_extract(self, url):
1749 mobj = re.match(self._VALID_URL, url)
1750 query = compat_urllib_parse_unquote_plus(mobj.group('query'))
1752 webpage = self._download_webpage(url, query)
# Narrow down to the result list, then split it into per-item snippets.
1753 result_code = self._search_regex(
1754 r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')
1756 part_codes = re.findall(
1757 r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
1759 for part_code in part_codes:
1760 part_title = self._html_search_regex(
1761 [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
1762 part_url_snippet = self._html_search_regex(
1763 r'(?s)href="([^"]+)"', part_code, 'item URL')
1764 part_url = compat_urlparse.urljoin(
1765 'https://www.youtube.com/', part_url_snippet)
1769 'title': part_title,
# Return everything as one playlist titled with the search query.
1773 '_type': 'playlist',
1779 class YoutubeShowIE(InfoExtractor):
# Extractor for /show/ pages: each season of the show is a separate
# playlist, returned together as one playlist of playlists.
1780 IE_DESC = 'YouTube.com (multi-season) shows'
1781 _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
1782 IE_NAME = 'youtube:show'
1784 'url': 'http://www.youtube.com/show/airdisasters',
1785 'playlist_mincount': 3,
1787 'id': 'airdisasters',
1788 'title': 'Air Disasters',
1792 def _real_extract(self, url):
1793 mobj = re.match(self._VALID_URL, url)
1794 playlist_id = mobj.group('id')
1795 webpage = self._download_webpage(
1796 url, playlist_id, 'Downloading show webpage')
1797 # There's one playlist for each season of the show
1798 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1799 self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
# Delegate each season to the playlist extractor.
1802 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
1803 for season in m_seasons
1805 title = self._og_search_title(webpage, fatal=False)
1808 '_type': 'playlist',
1815 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1817 Base class for feed extractors
1818 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
# Feeds (watch history, subscriptions, recommended, ...) are personal, so
# login is mandatory.
1820 _LOGIN_REQUIRED = True
# IE_NAME is derived from the subclass's feed name.
1824 return 'youtube:%s' % self._FEED_NAME
1826 def _real_initialize(self):
1829 def _real_extract(self, url):
1830 page = self._download_webpage(
1831 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
1833 # The extraction process is the same as for playlists, but the regex
1834 # for the video ids doesn't contain an index
1836 more_widget_html = content_html = page
1837 for page_num in itertools.count(1):
1838 matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
1840 # 'recommended' feed has infinite 'load more' and each new portion spins
1841 # the same videos in (sometimes) slightly different order, so we'll check
1842 # for unicity and break when portion has no new videos
1843 new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
1849 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1853 more = self._download_json(
1854 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
1855 'Downloading page #%s' % page_num,
1856 transform_source=uppercase_escape)
1857 content_html = more['content_html']
1858 more_widget_html = more['load_more_widget_html']
1860 return self.playlist_result(
1861 self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
1864 class YoutubeWatchLaterIE(YoutubePlaylistIE):
# The "Watch Later" list is just the special playlist 'WL', so reuse the
# playlist extractor's machinery directly.
1865 IE_NAME = 'youtube:watchlater'
1866 IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
1867 _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
1869 _TESTS = [] # override PlaylistIE tests
1871 def _real_extract(self, url):
1872 return self._extract_playlist('WL')
1875 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
# Favourites are exposed as a regular playlist whose id has to be scraped
# from the logged-in my_favorites page, then delegated to YoutubePlaylistIE.
1876 IE_NAME = 'youtube:favorites'
1877 IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
1878 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
1879 _LOGIN_REQUIRED = True
1881 def _real_extract(self, url):
1882 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
1883 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
1884 return self.url_result(playlist_id, 'YoutubePlaylist')
1887 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
# Feed extractor for the personal "recommended" feed (":ytrec").
1888 IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
1889 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1890 _FEED_NAME = 'recommended'
1891 _PLAYLIST_TITLE = 'Youtube Recommended videos'
1894 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
# Feed extractor for the personal subscriptions feed (":ytsubs").
1895 IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
1896 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1897 _FEED_NAME = 'subscriptions'
1898 _PLAYLIST_TITLE = 'Youtube Subscriptions'
1901 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
# Feed extractor for the personal watch-history feed (":ythistory").
1902 IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
# NOTE(review): unlike the sibling classes, _VALID_URL here is not a raw
# string; the '\.' escapes still work but raise DeprecationWarning on
# modern Python - consider adding the r prefix.
1903 _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
1904 _FEED_NAME = 'history'
1905 _PLAYLIST_TITLE = 'Youtube History'
1908 class YoutubeTruncatedURLIE(InfoExtractor):
# Catch-all for watch URLs whose v= parameter was lost (typically because
# the shell ate an unquoted '&'); always fails with a helpful message.
1909 IE_NAME = 'youtube:truncated_url'
1910 IE_DESC = False # Do not list
1911 _VALID_URL = r'''(?x)
1913 (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
1916 annotation_id=annotation_[^&]+|
1921 attribution_link\?a=[^&]+
1927 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
1928 'only_matching': True,
1930 'url': 'http://www.youtube.com/watch?',
1931 'only_matching': True,
1933 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
1934 'only_matching': True,
1936 'url': 'https://www.youtube.com/watch?feature=foo',
1937 'only_matching': True,
1939 'url': 'https://www.youtube.com/watch?hl=en-GB',
1940 'only_matching': True,
# Always raises: these URLs can never be extracted, so explain the likely
# shell-quoting mistake to the user instead.
1943 def _real_extract(self, url):
1944 raise ExtractorError(
1945 'Did you forget to quote the URL? Remember that & is a meta '
1946 'character in most shells, so you want to put the URL in quotes, '
1948 '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
1949 ' or simply youtube-dl BaW_jenozKc .',
1953 class YoutubeTruncatedIDIE(InfoExtractor):
1954 IE_NAME = 'youtube:truncated_id'
1955 IE_DESC = False # Do not list
1956 _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
1959 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
1960 'only_matching': True,
1963 def _real_extract(self, url):
1964 video_id = self._match_id(url)
1965 raise ExtractorError(
1966 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),