youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import traceback
  11
  12 from .common import InfoExtractor, SearchInfoExtractor
  13 from .subtitles import SubtitlesInfoExtractor
  14 from ..jsinterp import JSInterpreter
  15 from ..swfinterp import SWFInterpreter
  16 from ..utils import (
  17     compat_chr,
  18     compat_parse_qs,
  19     compat_urllib_parse,
  20     compat_urllib_request,
  21     compat_urlparse,
  22     compat_str,
  23
  24     clean_html,
  25     get_element_by_id,
  26     get_element_by_attribute,
  27     ExtractorError,
  28     int_or_none,
  29     PagedList,
  30     unescapeHTML,
  31     unified_strdate,
  32     orderedSet,
  33     uppercase_escape,
  34 )
  35
  36 class YoutubeBaseInfoExtractor(InfoExtractor):
  37     """Provide base functions for Youtube extractors"""
  38     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  39     _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
  40     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
  41     _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
  42     _NETRC_MACHINE = 'youtube'
  43     # If True it will raise an error if no login info is provided
  44     _LOGIN_REQUIRED = False
  45
  46     def _set_language(self):
  47         return bool(self._download_webpage(
  48             self._LANG_URL, None,
  49             note=u'Setting language', errnote='unable to set language',
  50             fatal=False))
  51
  52     def _login(self):
  53         """
  54         Attempt to log in to YouTube.
  55         True is returned if successful or skipped.
  56         False is returned if login failed.
  57
  58         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  59         """
  60         (username, password) = self._get_login_info()
  61         # No authentication to be performed
  62         if username is None:
  63             if self._LOGIN_REQUIRED:
  64                 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  65             return True
  66
  67         login_page = self._download_webpage(
  68             self._LOGIN_URL, None,
  69             note=u'Downloading login page',
  70             errnote=u'unable to fetch login page', fatal=False)
  71         if login_page is False:
  72             return
  73
  74         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  75                                   login_page, 'Login GALX parameter')
  76
  77         # Log in
  78         login_form_strs = {
  79                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  80                 'Email': username,
  81                 'GALX': galx,
  82                 'Passwd': password,
  83
  84                 'PersistentCookie': 'yes',
  85                 '_utf8': '霱',
  86                 'bgresponse': 'js_disabled',
  87                 'checkConnection': '',
  88                 'checkedDomains': 'youtube',
  89                 'dnConn': '',
  90                 'pstMsg': '0',
  91                 'rmShown': '1',
  92                 'secTok': '',
  93                 'signIn': 'Sign in',
  94                 'timeStmp': '',
  95                 'service': 'youtube',
  96                 'uilel': '3',
  97                 'hl': 'en_US',
  98         }
  99
 100         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 101         # chokes on unicode
 102         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 103         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 104
 105         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 106         login_results = self._download_webpage(
 107             req, None,
 108             note=u'Logging in', errnote=u'unable to log in', fatal=False)
 109         if login_results is False:
 110             return False
 111
 112         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 113             raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 114
 115         # Two-Factor
 116         # TODO add SMS and phone call support - these require making a request and then prompting the user
 117
 118         if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 119             tfa_code = self._get_tfa_info()
 120
 121             if tfa_code is None:
 122                 self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
 123                 self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 124                 return False
 125
 126             # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 127
 128             match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 129             if match is None:
 130                 self._downloader.report_warning(u'Failed to get secTok - did the page structure change?')
 131             secTok = match.group(1)
 132             match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 133             if match is None:
 134                 self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?')
 135             timeStmp = match.group(1)
 136
 137             tfa_form_strs = {
 138                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 139                 'smsToken': '',
 140                 'smsUserPin': tfa_code,
 141                 'smsVerifyPin': 'Verify',
 142
 143                 'PersistentCookie': 'yes',
 144                 'checkConnection': '',
 145                 'checkedDomains': 'youtube',
 146                 'pstMsg': '1',
 147                 'secTok': secTok,
 148                 'timeStmp': timeStmp,
 149                 'service': 'youtube',
 150                 'hl': 'en_US',
 151             }
 152             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items())
 153             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 154
 155             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 156             tfa_results = self._download_webpage(
 157                 tfa_req, None,
 158                 note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)
 159
 160             if tfa_results is False:
 161                 return False
 162
 163             if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 164                 self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
 165                 return False
 166             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 167                 self._downloader.report_warning(u'unable to log in - did the page structure change?')
 168                 return False
 169             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 170                 self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 171                 return False
 172
 173         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 174             self._downloader.report_warning(u'unable to log in: bad username or password')
 175             return False
 176         return True
 177
 178     def _confirm_age(self):
 179         age_form = {
 180             'next_url': '/',
 181             'action_confirm': 'Confirm',
 182         }
 183         req = compat_urllib_request.Request(self._AGE_URL,
 184             compat_urllib_parse.urlencode(age_form).encode('ascii'))
 185
 186         self._download_webpage(
 187             req, None,
 188             note=u'Confirming age', errnote=u'Unable to confirm age')
 189         return True
 190
 191     def _real_initialize(self):
 192         if self._downloader is None:
 193             return
 194         if not self._set_language():
 195             return
 196         if not self._login():
 197             return
 198         self._confirm_age()
 199
 200
 201 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 202     IE_DESC = 'YouTube.com'
 203     _VALID_URL = r"""(?x)^
 204                      (
 205                          (?:https?://|//)                                    # http(s):// or protocol-independent URL
 206                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 207                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 208                             (?:www\.)?pwnyoutube\.com/|
 209                             (?:www\.)?yourepeat\.com/|
 210                             tube\.majestyc\.net/|
 211                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 212                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 213                          (?:                                                  # the various things that can precede the ID:
 214                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 215                              |(?:                                             # or the v= param in all its forms
 216                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 217                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 218                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 219                                  v=
 220                              )
 221                          ))
 222                          |youtu\.be/                                          # just youtu.be/xxxx
 223                          |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 224                          )
 225                      )?                                                       # all until now is optional -> you can pass the naked ID
 226                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 227                      (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
 228                      (?(1).+)?                                                # if we found the ID, everything can follow
 229                      $"""
 230     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 231     _formats = {
 232         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 233         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 234         '13': {'ext': '3gp'},
 235         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 236         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 237         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 238         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 239         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 240         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 241         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 242         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 243         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 244         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 245         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 246         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 247
 248
 249         # 3d videos
 250         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 251         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 252         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 253         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 254         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 255         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 256         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 257
 258         # Apple HTTP Live Streaming
 259         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 260         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 261         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 262         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 263         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 264         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 265         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 266
 267         # DASH mp4 video
 268         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 269         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 270         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 271         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 272         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 273         '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 274         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 275         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 276
 277         # Dash mp4 audio
 278         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
 279         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
 280         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
 281
 282         # Dash webm
 283         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 284         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 285         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 286         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 287         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 288         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 289         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 290         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 291         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 292         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 293         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 294         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 295         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 296         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 297         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 298
 299         # Dash webm audio
 300         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
 301         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 302
 303         # RTMP (unnamed)
 304         '_rtmp': {'protocol': 'rtmp'},
 305     }
 306
 307     IE_NAME = 'youtube'
 308     _TESTS = [
 309         {
 310             u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
 311             u"file":  u"BaW_jenozKc.mp4",
 312             u"info_dict": {
 313                 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
 314                 u"uploader": u"Philipp Hagemeister",
 315                 u"uploader_id": u"phihag",
 316                 u"upload_date": u"20121002",
 317                 u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
 318                 u"categories": [u'Science & Technology'],
 319                 'like_count': int,
 320                 'dislike_count': int,
 321             }
 322         },
 323         {
 324             u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
 325             u"file":  u"UxxajLWwzqY.mp4",
 326             u"note": u"Test generic use_cipher_signature video (#897)",
 327             u"info_dict": {
 328                 u"upload_date": u"20120506",
 329                 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
 330                 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
 331                 u"uploader": u"Icona Pop",
 332                 u"uploader_id": u"IconaPop"
 333             }
 334         },
 335         {
 336             u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
 337             u"file":  u"07FYdnEawAQ.mp4",
 338             u"note": u"Test VEVO video with age protection (#956)",
 339             u"info_dict": {
 340                 u"upload_date": u"20130703",
 341                 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
 342                 u"description": u"md5:64249768eec3bc4276236606ea996373",
 343                 u"uploader": u"justintimberlakeVEVO",
 344                 u"uploader_id": u"justintimberlakeVEVO"
 345             }
 346         },
 347         {
 348             u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
 349             u"file":  u"yZIXLfi8CZQ.mp4",
 350             u"note": u"Embed-only video (#1746)",
 351             u"info_dict": {
 352                 u"upload_date": u"20120608",
 353                 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
 354                 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
 355                 u"uploader": u"SET India",
 356                 u"uploader_id": u"setindia"
 357             }
 358         },
 359         {
 360             u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
 361             u"file": u"a9LDPn-MO4I.m4a",
 362             u"note": u"256k DASH audio (format 141) via DASH manifest",
 363             u"info_dict": {
 364                 u"upload_date": "20121002",
 365                 u"uploader_id": "8KVIDEO",
 366                 u"description": '',
 367                 u"uploader": "8KVIDEO",
 368                 u"title": "UHDTV TEST 8K VIDEO.mp4"
 369             },
 370             u"params": {
 371                 u"youtube_include_dash_manifest": True,
 372                 u"format": "141",
 373             },
 374         },
 375         # DASH manifest with encrypted signature
 376         {
 377             'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 378             'info_dict': {
 379                 'id': 'IB3lcPjvWLA',
 380                 'ext': 'm4a',
 381                 'title': 'Afrojack - The Spark ft. Spree Wilson',
 382                 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
 383                 'uploader': 'AfrojackVEVO',
 384                 'uploader_id': 'AfrojackVEVO',
 385                 'upload_date': '20131011',
 386             },
 387             u"params": {
 388                 'youtube_include_dash_manifest': True,
 389                 'format': '141',
 390             },
 391         },
 392     ]
 393
 394     def __init__(self, *args, **kwargs):
 395         super(YoutubeIE, self).__init__(*args, **kwargs)
 396         self._player_cache = {}
 397
 398     def report_video_info_webpage_download(self, video_id):
 399         """Report attempt to download video info webpage."""
 400         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 401
 402     def report_information_extraction(self, video_id):
 403         """Report attempt to extract video information."""
 404         self.to_screen(u'%s: Extracting video information' % video_id)
 405
 406     def report_unavailable_format(self, video_id, format):
 407         """Report extracted video URL."""
 408         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 409
 410     def report_rtmp_download(self):
 411         """Indicate the download will use the RTMP protocol."""
 412         self.to_screen(u'RTMP download detected')
 413
 414     def _signature_cache_id(self, example_sig):
 415         """ Return a string representation of a signature """
 416         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 417
 418     def _extract_signature_function(self, video_id, player_url, example_sig):
 419         id_m = re.match(
 420             r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
 421             player_url)
 422         if not id_m:
 423             raise ExtractorError('Cannot identify player %r' % player_url)
 424         player_type = id_m.group('ext')
 425         player_id = id_m.group('id')
 426
 427         # Read from filesystem cache
 428         func_id = '%s_%s_%s' % (
 429             player_type, player_id, self._signature_cache_id(example_sig))
 430         assert os.path.basename(func_id) == func_id
 431
 432         cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id)
 433         if cache_spec is not None:
 434             return lambda s: ''.join(s[i] for i in cache_spec)
 435
 436         if player_type == 'js':
 437             code = self._download_webpage(
 438                 player_url, video_id,
 439                 note=u'Downloading %s player %s' % (player_type, player_id),
 440                 errnote=u'Download of %s failed' % player_url)
 441             res = self._parse_sig_js(code)
 442         elif player_type == 'swf':
 443             urlh = self._request_webpage(
 444                 player_url, video_id,
 445                 note=u'Downloading %s player %s' % (player_type, player_id),
 446                 errnote=u'Download of %s failed' % player_url)
 447             code = urlh.read()
 448             res = self._parse_sig_swf(code)
 449         else:
 450             assert False, 'Invalid player type %r' % player_type
 451
 452         if cache_spec is None:
 453             test_string = ''.join(map(compat_chr, range(len(example_sig))))
 454             cache_res = res(test_string)
 455             cache_spec = [ord(c) for c in cache_res]
 456
 457         self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec)
 458         return res
 459
 460     def _print_sig_code(self, func, example_sig):
 461         def gen_sig_code(idxs):
 462             def _genslice(start, end, step):
 463                 starts = '' if start == 0 else str(start)
 464                 ends = (u':%d' % (end+step)) if end + step >= 0 else ':'
 465                 steps = '' if step == 1 else (u':%d' % step)
 466                 return 's[%s%s%s]' % (starts, ends, steps)
 467
 468             step = None
 469             start = '(Never used)'  # Quelch pyflakes warnings - start will be
 470                                     # set as soon as step is set
 471             for i, prev in zip(idxs[1:], idxs[:-1]):
 472                 if step is not None:
 473                     if i - prev == step:
 474                         continue
 475                     yield _genslice(start, prev, step)
 476                     step = None
 477                     continue
 478                 if i - prev in [-1, 1]:
 479                     step = i - prev
 480                     start = prev
 481                     continue
 482                 else:
 483                     yield 's[%d]' % prev
 484             if step is None:
 485                 yield 's[%d]' % i
 486             else:
 487                 yield _genslice(start, i, step)
 488
 489         test_string = ''.join(map(compat_chr, range(len(example_sig))))
 490         cache_res = func(test_string)
 491         cache_spec = [ord(c) for c in cache_res]
 492         expr_code = ' + '.join(gen_sig_code(cache_spec))
 493         signature_id_tuple = '(%s)' % (
 494             ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
 495         code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
 496                 '    return %s\n') % (signature_id_tuple, expr_code)
 497         self.to_screen(u'Extracted signature function:\n' + code)
 498
 499     def _parse_sig_js(self, jscode):
 500         funcname = self._search_regex(
 501             r'signature=([$a-zA-Z]+)', jscode,
 502              'Initial JS player signature function name')
 503
 504         jsi = JSInterpreter(jscode)
 505         initial_function = jsi.extract_function(funcname)
 506         return lambda s: initial_function([s])
 507
 508     def _parse_sig_swf(self, file_contents):
 509         swfi = SWFInterpreter(file_contents)
 510         TARGET_CLASSNAME = 'SignatureDecipher'
 511         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 512         initial_function = swfi.extract_function(searched_class, 'decipher')
 513         return lambda s: initial_function([s])
 514
 515     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 516         """Turn the encrypted s field into a working signature"""
 517
 518         if player_url is None:
 519             raise ExtractorError(u'Cannot decrypt signature without player_url')
 520
 521         if player_url.startswith(u'//'):
 522             player_url = 'https:' + player_url
 523         try:
 524             player_id = (player_url, self._signature_cache_id(s))
 525             if player_id not in self._player_cache:
 526                 func = self._extract_signature_function(
 527                     video_id, player_url, s
 528                 )
 529                 self._player_cache[player_id] = func
 530             func = self._player_cache[player_id]
 531             if self._downloader.params.get('youtube_print_sig_code'):
 532                 self._print_sig_code(func, s)
 533             return func(s)
 534         except Exception as e:
 535             tb = traceback.format_exc()
 536             raise ExtractorError(
 537                 'Signature extraction failed: ' + tb, cause=e)
 538
 539     def _get_available_subtitles(self, video_id, webpage):
 540         try:
 541             sub_list = self._download_webpage(
 542                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 543                 video_id, note=False)
 544         except ExtractorError as err:
 545             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 546             return {}
 547         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 548
 549         sub_lang_list = {}
 550         for l in lang_list:
 551             lang = l[1]
 552             if lang in sub_lang_list:
 553                 continue
 554             params = compat_urllib_parse.urlencode({
 555                 'lang': lang,
 556                 'v': video_id,
 557                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 558                 'name': unescapeHTML(l[0]).encode('utf-8'),
 559             })
 560             url = 'https://www.youtube.com/api/timedtext?' + params
 561             sub_lang_list[lang] = url
 562         if not sub_lang_list:
 563             self._downloader.report_warning(u'video doesn\'t have subtitles')
 564             return {}
 565         return sub_lang_list
 566
 567     def _get_available_automatic_caption(self, video_id, webpage):
 568         """We need the webpage for getting the captions url, pass it as an
 569            argument to speed up the process."""
 570         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 571         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 572         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 573         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 574         if mobj is None:
 575             self._downloader.report_warning(err_msg)
 576             return {}
 577         player_config = json.loads(mobj.group(1))
 578         try:
 579             args = player_config[u'args']
 580             caption_url = args[u'ttsurl']
 581             timestamp = args[u'timestamp']
 582             # We get the available subtitles
 583             list_params = compat_urllib_parse.urlencode({
 584                 'type': 'list',
 585                 'tlangs': 1,
 586                 'asrs': 1,
 587             })
 588             list_url = caption_url + '&' + list_params
 589             caption_list = self._download_xml(list_url, video_id)
 590             original_lang_node = caption_list.find('track')
 591             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
 592                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
 593                 return {}
 594             original_lang = original_lang_node.attrib['lang_code']
 595
 596             sub_lang_list = {}
 597             for lang_node in caption_list.findall('target'):
 598                 sub_lang = lang_node.attrib['lang_code']
 599                 params = compat_urllib_parse.urlencode({
 600                     'lang': original_lang,
 601                     'tlang': sub_lang,
 602                     'fmt': sub_format,
 603                     'ts': timestamp,
 604                     'kind': 'asr',
 605                 })
 606                 sub_lang_list[sub_lang] = caption_url + '&' + params
 607             return sub_lang_list
 608         # An extractor error can be raise by the download process if there are
 609         # no automatic captions but there are subtitles
 610         except (KeyError, ExtractorError):
 611             self._downloader.report_warning(err_msg)
 612             return {}
 613
 614     @classmethod
 615     def extract_id(cls, url):
 616         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 617         if mobj is None:
 618             raise ExtractorError(u'Invalid URL: %s' % url)
 619         video_id = mobj.group(2)
 620         return video_id
 621
 622     def _extract_from_m3u8(self, manifest_url, video_id):
 623         url_map = {}
 624         def _get_urls(_manifest):
 625             lines = _manifest.split('\n')
 626             urls = filter(lambda l: l and not l.startswith('#'),
 627                             lines)
 628             return urls
 629         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 630         formats_urls = _get_urls(manifest)
 631         for format_url in formats_urls:
 632             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 633             url_map[itag] = format_url
 634         return url_map
 635
 636     def _extract_annotations(self, video_id):
 637         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 638         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
 639
 640     def _real_extract(self, url):
 641         proto = (
 642             'http' if self._downloader.params.get('prefer_insecure', False)
 643             else 'https')
 644
 645         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 646         mobj = re.search(self._NEXT_URL_RE, url)
 647         if mobj:
 648             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 649         video_id = self.extract_id(url)
 650
 651         # Get video webpage
 652         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 653         video_webpage = self._download_webpage(url, video_id)
 654
 655         # Attempt to extract SWF player URL
 656         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 657         if mobj is not None:
 658             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 659         else:
 660             player_url = None
 661
 662         # Get video info
 663         self.report_video_info_webpage_download(video_id)
 664         if re.search(r'player-age-gate-content">', video_webpage) is not None:
 665             self.report_age_confirmation()
 666             age_gate = True
 667             # We simulate the access to the video from www.youtube.com/v/{video_id}
 668             # this can be viewed without login into Youtube
 669             data = compat_urllib_parse.urlencode({
 670                 'video_id': video_id,
 671                 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
 672                 'sts': self._search_regex(
 673                     r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
 674             })
 675             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
 676             video_info_webpage = self._download_webpage(video_info_url, video_id,
 677                                     note=False,
 678                                     errnote='unable to download video info webpage')
 679             video_info = compat_parse_qs(video_info_webpage)
 680         else:
 681             age_gate = False
 682             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 683                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 684                         % (video_id, el_type))
 685                 video_info_webpage = self._download_webpage(video_info_url, video_id,
 686                                         note=False,
 687                                         errnote='unable to download video info webpage')
 688                 video_info = compat_parse_qs(video_info_webpage)
 689                 if 'token' in video_info:
 690                     break
 691         if 'token' not in video_info:
 692             if 'reason' in video_info:
 693                 raise ExtractorError(
 694                     'YouTube said: %s' % video_info['reason'][0],
 695                     expected=True, video_id=video_id)
 696             else:
 697                 raise ExtractorError(
 698                     '"token" parameter not in video info for unknown reason',
 699                     video_id=video_id)
 700
 701         if 'view_count' in video_info:
 702             view_count = int(video_info['view_count'][0])
 703         else:
 704             view_count = None
 705
 706         # Check for "rental" videos
 707         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 708             raise ExtractorError(u'"rental" videos not supported')
 709
 710         # Start extracting information
 711         self.report_information_extraction(video_id)
 712
 713         # uploader
 714         if 'author' not in video_info:
 715             raise ExtractorError(u'Unable to extract uploader name')
 716         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 717
 718         # uploader_id
 719         video_uploader_id = None
 720         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 721         if mobj is not None:
 722             video_uploader_id = mobj.group(1)
 723         else:
 724             self._downloader.report_warning(u'unable to extract uploader nickname')
 725
 726         # title
 727         if 'title' in video_info:
 728             video_title = video_info['title'][0]
 729         else:
 730             self._downloader.report_warning(u'Unable to extract video title')
 731             video_title = '_'
 732
 733         # thumbnail image
 734         # We try first to get a high quality image:
 735         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
 736                             video_webpage, re.DOTALL)
 737         if m_thumb is not None:
 738             video_thumbnail = m_thumb.group(1)
 739         elif 'thumbnail_url' not in video_info:
 740             self._downloader.report_warning(u'unable to extract video thumbnail')
 741             video_thumbnail = None
 742         else:   # don't panic if we can't find it
 743             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 744
 745         # upload date
 746         upload_date = None
 747         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
 748         if mobj is None:
 749             mobj = re.search(
 750                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
 751                 video_webpage)
 752         if mobj is not None:
 753             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 754             upload_date = unified_strdate(upload_date)
 755
 756         m_cat_container = self._search_regex(
 757             r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
 758             video_webpage, 'categories', fatal=False)
 759         if m_cat_container:
 760             category = self._html_search_regex(
 761                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
 762                 default=None)
 763             video_categories = None if category is None else [category]
 764         else:
 765             video_categories = None
 766
 767         # description
 768         video_description = get_element_by_id("eow-description", video_webpage)
 769         if video_description:
 770             video_description = re.sub(r'''(?x)
 771                 <a\s+
 772                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 773                     title="([^"]+)"\s+
 774                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 775                     class="yt-uix-redirect-link"\s*>
 776                 [^<]+
 777                 </a>
 778             ''', r'\1', video_description)
 779             video_description = clean_html(video_description)
 780         else:
 781             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 782             if fd_mobj:
 783                 video_description = unescapeHTML(fd_mobj.group(1))
 784             else:
 785                 video_description = ''
 786
 787         def _extract_count(count_name):
 788             count = self._search_regex(
 789                 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
 790                 video_webpage, count_name, default=None)
 791             if count is not None:
 792                 return int(count.replace(',', ''))
 793             return None
 794         like_count = _extract_count(u'like')
 795         dislike_count = _extract_count(u'dislike')
 796
 797         # subtitles
 798         video_subtitles = self.extract_subtitles(video_id, video_webpage)
 799
 800         if self._downloader.params.get('listsubtitles', False):
 801             self._list_available_subtitles(video_id, video_webpage)
 802             return
 803
 804         if 'length_seconds' not in video_info:
 805             self._downloader.report_warning(u'unable to extract video duration')
 806             video_duration = None
 807         else:
 808             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
 809
 810         # annotations
 811         video_annotations = None
 812         if self._downloader.params.get('writeannotations', False):
 813                 video_annotations = self._extract_annotations(video_id)
 814
 815         # Decide which formats to download
 816         try:
 817             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
 818             if not mobj:
 819                 raise ValueError('Could not find vevo ID')
 820             json_code = uppercase_escape(mobj.group(1))
 821             ytplayer_config = json.loads(json_code)
 822             args = ytplayer_config['args']
 823             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
 824             # this signatures are encrypted
 825             if 'url_encoded_fmt_stream_map' not in args:
 826                 raise ValueError(u'No stream_map present')  # caught below
 827             re_signature = re.compile(r'[&,]s=')
 828             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
 829             if m_s is not None:
 830                 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
 831                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
 832             m_s = re_signature.search(args.get('adaptive_fmts', ''))
 833             if m_s is not None:
 834                 if 'adaptive_fmts' in video_info:
 835                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
 836                 else:
 837                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
 838         except ValueError:
 839             pass
 840
 841         def _map_to_format_list(urlmap):
 842             formats = []
 843             for itag, video_real_url in urlmap.items():
 844                 dct = {
 845                     'format_id': itag,
 846                     'url': video_real_url,
 847                     'player_url': player_url,
 848                 }
 849                 if itag in self._formats:
 850                     dct.update(self._formats[itag])
 851                 formats.append(dct)
 852             return formats
 853
 854         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 855             self.report_rtmp_download()
 856             formats = [{
 857                 'format_id': '_rtmp',
 858                 'protocol': 'rtmp',
 859                 'url': video_info['conn'][0],
 860                 'player_url': player_url,
 861             }]
 862         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
 863             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
 864             if 'rtmpe%3Dyes' in encoded_url_map:
 865                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
 866             url_map = {}
 867             for url_data_str in encoded_url_map.split(','):
 868                 url_data = compat_parse_qs(url_data_str)
 869                 if 'itag' not in url_data or 'url' not in url_data:
 870                     continue
 871                 format_id = url_data['itag'][0]
 872                 url = url_data['url'][0]
 873
 874                 if 'sig' in url_data:
 875                     url += '&signature=' + url_data['sig'][0]
 876                 elif 's' in url_data:
 877                     encrypted_sig = url_data['s'][0]
 878
 879                     if not age_gate:
 880                         jsplayer_url_json = self._search_regex(
 881                             r'"assets":.+?"js":\s*("[^"]+")',
 882                             video_webpage, 'JS player URL')
 883                         player_url = json.loads(jsplayer_url_json)
 884                     if player_url is None:
 885                         player_url_json = self._search_regex(
 886                             r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 887                             video_webpage, 'age gate player URL')
 888                         player_url = json.loads(player_url_json)
 889
 890                     if self._downloader.params.get('verbose'):
 891                         if player_url is None:
 892                             player_version = 'unknown'
 893                             player_desc = 'unknown'
 894                         else:
 895                             if player_url.endswith('swf'):
 896                                 player_version = self._search_regex(
 897                                     r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 898                                     'flash player', fatal=False)
 899                                 player_desc = 'flash player %s' % player_version
 900                             else:
 901                                 player_version = self._search_regex(
 902                                     r'html5player-([^/]+?)(?:/html5player)?\.js',
 903                                     player_url,
 904                                     'html5 player', fatal=False)
 905                                 player_desc = 'html5 player %s' % player_version
 906
 907                         parts_sizes = self._signature_cache_id(encrypted_sig)
 908                         self.to_screen(u'{%s} signature length %s, %s' %
 909                             (format_id, parts_sizes, player_desc))
 910
 911                     signature = self._decrypt_signature(
 912                         encrypted_sig, video_id, player_url, age_gate)
 913                     url += '&signature=' + signature
 914                 if 'ratebypass' not in url:
 915                     url += '&ratebypass=yes'
 916                 url_map[format_id] = url
 917             formats = _map_to_format_list(url_map)
 918         elif video_info.get('hlsvp'):
 919             manifest_url = video_info['hlsvp'][0]
 920             url_map = self._extract_from_m3u8(manifest_url, video_id)
 921             formats = _map_to_format_list(url_map)
 922         else:
 923             raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
 924
 925         # Look for the DASH manifest
 926         if (self._downloader.params.get('youtube_include_dash_manifest', False)):
 927             try:
 928                 # The DASH manifest used needs to be the one from the original video_webpage.
 929                 # The one found in get_video_info seems to be using different signatures.
 930                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
 931                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
 932                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
 933                 if age_gate:
 934                     dash_manifest_url = video_info.get('dashmpd')[0]
 935                 else:
 936                     dash_manifest_url = ytplayer_config['args']['dashmpd']
 937                 def decrypt_sig(mobj):
 938                     s = mobj.group(1)
 939                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 940                     return '/signature/%s' % dec_s
 941                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
 942                 dash_doc = self._download_xml(
 943                     dash_manifest_url, video_id,
 944                     note=u'Downloading DASH manifest',
 945                     errnote=u'Could not download DASH manifest')
 946                 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 947                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 948                     if url_el is None:
 949                         continue
 950                     format_id = r.attrib['id']
 951                     video_url = url_el.text
 952                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 953                     f = {
 954                         'format_id': format_id,
 955                         'url': video_url,
 956                         'width': int_or_none(r.attrib.get('width')),
 957                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 958                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 959                         'filesize': filesize,
 960                     }
 961                     try:
 962                         existing_format = next(
 963                             fo for fo in formats
 964                             if fo['format_id'] == format_id)
 965                     except StopIteration:
 966                         f.update(self._formats.get(format_id, {}))
 967                         formats.append(f)
 968                     else:
 969                         existing_format.update(f)
 970
 971             except (ExtractorError, KeyError) as e:
 972                 self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)
 973
 974         self._sort_formats(formats)
 975
 976         return {
 977             'id':           video_id,
 978             'uploader':     video_uploader,
 979             'uploader_id':  video_uploader_id,
 980             'upload_date':  upload_date,
 981             'title':        video_title,
 982             'thumbnail':    video_thumbnail,
 983             'description':  video_description,
 984             'categories':   video_categories,
 985             'subtitles':    video_subtitles,
 986             'duration':     video_duration,
 987             'age_limit':    18 if age_gate else 0,
 988             'annotations':  video_annotations,
 989             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
 990             'view_count':   view_count,
 991             'like_count': like_count,
 992             'dislike_count': dislike_count,
 993             'formats':      formats,
 994         }
 995
 996 class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
 997     IE_DESC = 'YouTube.com playlists'
 998     _VALID_URL = r"""(?x)(?:
 999                         (?:https?://)?
1000                         (?:\w+\.)?
1001                         youtube\.com/
1002                         (?:
1003                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1004                            \? (?:.*?&)*? (?:p|a|list)=
1005                         |  p/
1006                         )
1007                         (
1008                             (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
1009                             # Top tracks, they can also include dots
1010                             |(?:MC)[\w\.]*
1011                         )
1012                         .*
1013                      |
1014                         ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
1015                      )"""
1016     _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
1017     _MORE_PAGES_INDICATOR = r'data-link-type="next"'
1018     _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
1019     IE_NAME = 'youtube:playlist'
1020     _TESTS = [{
1021         'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1022         'info_dict': {
1023             'title': 'ytdl test PL',
1024         },
1025         'playlist_count': 3,
1026     }, {
1027         'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1028         'info_dict': {
1029             'title': 'YDL_Empty_List',
1030         },
1031         'playlist_count': 0,
1032     }, {
1033         'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1034         'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1035         'info_dict': {
1036             'title': '29C3: Not my department',
1037         },
1038         'playlist_count': 95,
1039     }, {
1040         'note': 'issue #673',
1041         'url': 'PLBB231211A4F62143',
1042         'info_dict': {
1043             'title': 'Team Fortress 2 (Class-based LP)',
1044         },
1045         'playlist_mincount': 26,
1046     }, {
1047         'note': 'Large playlist',
1048         'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1049         'info_dict': {
1050             'title': 'Uploads from Cauchemar',
1051         },
1052         'playlist_mincount': 799,
1053     }, {
1054         'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1055         'info_dict': {
1056             'title': 'YDL_safe_search',
1057         },
1058         'playlist_count': 2,
1059     }]
1060
1061     def _real_initialize(self):
1062         self._login()
1063
1064     def _ids_to_results(self, ids):
1065         return [
1066             self.url_result(vid_id, 'Youtube', video_id=vid_id)
1067             for vid_id in ids]
1068
1069     def _extract_mix(self, playlist_id):
1070         # The mixes are generated from a a single video
1071         # the id of the playlist is just 'RD' + video_id
1072         url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
1073         webpage = self._download_webpage(
1074             url, playlist_id, 'Downloading Youtube mix')
1075         search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1076         title_span = (
1077             search_title('playlist-title') or
1078             search_title('title long-title') or
1079             search_title('title'))
1080         title = clean_html(title_span)
1081         ids = orderedSet(re.findall(
1082             r'''(?xs)data-video-username=".*?".*?
1083                        href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
1084             webpage))
1085         url_results = self._ids_to_results(ids)
1086
1087         return self.playlist_result(url_results, playlist_id, title)
1088
1089     def _real_extract(self, url):
1090         # Extract playlist id
1091         mobj = re.match(self._VALID_URL, url)
1092         if mobj is None:
1093             raise ExtractorError(u'Invalid URL: %s' % url)
1094         playlist_id = mobj.group(1) or mobj.group(2)
1095
1096         # Check if it's a video-specific URL
1097         query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1098         if 'v' in query_dict:
1099             video_id = query_dict['v'][0]
1100             if self._downloader.params.get('noplaylist'):
1101                 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
1102                 return self.url_result(video_id, 'Youtube', video_id=video_id)
1103             else:
1104                 self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1105
1106         if playlist_id.startswith('RD'):
1107             # Mixes require a custom extraction process
1108             return self._extract_mix(playlist_id)
1109         if playlist_id.startswith('TL'):
1110             raise ExtractorError(u'For downloading YouTube.com top lists, use '
1111                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
1112
1113         url = self._TEMPLATE_URL % playlist_id
1114         page = self._download_webpage(url, playlist_id)
1115         more_widget_html = content_html = page
1116
1117         # Check if the playlist exists or is private
1118         if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
1119             raise ExtractorError(
1120                 'The playlist doesn\'t exist or is private, use --username or '
1121                 '--netrc to access it.',
1122                 expected=True)
1123
1124         # Extract the video ids from the playlist pages
1125         ids = []
1126
1127         for page_num in itertools.count(1):
1128             matches = re.finditer(self._VIDEO_RE, content_html)
1129             # We remove the duplicates and the link with index 0
1130             # (it's not the first video of the playlist)
1131             new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
1132             ids.extend(new_ids)
1133
1134             mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
1135             if not mobj:
1136                 break
1137
1138             more = self._download_json(
1139                 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
1140                 'Downloading page #%s' % page_num,
1141                 transform_source=uppercase_escape)
1142             content_html = more['content_html']
1143             more_widget_html = more['load_more_widget_html']
1144
1145         playlist_title = self._html_search_regex(
1146             r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
1147             page, 'title')
1148
1149         url_results = self._ids_to_results(ids)
1150         return self.playlist_result(url_results, playlist_id, playlist_title)
1151
1152
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extract the videos of a top list given as "yttoplist:{channel}:{list title}" """
    IE_NAME = 'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = []
    # Upper bound on the number of downloads of the list page while waiting
    # for it to contain videos (prevents retrying forever)
    _MAX_DOWNLOAD_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return the playlist result of the top list described by *url*.

        The list link is looked up in the channel page by its title, then the
        list page is downloaded and the ids of its videos are collected.

        Raises ExtractorError when the list page still contains no video
        after _MAX_DOWNLOAD_ATTEMPTS downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # The link of the list carries the url-encoded title in its query
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry (a bounded number of times) until we get them
        for attempt in range(self._MAX_DOWNLOAD_ATTEMPTS):
            msg = 'Downloading Youtube top list'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                u'Unable to find any video in top list "%s" after %d attempts'
                % (title, self._MAX_DOWNLOAD_ATTEMPTS))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1185
1186
class YoutubeChannelIE(InfoExtractor):
    """Extract the videos of a YouTube channel as a playlist result"""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = 'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, in order and without duplicates."""
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Return a playlist result with every video of the channel in *url*.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        # Extract channel id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_match = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_match is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                # Stop as soon as no further page is announced
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1241
1242
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user as a playlist.

    Matches youtube.com user URLs and the "ytuser:" keyword. The video ids are
    read page by page from the GData uploads feed (_GDATA_URL).
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # Number of entries requested per feed page
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    @classmethod
    def suitable(cls, url):
        """Tell whether this extractor should handle `url`.

        Every other module-level name ending in 'IE' is asked first; if any
        of them accepts the URL this class declines, otherwise the inherited
        check decides.
        """
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match it as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist result with the uploads of the user in `url`.

        The playlist entries are produced lazily through a PagedList.

        Raises ExtractorError if `url` does not match _VALID_URL or if a
        feed page cannot be parsed as JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url-type entry per video of feed page `pagenum`."""
            # Page 0 maps to start index 1, page 1 to 51, and so on.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index covered by this request (inclusive), used only for
            # the progress note below.
            last_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, last_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # A feed without 'entry' means there are no (more) videos
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1303
1304
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor that queries the GData videos feed (jsonc output)."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Build a playlist with at most `n` results for `query`.

        Pages of 50 results are requested until either `n` ids or the
        total reported by the API ('totalItems') have been covered.

        Raises ExtractorError (expected) when a response has no 'items'.
        """
        page_size = 50
        collected_ids = []
        page_index = 0
        wanted = n

        while page_size * page_index < wanted:
            quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))
            first_item = page_size * page_index + 1
            api_url = self._API_URL % (quoted_query, first_item)
            raw_json = self._download_webpage(
                api_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (page_index + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(raw_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            page_ids = [item['id'] for item in api_response['items']]
            collected_ids.extend(page_ids)

            # Never ask for more than the API says is available
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have pushed the count past n
        entries = []
        for video_id in collected_ids[:n]:
            entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(entries, query)
1346
1347
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API URL adds orderby=published."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1353
1354
class YoutubeSearchURLIE(InfoExtractor):
    """Turn a youtube.com/results?search_query=... page into a playlist."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        """Return a playlist dict titled with the decoded search query.

        One url-type entry is produced per "yt-lockup-title" heading found
        inside the first "item-section" list of the downloaded page.
        """
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        page_html = self._download_webpage(url, query)
        results_html = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', page_html, 'result HTML')

        def build_entry(heading_html):
            # The title is optional (fatal=False); the link is required.
            title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], heading_html, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', heading_html, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': title,
            }

        headings = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html)

        return {
            '_type': 'playlist',
            'entries': [build_entry(heading) for heading in headings],
            'title': query,
        }
1389
1390
class YoutubeShowIE(InfoExtractor):
    """Resolve a youtube.com/show/ page into one playlist link per season."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return a list of url results, one for each playlist link found."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, 'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        season_results = []
        for season_path in season_paths:
            season_results.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        return season_results
1404
1405
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses have to provide _FEED_NAME and _PLAYLIST_TITLE.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """feed_ajax URL for this feed with a %s placeholder for paging."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk all pages of the feed and return them as a single playlist."""
        entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget the feed HTML itself is searched
            # for the link to the next page.
            more_html = info.get('load_more_widget_html') or feed_html
            video_ids = orderedSet(
                re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                more_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_num += 1
        return self.playlist_result(entries, playlist_title=self._PLAYLIST_TITLE)
1451
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1457
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1464
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the personal 'history' feed."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: "\." is not a valid escape sequence in a plain literal
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1471
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Redirect to the playlist linked from the my_favorites page."""
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Return a url result pointing at the favourites playlist id."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first "list=" parameter on the page holds the playlist id
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
1482
1483
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Collect the videos of the subscriptions feed into a playlist."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    IE_NAME = 'youtube:subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist dict with every video id found in the feed.

        The first page is plain HTML; further pages are JSON documents
        reached through the "load more" link of the previous page.
        """
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        video_ids = []
        content_html = first_page
        more_widget_html = first_page
        page_num = 1

        while True:
            found = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
            video_ids.extend(orderedSet(found))

            more_link = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_link:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(video_ids),
        }
1520
1521
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution_link URLs that lack a video id and explain why."""
    IE_DESC = False  # Do not list
    IE_NAME = 'youtube:truncated_url'
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)