15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
21 compat_urllib_request,
28 get_element_by_attribute,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account sign-in endpoint (YouTube uses Google accounts)
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Pins the site to English/US so later regex-based scraping sees stable markup
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Endpoint used to confirm the viewer's age for age-gated videos
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name under which credentials are looked up in ~/.netrc
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # A truthy download result means the language preference was applied
        return bool(self._download_webpage(
            note=u'Setting language', errnote='unable to set language',

        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:

        # GALX is a hidden anti-forgery token that must be echoed back in the form
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Fields of the sign-in form POSTed back to _LOGIN_URL
        u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')

    def _confirm_age(self):
        # POST the confirmation form so age-gated pages become reachable
            'action_confirm': 'Confirm',
        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))

        self._download_webpage(
            note=u'Confirming age', errnote=u'Unable to confirm age')

    def _real_initialize(self):
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (re.X) pattern covering watch/embed/short/googleapis URL forms;
    # the 11-character video id is captured by the ([0-9A-Za-z_-]{11}) group
    _VALID_URL = r"""(?x)^
                     (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                     (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                     |youtu\.be/ # just youtu.be/xxxx
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
    # Matches the next_url redirect parameter (used by age-verification pages)
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Maps itag -> container/extension
    _video_extensions = {
        # Apple HTTP Live Streaming
    # Maps itag -> pixel dimensions or display label, used for format listings
    _video_dimensions = {
        '5': {'width': 400, 'height': 240},
        '17': {'width': 176, 'height': 144},
        '18': {'width': 640, 'height': 360},
        '22': {'width': 1280, 'height': 720},
        '34': {'width': 640, 'height': 360},
        '35': {'width': 854, 'height': 480},
        '36': {'width': 320, 'height': 240},
        '37': {'width': 1920, 'height': 1080},
        '38': {'width': 4096, 'height': 3072},
        '43': {'width': 640, 'height': 360},
        '44': {'width': 854, 'height': 480},
        '45': {'width': 1280, 'height': 720},
        '46': {'width': 1920, 'height': 1080},
        '82': {'height': 360, 'display': '360p'},
        '83': {'height': 480, 'display': '480p'},
        '84': {'height': 720, 'display': '720p'},
        '85': {'height': 1080, 'display': '1080p'},
        '92': {'height': 240, 'display': '240p'},
        '93': {'height': 360, 'display': '360p'},
        '94': {'height': 480, 'display': '480p'},
        '95': {'height': 720, 'display': '720p'},
        '96': {'height': 1080, 'display': '1080p'},
        '100': {'height': 360, 'display': '360p'},
        '101': {'height': 480, 'display': '480p'},
        '102': {'height': 720, 'display': '720p'},
        '132': {'height': 240, 'display': '240p'},
        '151': {'height': 72, 'display': '72p'},
        '133': {'height': 240, 'display': '240p'},
        '134': {'height': 360, 'display': '360p'},
        '135': {'height': 480, 'display': '480p'},
        '136': {'height': 720, 'display': '720p'},
        '137': {'height': 1080, 'display': '1080p'},
        '138': {'height': 1081, 'display': '>1080p'},
        '139': {'display': '48k'},
        '140': {'display': '128k'},
        '141': {'display': '256k'},
        '160': {'height': 192, 'display': '192p'},
        '171': {'display': '128k'},
        '172': {'display': '256k'},
        '242': {'height': 240, 'display': '240p'},
        '243': {'height': 360, 'display': '360p'},
        '244': {'height': 480, 'display': '480p'},
        '245': {'height': 480, 'display': '480p'},
        '246': {'height': 480, 'display': '480p'},
        '247': {'height': 720, 'display': '720p'},
        '248': {'height': 1080, 'display': '1080p'},
    # Test fixtures: each entry pairs a real URL with the expected metadata
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:5b292926389560516e384ac437c0ec07",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
        u"file": u"yZIXLfi8CZQ.mp4",
        u"note": u"Embed-only video (#1746)",
        u"upload_date": u"20120608",
        u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
        u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
        u"uploader": u"SET India",
        u"uploader_id": u"setindia"
353 def suitable(cls, url):
354 """Receives a URL and returns True if suitable for this IE."""
355 if YoutubePlaylistIE.suitable(url): return False
356 return re.match(cls._VALID_URL, url) is not None
358 def __init__(self, *args, **kwargs):
359 super(YoutubeIE, self).__init__(*args, **kwargs)
360 self._player_cache = {}
362 def report_video_info_webpage_download(self, video_id):
363 """Report attempt to download video info webpage."""
364 self.to_screen(u'%s: Downloading video info webpage' % video_id)
366 def report_information_extraction(self, video_id):
367 """Report attempt to extract video information."""
368 self.to_screen(u'%s: Extracting video information' % video_id)
370 def report_unavailable_format(self, video_id, format):
371 """Report extracted video URL."""
372 self.to_screen(u'%s: Format %s not available' % (video_id, format))
374 def report_rtmp_download(self):
375 """Indicate the download will use the RTMP protocol."""
376 self.to_screen(u'RTMP download detected')
    def _extract_signature_function(self, video_id, player_url, slen):
        # Derive a cache key from the player URL: "...-<id>.<ext>"
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None

        cache_fn = os.path.join(os.path.expanduser(cache_dir),
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            # The cached spec is a list of source indices: the signature
            # function is a fixed permutation/selection of input characters
            return lambda s: u''.join(s[i] for i in cache_spec)
                pass # No cache available

        # No cache hit: download the player and parse the signature routine
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_swf(code)
            assert False, 'Invalid player type %r' % player_type

            # Run the function once on a known string to record which input
            # index each output character came from, then persist that spec
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                write_json_file(cache_spec, cache_fn)
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))
    def _print_sig_code(self, func, slen):
        """Print Python source code equivalent to the extracted signature function."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting parts at their defaults
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            start = '(Never used)' # Quelch pyflakes warnings - start will be
                                   # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                    yield _genslice(start, prev, step)

                if i - prev in [-1, 1]:
                    yield u's[%d]' % prev
            yield _genslice(start, i, step)

        # Probe the function with a known string to recover its index spec
        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
    def _parse_sig_js(self, jscode):
        """Interpret the JS player's signature routine and return it as a Python callable."""
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            # Execute a single JS statement; recursion is bounded to avoid
            # runaway evaluation of malformed player code
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
                if ass_m.groupdict().get('index'):
                    # Assignment to an indexed element: out[index] = expr
                    lvar = local_vars[ass_m.group('out')]
                    idx = interpret_expression(ass_m.group('index'),
                                               local_vars, allow_recursion)
                    assert isinstance(idx, int)
                    expr = ass_m.group('expr')
                        local_vars[ass_m.group('out')] = val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                expr = stmt[len(u'return '):]
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)

        def interpret_expression(expr, local_vars, allow_recursion):
                return local_vars[expr]

            # Member access: <var>.<member>
            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                if member == 'join("")':
                if member == 'length':
                if member == 'reverse()':
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)

                # Indexing: <var>[<idx>]
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,

            # Binary modulo: <a>%<b>
            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)

                # Function call: <func>(<args>); functions are extracted lazily
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Locate the named function body in the player source and wrap it
            # as a Python callable that interprets it statement by statement
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            argnames = func_m.group('args').split(',')
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)

        initial_function = extract_function(funcname)
        # The signature function takes a single string argument
        return lambda s: initial_function([s])
    def _parse_sig_swf(self, file_contents):
        """Parse the SWF player's ActionScript bytecode and return the decipher routine as a Python callable."""
        # SWF header magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed)
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
            raise NotImplementedError(u'Unsupported compression format %r' %

        def extract_tags(content):
            # Walk the SWF tag stream, yielding (tag_code, tag_body) pairs;
            # a 0x3f short length signals an extended 32-bit length field
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])

                    for tag_code, tag in extract_tags(content)
        # Skip the NUL-terminated name that precedes the ABC payload
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length little-endian integer, 7 bits per byte
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)

        def u30(reader=None):
            # Unsigned 30-bit integer: top nibble of the varint must be clear
            res = read_int(reader)
            assert res & 0xf0000000 == 0

        def s32(reader=None):
            # Signed 32-bit integer: sign-extend from the unsigned varint
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)

        def read_string(reader=None):
            # Length-prefixed UTF-8 string
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            resb = reader.read(count)
            assert len(resb) == count

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]

        # minor_version + major_version
        # Constant pool: ints, uints, doubles, strings, namespaces, ns sets
        for _c in range(1, int_count):
        for _c in range(1, uint_count):
        read_bytes((double_count-1) * 8)
        constant_strings = [u'']
        for _c in range(1, string_count):
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
        for _c in range(1, ns_set_count):
            for _c2 in range(count):
        multiname_count = u30()
        # Number of extra u30 fields to skip per multiname kind
            0x0e: 2, # MultinameA
            0x1b: 1, # MultinameL
            0x1c: 1, # MultinameLA
        for _c in range(1, multiname_count):
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
                u30() # namespace_idx
                multinames.append(constant_strings[name_idx])
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):

        # Method signatures: only the two flag bits we care about are kept
        MethodInfo = collections.namedtuple(
            ['NEED_ARGUMENTS', 'NEED_REST'])
        for method_id in range(method_count):
            for _ in range(param_count):
            u30() # name index (always 0 for youtube)
            if flags & 0x08 != 0:
                for c in range(option_count):
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata (skipped)
        metadata_count = u30()
        for _c in range(metadata_count):
            for _c2 in range(item_count):

        def parse_traits_info():
            # Parse one trait entry; records method traits, skips the rest
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            if kind in [0x00, 0x06]: # Slot or Const
                u30() # type_name_idx
            elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04: # Class
            elif kind == 0x05: # Function
                methods[function_idx] = multinames[trait_name_idx]
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0: # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30() # metadata index

        # Classes: locate the class holding the signature decipher code
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        for class_id in range(class_count):
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30() # super_name idx
            if flags & 0x08 != 0: # Protected namespace is present
                u30() # protected_ns_idx
            for _c2 in range(intrf_count):
            for _c2 in range(trait_count):

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %

        for class_id in range(class_count):
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    # Keep only methods of the target class
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        for name, idx in trait_methods.items()))

        # Scripts (skipped)
        for _c in range(script_count):
            for _c2 in range(trait_count):

        # Method bodies: collect bytecode for the methods we identified above
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            u30() # init_scope_depth
            u30() # max_scope_depth
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
            for _c2 in range(trait_count):

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Compile (memoized) one ABC method into a Python callable by
            # interpreting its AVM2 opcodes with a simple stack machine
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            # Register 0 is 'this'; then the arguments, then locals
            registers = ['(this)'] + list(args) + [None] * m.local_count
            coder = io.BytesIO(m.code)
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36: # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                elif opcode == 44: # pushstring
                    stack.append(constant_strings[idx])
                elif opcode == 48: # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                elif opcode == 70: # callproperty
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        res = obj.split(args[0])
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                elif opcode == 72: # returnvalue
                elif opcode == 79: # callpropvoid
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                elif opcode == 93: # findpropstrict
                    mname = multinames[index]
                    res = extract_function(mname)
                elif opcode == 97: # setproperty
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                elif opcode == 98: # getlocal
                    stack.append(registers[index])
                elif opcode == 99: # setlocal
                    registers[index] = value
                elif opcode == 102: # getproperty
                    pname = multinames[index]
                    if pname == u'length':
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else: # Assume attribute access
                        assert isinstance(idx, int)
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128: # coerce
                elif opcode == 133: # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164: # modulo
                    res = value1 % value2
                elif opcode == 208: # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209: # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210: # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211: # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214: # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215: # setlocal_3
                    registers[3] = stack.pop()
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc

        initial_function = extract_function(u'decipher')
        # The decipher routine takes a single string argument
        return lambda s: initial_function([s])
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
            # Normalize protocol-relative player URLs
            if player_url.startswith(u'//'):
                player_url = u'https:' + player_url
                # Cache the extracted function per (player, signature length)
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

            self._downloader.report_warning(
                u'Warning: Falling back to static signature algorithm')
        # Last resort: hard-coded per-length permutations
        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        """Decrypt s with a hard-coded permutation chosen by signature length."""
            # The videos with age protection use another player, so the
            # algorithms can be different.
                return s[2:63] + s[82] + s[64:82] + s[63]

            # Each branch below is the known permutation for one len(s)
            return s[86:29:-1] + s[88] + s[28:5:-1]
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[84:27:-1] + s[86] + s[26:5:-1]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _get_available_subtitles(self, video_id, webpage):
        """Return a dict mapping subtitle language -> timedtext URL."""
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            # Best effort: warn instead of aborting the whole extraction
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': l[0].encode('utf-8'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
        return sub_lang_list
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL lives inside the inline ytplayer.config JSON
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
            player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # kind="asr" marks the automatic-speech-recognition source track
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available target language
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _extract_id(self, url):
        """Extract the 11-character video id from any supported URL form."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        # _available_formats is ordered best-first; keep only itags present
        existing_formats = [x for x in self._available_formats if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        video_url_list.reverse() # order worst to best
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Build an itag -> stream URL map from an HLS (m3u8) manifest."""
        def _get_urls(_manifest):
            # Keep only payload lines; '#'-prefixed lines are m3u8 directives
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # The itag is embedded in the variant URL path
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
1172 def _extract_annotations(self, video_id):
1173 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1174 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Normalize to a canonical watch URL (English, age check pre-verified)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-escaped URL (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            # Try several 'el' variants until one response contains a token
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (scraped from the page markup)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
            self._downloader.report_warning(u'Unable to extract video title')

        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then parse
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description: strip YouTube's redirect-link wrappers before cleaning
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
            # Fall back to the meta description tag
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        def _extract_count(klass):
            # Read a comma-grouped integer out of a span with the given class
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present') # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1347 m_s = re_signature.search(args.get('adaptive_fmts', u''))
1349 if 'adaptive_fmts' in video_info:
1350 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1352 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1356 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1357 self.report_rtmp_download()
1358 video_url_list = [(None, video_info['conn'][0])]
1359 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1360 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1361 if 'rtmpe%3Dyes' in encoded_url_map:
1362 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1364 for url_data_str in encoded_url_map.split(','):
1365 url_data = compat_parse_qs(url_data_str)
1366 if 'itag' in url_data and 'url' in url_data:
1367 url = url_data['url'][0]
1368 if 'sig' in url_data:
1369 url += '&signature=' + url_data['sig'][0]
1370 elif 's' in url_data:
1371 encrypted_sig = url_data['s'][0]
1372 if self._downloader.params.get('verbose'):
1374 if player_url is None:
1375 player_version = 'unknown'
1377 player_version = self._search_regex(
1378 r'-(.+)\.swf$', player_url,
1379 u'flash player', fatal=False)
1380 player_desc = 'flash player %s' % player_version
1382 player_version = self._search_regex(
1383 r'html5player-(.+?)\.js', video_webpage,
1384 'html5 player', fatal=False)
1385 player_desc = u'html5 player %s' % player_version
1387 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1388 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1389 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1392 jsplayer_url_json = self._search_regex(
1393 r'"assets":.+?"js":\s*("[^"]+")',
1394 video_webpage, u'JS player URL')
1395 player_url = json.loads(jsplayer_url_json)
1397 signature = self._decrypt_signature(
1398 encrypted_sig, video_id, player_url, age_gate)
1399 url += '&signature=' + signature
1400 if 'ratebypass' not in url:
1401 url += '&ratebypass=yes'
1402 url_map[url_data['itag'][0]] = url
1403 video_url_list = self._get_video_url_list(url_map)
1404 elif video_info.get('hlsvp'):
1405 manifest_url = video_info['hlsvp'][0]
1406 url_map = self._extract_from_m3u8(manifest_url, video_id)
1407 video_url_list = self._get_video_url_list(url_map)
1409 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1412 for itag, video_real_url in video_url_list:
1414 video_extension = self._video_extensions.get(itag, 'flv')
1415 resolution = self._video_dimensions.get(itag, {}).get('display')
1416 width = self._video_dimensions.get(itag, {}).get('width')
1417 height = self._video_dimensions.get(itag, {}).get('height')
1418 note = self._special_itags.get(itag)
1420 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1421 '%dx%d' % (width, height) if width is not None and height is not None else (resolution if resolution is not None else '???'),
1422 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
1425 'url': video_real_url,
1426 'ext': video_extension,
1427 'format': video_format,
1429 'player_url': player_url,
1430 '_resolution': resolution,
1433 'format_note': note,
1438 'uploader': video_uploader,
1439 'uploader_id': video_uploader_id,
1440 'upload_date': upload_date,
1441 'title': video_title,
1442 'thumbnail': video_thumbnail,
1443 'description': video_description,
1444 'subtitles': video_subtitles,
1445 'duration': video_duration,
1446 'age_limit': 18 if age_gate else 0,
1447 'annotations': video_annotations,
1448 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1449 'view_count': view_count,
1450 'like_count': like_count,
1451 'dislike_count': dislike_count,
# Extractor for YouTube.com playlists: resolves the playlist id from the URL,
# walks the paginated playlist HTML pages, and returns one url_result per
# contained video. Also handles "RD" mixes via a dedicated code path.
# NOTE(review): this excerpt is missing several original source lines (the
# embedded numbering is non-contiguous), so some guards/initializers are not
# visible here; comments below describe only what the visible code shows.
1455 class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
1456 IE_DESC = u'YouTube.com playlists'
# Verbose (re.VERBOSE) pattern: matches course/view_play_list/my_playlists/
# artist/playlist/watch URLs carrying a p=/a=/list= query argument, or a bare
# playlist id with a PL/EC/UU/FL/RD prefix.
1457 _VALID_URL = r"""(?:
1462 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1463 \? (?:.*?&)*? (?:p|a|list)=
1466 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
1469 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
# Page template: %s = playlist id, %s = 1-based page number.
1471 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
# Presence of this marker in a page means there is a "next" page to fetch.
1472 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
1473 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
1474 IE_NAME = u'youtube:playlist'
1477 def suitable(cls, url):
1478 """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required here because _VALID_URL is written in verbose style.
1479 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1481 def _real_initialize(self):
1484 def _ids_to_results(self, ids):
# Wrap each raw 11-character video id in a url_result handled by the
# Youtube video extractor.
1485 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1488 def _extract_mix(self, playlist_id):
1489 # The mixes are generated from a single video
1490 # the id of the playlist is just 'RD' + video_id
1491 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
1492 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
# The title markup differs between page layouts; try the long-title span
# first and fall back to the plain title span.
1493 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1494 get_element_by_attribute('class', 'title ', webpage))
1495 title = clean_html(title_span)
1496 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
# orderedSet deduplicates while preserving first-occurrence order.
1497 ids = orderedSet(re.findall(video_re, webpage))
1498 url_results = self._ids_to_results(ids)
1500 return self.playlist_result(url_results, playlist_id, title)
1502 def _real_extract(self, url):
1503 # Extract playlist id
1504 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1506 raise ExtractorError(u'Invalid URL: %s' % url)
# Group 1 or 2 depending on which _VALID_URL alternative matched.
1507 playlist_id = mobj.group(1) or mobj.group(2)
1509 # Check if it's a video-specific URL
1510 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1511 if 'v' in query_dict:
1512 video_id = query_dict['v'][0]
# With --no-playlist, download only the referenced video, not the list.
1513 if self._downloader.params.get('noplaylist'):
1514 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
1515 return self.url_result(video_id, 'Youtube', video_id=video_id)
1517 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1519 if playlist_id.startswith('RD'):
1520 # Mixes require a custom extraction process
1521 return self._extract_mix(playlist_id)
# Top lists ("TL...") are handled by the separate yttoplist extractor.
1522 if playlist_id.startswith('TL'):
1523 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1524 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
1526 # Extract the video ids from the playlist pages
1529 for page_num in itertools.count(1):
1530 url = self._TEMPLATE_URL % (playlist_id, page_num)
1531 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1532 matches = re.finditer(self._VIDEO_RE, page)
1533 # We remove the duplicates and the link with index 0
1534 # (it's not the first video of the playlist)
1535 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
# Stop paging once the "next"-link marker disappears from the page.
1538 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1541 playlist_title = self._og_search_title(page)
1543 url_results = self._ids_to_results(ids)
1544 return self.playlist_result(url_results, playlist_id, playlist_title)
# Extractor for the "yttoplist:{channel}:{list title}" pseudo-URL scheme:
# finds the matching top-list playlist link on the channel page, then scrapes
# the video ids from the resolved playlist page.
# NOTE(review): some original source lines are absent from this excerpt
# (non-contiguous embedded numbering), e.g. the retry guard/exit conditions of
# the loop below are not fully visible.
1547 class YoutubeTopListIE(YoutubePlaylistIE):
1548 IE_NAME = u'youtube:toplist'
1549 IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
1550 u' (Example: "yttoplist:music:Top Tracks")')
1551 _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
1553 def _real_extract(self, url):
1554 mobj = re.match(self._VALID_URL, url)
1555 channel = mobj.group('chann')
1556 title = mobj.group('title')
# URL-encode the title so it can be located inside an href attribute.
1557 query = compat_urllib_parse.urlencode({'title': title})
1558 playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
1559 channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
1560 link = self._html_search_regex(playlist_re, channel_page, u'list')
# The scraped link is relative; resolve it against the site root.
1561 url = compat_urlparse.urljoin('https://www.youtube.com/', link)
1563 video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
1565 # sometimes the webpage doesn't contain the videos
1566 # retry until we get them
1567 for i in itertools.count(0):
1568 msg = u'Downloading Youtube mix'
# On retries, make the progress note reflect the attempt number.
1570 msg += ', retry #%d' % i
1571 webpage = self._download_webpage(url, title, msg)
1572 ids = orderedSet(re.findall(video_re, webpage))
1575 url_results = self._ids_to_results(ids)
1576 return self.playlist_result(url_results, playlist_title=title)
# Extractor for YouTube channel pages (/channel/<id>): collects all video ids
# from the channel, either from the single videos page (autogenerated
# channels) or by paging through the c4_browse_ajax JSON endpoint.
# NOTE(review): this excerpt is missing several original source lines
# (non-contiguous embedded numbering); loop-exit and initialization details
# are only partially visible.
1579 class YoutubeChannelIE(InfoExtractor):
1580 IE_DESC = u'YouTube.com channels'
1581 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
# Marker whose presence in the ajax widget HTML means more pages exist.
1582 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
# Pagination endpoint: %s = page number, %s = channel id.
1583 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1584 IE_NAME = u'youtube:channel'
1586 def extract_videos_from_page(self, page):
# Collect video ids from watch links, skipping duplicates while keeping
# first-occurrence order.
1588 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1589 if mobj.group(1) not in ids_in_page:
1590 ids_in_page.append(mobj.group(1))
1593 def _real_extract(self, url):
1594 # Extract channel id
1595 mobj = re.match(self._VALID_URL, url)
1597 raise ExtractorError(u'Invalid URL: %s' % url)
1599 # Download channel page
1600 channel_id = mobj.group(1)
1602 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1603 channel_page = self._download_webpage(url, channel_id)
# Autogenerated channels are detected via dedicated CSS-class markers.
1604 autogenerated = re.search(r'''(?x)
1606 channel-header-autogenerated-label|
1607 yt-channel-title-autogenerated
1608 )[^"]*"''', channel_page) is not None
1611 # The videos are contained in a single page
1612 # the ajax pages can't be used, they are empty
1613 video_ids = self.extract_videos_from_page(channel_page)
1615 # Download all channel pages using the json-based channel_ajax query
1616 for pagenum in itertools.count(1):
1617 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1618 page = self._download_webpage(url, channel_id,
1619 u'Downloading page #%s' % pagenum)
# The ajax endpoint returns JSON with 'content_html' (video list markup)
# and 'load_more_widget_html' (pagination widget markup).
1621 page = json.loads(page)
1623 ids_in_page = self.extract_videos_from_page(page['content_html'])
1624 video_ids.extend(ids_in_page)
# Stop when the load-more marker disappears from the pagination widget.
1626 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1629 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1631 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1632 for video_id in video_ids]
1633 return self.playlist_result(url_entries, channel_id)
# Extractor for a YouTube user's uploads (user URL or "ytuser:" keyword):
# pages through the GData API feed 50 entries at a time and returns a
# playlist of url_results.
# NOTE(review): several original source lines are absent from this excerpt
# (non-contiguous embedded numbering), e.g. list initializers and break
# statements in the paging loop are not visible.
1636 class YoutubeUserIE(InfoExtractor):
1637 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
# Negative lookaheads keep this from swallowing attribution_link/watch/feed
# URLs that belong to other, more specific extractors.
1638 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
1639 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1640 _GDATA_PAGE_SIZE = 50
# GData feed: %s = username, %d = page size, %d = 1-based start index.
1641 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
1642 IE_NAME = u'youtube:user'
1645 def suitable(cls, url):
1646 # Don't return True if the url can be extracted with another youtube
1647 # extractor; this regex is too permissive and it would match.
# globals() scan: defer to any other *IE class that claims the URL.
1648 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1649 if any(ie.suitable(url) for ie in other_ies): return False
1650 else: return super(YoutubeUserIE, cls).suitable(url)
1652 def _real_extract(self, url):
1654 mobj = re.match(self._VALID_URL, url)
1656 raise ExtractorError(u'Invalid URL: %s' % url)
1658 username = mobj.group(1)
1660 # Download video ids using YouTube Data API. Result size per
1661 # query is limited (currently to 50 videos) so we need to query
1662 # page by page until there are no video ids - it means we got
1667 for pagenum in itertools.count(0):
# GData start-index is 1-based.
1668 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1670 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1671 page = self._download_webpage(gdata_url, username,
1672 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1675 response = json.loads(page)
1676 except ValueError as err:
1677 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
# No 'entry' key means an empty feed page: past the last video.
1678 if 'entry' not in response['feed']:
1679 # Number of videos is a multiple of self._MAX_RESULTS
1682 # Extract video identifiers
# Each entry id is a URL; the video id is its last path component.
1684 for entry in response['feed']['entry']:
1685 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1686 video_ids.extend(ids_in_page)
1688 # A little optimization - if current page is not
1689 # "full", ie. does not contain PAGE_SIZE video ids then
1690 # we can assume that this page is the last one - there
1691 # are no more ids on further pages - no need to query
1694 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1698 self.url_result(video_id, 'Youtube', video_id=video_id)
1699 for video_id in video_ids]
1700 return self.playlist_result(url_results, playlist_title=username)
# Search extractor for the "ytsearch" keyword: queries the GData search API
# in pages of 50 and returns up to n results as a playlist.
# NOTE(review): some original source lines are absent from this excerpt
# (non-contiguous embedded numbering), e.g. the initialization of the
# accumulator variables used by the while-loop below is not visible.
1703 class YoutubeSearchIE(SearchInfoExtractor):
1704 IE_DESC = u'YouTube.com searches'
# Search API: %s = quoted query, %i = 1-based start index; 50 per page.
1705 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1707 IE_NAME = u'youtube:search'
1708 _SEARCH_KEY = 'ytsearch'
1710 def _get_n_results(self, query, n):
1711 """Get a specified number of results for a query"""
1717 while (50 * pagenum) < limit:
1718 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1719 data_json = self._download_webpage(
1720 result_url, video_id=u'query "%s"' % query,
1721 note=u'Downloading page %s' % (pagenum + 1),
1722 errnote=u'Unable to download API page')
1723 data = json.loads(data_json)
1724 api_response = data['data']
1726 if 'items' not in api_response:
1727 raise ExtractorError(u'[youtube] No video results')
1729 new_ids = list(video['id'] for video in api_response['items'])
1730 video_ids += new_ids
# Never ask for more than the API says is available.
1732 limit = min(n, api_response['totalItems'])
# Trim any overshoot from the last (full) page.
1735 if len(video_ids) > n:
1736 video_ids = video_ids[:n]
1737 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1738 for video_id in video_ids]
1739 return self.playlist_result(videos, query)
# Variant of YoutubeSearchIE ("ytsearchdate") that orders results by upload
# date (newest first) via the API's orderby=published parameter; all
# extraction logic is inherited.
1741 class YoutubeSearchDateIE(YoutubeSearchIE):
1742 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
1743 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1744 _SEARCH_KEY = 'ytsearchdate'
1745 IE_DESC = u'YouTube.com searches, newest videos first'
# Extractor for YouTube show pages: each season of a show is a separate
# playlist link on the show page; returns one YoutubePlaylist url_result per
# season (as a plain list rather than a playlist_result).
1747 class YoutubeShowIE(InfoExtractor):
1748 IE_DESC = u'YouTube.com (multi-season) shows'
1749 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1750 IE_NAME = u'youtube:show'
1752 def _real_extract(self, url):
1753 mobj = re.match(self._VALID_URL, url)
1754 show_name = mobj.group(1)
1755 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1756 # There's one playlist for each season of the show
1757 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1758 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
# Season hrefs are site-relative; prefix the host to form absolute URLs.
1759 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
# NOTE(review): some original source lines are absent from this excerpt
# (non-contiguous embedded numbering), e.g. the initialization of 'paging'
# and 'feed_entries' before the loop is not visible.
1762 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1764 Base class for extractors that fetch info from
1765 http://www.youtube.com/feed_ajax
1766 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
# Feeds are account-specific, so logging in is mandatory (see base class).
1768 _LOGIN_REQUIRED = True
1769 # use action_load_personal_feed instead of action_load_system_feed
1770 _PERSONAL_FEED = False
1773 def _FEED_TEMPLATE(self):
# Build the ajax URL; '%%s' survives the formatting as a '%s' paging slot
# filled in later by _real_extract.
1774 action = 'action_load_system_feed'
1775 if self._PERSONAL_FEED:
1776 action = 'action_load_personal_feed'
1777 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
1781 return u'youtube:%s' % self._FEED_NAME
1783 def _real_initialize(self):
1786 def _real_extract(self, url):
1789 for i in itertools.count(1):
1790 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1791 u'%s feed' % self._FEED_NAME,
1792 u'Downloading page %s' % i)
# The endpoint returns JSON with 'feed_html' (markup containing the watch
# links) and 'paging' (token for the next page, or None at the end).
1793 info = json.loads(info)
1794 feed_html = info['feed_html']
1795 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
# orderedSet deduplicates ids while preserving first-occurrence order.
1796 ids = orderedSet(m.group(1) for m in m_ids)
1797 feed_entries.extend(
1798 self.url_result(video_id, 'Youtube', video_id=video_id)
1799 for video_id in ids)
1800 if info['paging'] is None:
1802 paging = info['paging']
1803 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
# Subscriptions feed ("ytsubs"); uses the system feed endpoint, so
# _PERSONAL_FEED stays at the inherited default of False.
1805 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1806 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1807 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1808 _FEED_NAME = 'subscriptions'
1809 _PLAYLIST_TITLE = u'Youtube Subscriptions'
# Recommended-videos feed ("ytrec"); system feed, so _PERSONAL_FEED stays at
# the inherited default of False.
1811 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1812 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1813 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1814 _FEED_NAME = 'recommended'
1815 _PLAYLIST_TITLE = u'Youtube Recommended videos'
# Watch-later list ("ytwatchlater"); _PERSONAL_FEED = True selects the
# action_load_personal_feed endpoint in the base class.
1817 class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1818 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1819 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1820 _FEED_NAME = 'watch_later'
1821 _PLAYLIST_TITLE = u'Youtube Watch Later'
1822 _PERSONAL_FEED = True
# Watch-history feed ("ythistory"); _PERSONAL_FEED = True selects the
# action_load_personal_feed endpoint in the base class.
# NOTE(review): _VALID_URL is a u'' literal here while sibling classes use
# r''; it happens to work because '\.' has no escape meaning in a plain
# string, but consider normalizing to a raw string for consistency.
1824 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1825 IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
1826 _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory'
1827 _FEED_NAME = 'history'
1828 _PERSONAL_FEED = True
1829 _PLAYLIST_TITLE = u'Youtube Watch History'
# Favourites ("ytfav"): scrapes the favourites playlist id from the logged-in
# my_favorites page and delegates extraction to YoutubePlaylistIE.
1831 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1832 IE_NAME = u'youtube:favorites'
1833 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
1834 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
# Favourites are account-specific, so logging in is mandatory (see base class).
1835 _LOGIN_REQUIRED = True
1837 def _real_extract(self, url):
1838 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
# The playlist id appears as a list= parameter somewhere in the page.
1839 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1840 return self.url_result(playlist_id, 'YoutubePlaylist')
1843 class YoutubeTruncatedURLIE(InfoExtractor):
1844 IE_NAME = 'youtube:truncated_url'
1845 IE_DESC = False # Do not list
1846 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1848 def _real_extract(self, url):
1849 raise ExtractorError(
1850 u'Did you forget to quote the URL? Remember that & is a meta '
1851 u'character in most shells, so you want to put the URL in quotes, '
1853 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1854 u' (or simply youtube-dl BaW_jenozKc ).',