16 from .common import InfoExtractor, SearchInfoExtractor
17 from .subtitles import SubtitlesInfoExtractor
24 compat_urllib_request,
31 get_element_by_attribute,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # NOTE(review): several lines of this class are elided in this excerpt;
    # comments below only describe the code that is visible here.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name used to look up credentials in the user's .netrc file.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        # Hit the language-selection URL so the right cookies get set;
        # failure is only a warning, not fatal.
        request = compat_urllib_request.Request(self._LANG_URL)
        # (elided in excerpt: try: / report_lang call)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

    # (elided in excerpt: def _login(self): header and parts of its body)
        (username, password) = self._get_login_info()
        # No authentication to be performed
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Anti-CSRF token scraped from the login page, required by the form below.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # (elided in excerpt: start of the login_form_strs dict literal)
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the response still contains the login form, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _confirm_age(self):
        # POST the age confirmation form; unlike login, failure here is fatal.
        # (elided in excerpt: start of the age_form dict literal)
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_initialize(self):
        # Each step aborts initialization on failure (bodies elided in excerpt).
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (?x) regex matching the known YouTube video URL shapes and
    # capturing the 11-character video ID (some alternatives elided in excerpt).
    _VALID_URL = r"""(?x)^
                     (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                     (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                     |youtu\.be/ # just youtu.be/xxxx
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
    # Pattern for the next_url= redirect parameter (age gate and friends).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags re-ordered so that free (WebM) formats are preferred.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                     # Apple HTTP Live Streaming
                                     '96', '95', '94', '93', '92', '132', '151',
                                     '85', '102', '84', '101', '83', '100', '82',
                                     '138', '248', '137', '247', '136', '246', '245',
                                     '244', '135', '243', '134', '242', '133', '160',
                                     '172', '141', '171', '140', '139',
    # Maps container name -> itags in that container, best quality first.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    _video_extensions = {
        # Apple HTTP Live Streaming
    _video_dimensions = {
    # (elided in excerpt: the itag->extension / itag->dimensions tables and
    # the start of the _TESTS list of self-test fixtures)
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
            u"uploader": u"Philipp Hagemeister",
            u"uploader_id": u"phihag",
            u"upload_date": u"20121002",
            u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"upload_date": u"20120506",
            u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
            u"description": u"md5:5b292926389560516e384ac437c0ec07",
            u"uploader": u"Icona Pop",
            u"uploader_id": u"IconaPop"
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"upload_date": u"20130703",
            u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
            u"description": u"md5:64249768eec3bc4276236606ea996373",
            u"uploader": u"justintimberlakeVEVO",
            u"uploader_id": u"justintimberlakeVEVO"
            u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
            u"file": u"yZIXLfi8CZQ.mp4",
            u"note": u"Embed-only video (#1746)",
            u"upload_date": u"20120608",
            u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
            u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
            u"uploader": u"SET India",
            u"uploader_id": u"setindia"
382 def suitable(cls, url):
383 """Receives a URL and returns True if suitable for this IE."""
384 if YoutubePlaylistIE.suitable(url): return False
385 return re.match(cls._VALID_URL, url) is not None
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Cache of extracted signature-decryption functions, keyed by
        # (player_url, signature_length); filled lazily by _decrypt_signature().
        self._player_cache = {}
391 def report_video_info_webpage_download(self, video_id):
392 """Report attempt to download video info webpage."""
393 self.to_screen(u'%s: Downloading video info webpage' % video_id)
395 def report_information_extraction(self, video_id):
396 """Report attempt to extract video information."""
397 self.to_screen(u'%s: Extracting video information' % video_id)
399 def report_unavailable_format(self, video_id, format):
400 """Report extracted video URL."""
401 self.to_screen(u'%s: Format %s not available' % (video_id, format))
403 def report_rtmp_download(self):
404 """Indicate the download will use the RTMP protocol."""
405 self.to_screen(u'RTMP download detected')
    def _extract_signature_function(self, video_id, player_url, slen):
        """Build a signature-decryption function for the given player.

        Tries the on-disk JSON cache first; otherwise downloads and parses
        the JS or SWF player and (when caching is enabled) stores the
        resulting permutation spec. (Several lines elided in this excerpt.)
        """
        # Player filename encodes an ID and an extension (js or swf).
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # Guard against path traversal via a hostile player filename.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                # Cached spec is just an index permutation of the signature.
                return lambda s: u''.join(s[i] for i in cache_spec)
                pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_swf(code)
            assert False, 'Invalid player type %r' % player_type

            # Run the extracted function once on a known string to record
            # the index permutation it applies, then persist that spec.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
            write_json_file(cache_spec, cache_fn)
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))
    def _print_sig_code(self, func, slen):
        """Print Python source equivalent to the extracted signature function.

        Runs *func* on a probe string, records the index permutation and
        emits it as compact slice expressions. (Lines elided in excerpt.)
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end:step], omitting redundant parts.
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                    # Emit the run collected so far as one slice.
                    yield _genslice(start, prev, step)
                if i - prev in [-1, 1]:
                yield u's[%d]' % prev
            yield _genslice(start, i, step)

        # Apply func to a probe string and recover the permutation it encodes.
        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
    def _parse_sig_js(self, jscode):
        """Extract the signature function from JS player code.

        Implements a tiny interpreter for the restricted JavaScript subset
        YouTube uses for signature scrambling. Returns a callable mapping a
        signature string to its deciphered form. (Lines elided in excerpt.)
        """
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

            # Map a single-letter variable name to an index (a=0, b=1, ...).
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            # Recursion guard against pathological/hostile player code.
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            # Assignment: out or out[index] = expr
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
                if ass_m.groupdict().get('index'):
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                    expr = ass_m.group('expr')
                        local_vars[ass_m.group('out')] = val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                expr = stmt[len(u'return '):]
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)

        def interpret_expression(expr, local_vars, allow_recursion):
                return local_vars[expr]

            # Member access: var.member (split/join/length/reverse/slice).
            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                if member == 'join("")':
                if member == 'length':
                if member == 'reverse()':
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)

            # Indexing: var[idx]
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,

            # Binary modulo: a % b
            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)

            # Function call: f(arg1,arg2,...) -- extract callee lazily.
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Locate "function NAME(args){body}" in the player source.
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            argnames = func_m.group('args').split(',')
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)

        initial_function = extract_function(funcname)
        return lambda s: initial_function([s])
    def _parse_sig_swf(self, file_contents):
        """Extract the signature function from a Flash (SWF) player.

        Parses the SWF container, locates the DoABC tag, decodes enough of
        the AVM2 (ActionScript 3) bytecode format to find the
        SignatureDecipher class, and interprets its 'decipher' method with a
        minimal stack-machine emulator. Returns a callable mapping a
        signature string to its deciphered form. (Lines elided in excerpt.)
        """
        # SWF magic is FWS (uncompressed) or CWS (zlib-compressed).
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
            raise NotImplementedError(u'Unsupported compression format %r' %

        def extract_tags(content):
            # Generator over (tag_code, tag_body) pairs of the SWF body.
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                # A 6-bit length of 0x3f means a 32-bit length follows.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])

            for tag_code, tag in extract_tags(content)
        # Skip flags + NUL-terminated name to reach the ABC payload.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length u32: 7 data bits per byte, MSB = continuation.
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)

        def u30(reader=None):
            # u30: same encoding as u32 but top two bits must be clear.
            res = read_int(reader)
            assert res & 0xf0000000 == 0

        def s32(reader=None):
            # Sign-extend the variable-length integer.
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            resb = reader.read(count)
            assert len(resb) == count

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]

        # minor_version + major_version
        # Constant pool: ints, uints, doubles, strings, namespaces, ns sets.
        for _c in range(1, int_count):
        for _c in range(1, uint_count):
        read_bytes((double_count-1) * 8)
        constant_strings = [u'']
        for _c in range(1, string_count):
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
        for _c in range(1, ns_set_count):
            for _c2 in range(count):
        multiname_count = u30()
            # Number of extra u30 fields per multiname kind (skipped below).
            0x0e: 2, # MultinameA
            0x1b: 1, # MultinameL
            0x1c: 1, # MultinameLA
        for _c in range(1, multiname_count):
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
                u30() # namespace_idx
                multinames.append(constant_strings[name_idx])
                # Kinds we don't resolve get a placeholder name.
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):

        # Method signatures: only NEED_ARGUMENTS/NEED_REST flags are kept.
        MethodInfo = collections.namedtuple(
            ['NEED_ARGUMENTS', 'NEED_REST'])
        for method_id in range(method_count):
            for _ in range(param_count):
            u30() # name index (always 0 for youtube)
            if flags & 0x08 != 0:
                # Options present (skipped)
                for c in range(option_count):
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata (skipped entirely).
        metadata_count = u30()
        for _c in range(metadata_count):
            for _c2 in range(item_count):

        def parse_traits_info():
            # Parse one trait; returns {trait name -> method index} entries.
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            if kind in [0x00, 0x06]: # Slot or Const
                u30() # type_name_idx
            elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04: # Class
            elif kind == 0x05: # Function
                methods[function_idx] = multinames[trait_name_idx]
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0: # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30() # metadata index

        # Classes: find the instance whose name is SignatureDecipher.
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        for class_id in range(class_count):
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30() # super_name idx
            if flags & 0x08 != 0: # Protected namespace is present
                u30() # protected_ns_idx
            for _c2 in range(intrf_count):
            for _c2 in range(trait_count):

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %

        # Second pass: collect the target class's method names and indices.
        for class_id in range(class_count):
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        for name, idx in trait_methods.items()))

        # Scripts (skipped).
        for _c in range(script_count):
            for _c2 in range(trait_count):

        # Method bodies: keep bytecode for the methods we care about.
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            u30() # init_scope_depth
            u30() # max_scope_depth
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
            for _c2 in range(trait_count):

        # Sanity: consumed the whole tag and found all referenced methods.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Memoized translation of an AVM2 method into a Python callable.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

                # Register 0 is "this"; the rest are args + locals.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                coder = io.BytesIO(m.code)
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36: # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                    elif opcode == 44: # pushstring
                        stack.append(constant_strings[idx])
                    elif opcode == 48: # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                    elif opcode == 70: # callproperty
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                                res = obj.split(args[0])
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                    elif opcode == 72: # returnvalue
                    elif opcode == 79: # callpropvoid
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                    elif opcode == 93: # findpropstrict
                        mname = multinames[index]
                        res = extract_function(mname)
                    elif opcode == 97: # setproperty
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                    elif opcode == 98: # getlocal
                        stack.append(registers[index])
                    elif opcode == 99: # setlocal
                        registers[index] = value
                    elif opcode == 102: # getproperty
                        pname = multinames[index]
                        if pname == u'length':
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else: # Assume attribute access
                            assert isinstance(idx, int)
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128: # coerce
                    elif opcode == 133: # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164: # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                    elif opcode == 208: # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209: # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210: # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211: # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214: # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215: # setlocal_3
                        registers[3] = stack.pop()
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc

        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature

        Tries automatic extraction from the player (cached per
        (player_url, len(s))); on any failure falls back to the static,
        hand-maintained algorithms. (Several lines elided in this excerpt.)
        """
        if player_url is not None:
            # Normalize protocol-relative player URLs.
            if player_url.startswith(u'//'):
                player_url = u'https:' + player_url
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

            self._downloader.report_warning(
                u'Warning: Falling back to static signature algorithm')

        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        """Hand-maintained fallback scrambles, dispatched on len(s).

        NOTE(review): the `if age_gate:` / `if len(s) == NN:` dispatch lines
        are elided in this excerpt; only the permutation bodies are visible.
        """
            # The videos with age protection use another player, so the
            # algorithms can be different.
                return s[2:63] + s[82] + s[64:82] + s[63]

            return s[86:29:-1] + s[88] + s[28:5:-1]
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[84:27:-1] + s[86] + s[26:5:-1]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        # Unknown signature length: no static algorithm available.
        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _get_available_subtitles(self, video_id, webpage):
        """Return {lang_code: timedtext URL} for the video's manual subtitles.

        Returns an empty warning path when the list cannot be downloaded or
        the video has no subtitles. (Several lines elided in this excerpt.)
        """
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        # Each <track> advertises a name attribute and a lang_code attribute.
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': l[0].encode('utf-8'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
        return sub_lang_list
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns {lang_code: caption URL} for ASR (auto-generated) captions,
        translated into every target language YouTube offers.
        (Several lines elided in this excerpt.)
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL lives inside the embedded player config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The 'asr' kind marks the automatically recognized track.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available target language.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _print_formats(self, formats):
        """Print an itag/extension/dimensions table for --list-formats.

        (The loop header over *formats* is elided in this excerpt.)
        """
        print('Available formats:')
            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                        self._video_dimensions.get(x, '???'),
                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
    def _extract_id(self, url):
        """Extract the 11-character video ID from *url* using _VALID_URL.

        Raises ExtractorError on non-matching URLs. (Error-branch and return
        lines are elided in this excerpt.)
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 is the ([0-9A-Za-z_-]{11}) video-ID group of _VALID_URL.
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Quality-ordered itag list; WebM-first variant when the user prefers
        # free formats.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Cap quality at format_limit by cutting the ordered list there.
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                # Container-name request: try its itags in quality order.
                if rf in self._video_formats_map:
                    for srf in self._video_formats_map[rf]:
                            video_url_list = [(srf, url_map[srf])]
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Build {itag: url} from an HLS (m3u8) manifest.

        (Several lines elided in this excerpt.)
        """
        def _get_urls(_manifest):
            # Media URLs are the non-comment, non-empty lines of the playlist.
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # Each variant URL embeds its itag as an itag/NN/ path component.
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
1245 def _extract_annotations(self, video_id):
1246 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1247 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
    def _real_extract(self, url):
        """Extract metadata and format URLs for a single YouTube video.

        NOTE(review): this method continues beyond the end of this excerpt;
        several interior lines are elided as well.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Normalize to the canonical watch URL before downloading the page.
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            # Try several el= variants until one response carries a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader (required)
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (optional, scraped from the page)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title (optional with warning)
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
            self._downloader.report_warning(u'Unable to extract video title')

        # thumbnail
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, normalized via unified_strdate
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description: replace redirect links with their plain titles
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
            # Fall back to the meta description tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        def _extract_count(klass):
            # Pull a comma-grouped integer out of a span with the given class.
            count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False)
            if count is not None:
                return int(count.replace(',', ''))
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')
1384 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1386 if self._downloader.params.get('listsubtitles', False):
1387 self._list_available_subtitles(video_id, video_webpage)
1390 if 'length_seconds' not in video_info:
1391 self._downloader.report_warning(u'unable to extract video duration')
1394 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1397 video_annotations = None
1398 if self._downloader.params.get('writeannotations', False):
1399 video_annotations = self._extract_annotations(video_id)
1401 # Decide which formats to download
1404 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1406 raise ValueError('Could not find vevo ID')
1407 info = json.loads(mobj.group(1))
1409 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1410 # this signatures are encrypted
1411 if 'url_encoded_fmt_stream_map' not in args:
1412 raise ValueError(u'No stream_map present') # caught below
1413 re_signature = re.compile(r'[&,]s=')
1414 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1416 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1417 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1418 m_s = re_signature.search(args.get('adaptive_fmts', u''))
1420 if 'adaptive_fmts' in video_info:
1421 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1423 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1427 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1428 self.report_rtmp_download()
1429 video_url_list = [(None, video_info['conn'][0])]
1430 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1431 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1432 if 'rtmpe%3Dyes' in encoded_url_map:
1433 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1435 for url_data_str in encoded_url_map.split(','):
1436 url_data = compat_parse_qs(url_data_str)
1437 if 'itag' in url_data and 'url' in url_data:
1438 url = url_data['url'][0]
1439 if 'sig' in url_data:
1440 url += '&signature=' + url_data['sig'][0]
1441 elif 's' in url_data:
1442 encrypted_sig = url_data['s'][0]
1443 if self._downloader.params.get('verbose'):
1445 if player_url is None:
1446 player_version = 'unknown'
1448 player_version = self._search_regex(
1449 r'-(.+)\.swf$', player_url,
1450 u'flash player', fatal=False)
1451 player_desc = 'flash player %s' % player_version
1453 player_version = self._search_regex(
1454 r'html5player-(.+?)\.js', video_webpage,
1455 'html5 player', fatal=False)
1456 player_desc = u'html5 player %s' % player_version
1458 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1459 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1460 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1463 jsplayer_url_json = self._search_regex(
1464 r'"assets":.+?"js":\s*("[^"]+")',
1465 video_webpage, u'JS player URL')
1466 player_url = json.loads(jsplayer_url_json)
1468 signature = self._decrypt_signature(
1469 encrypted_sig, video_id, player_url, age_gate)
1470 url += '&signature=' + signature
1471 if 'ratebypass' not in url:
1472 url += '&ratebypass=yes'
1473 url_map[url_data['itag'][0]] = url
1474 video_url_list = self._get_video_url_list(url_map)
1475 if not video_url_list:
1477 elif video_info.get('hlsvp'):
1478 manifest_url = video_info['hlsvp'][0]
1479 url_map = self._extract_from_m3u8(manifest_url, video_id)
1480 video_url_list = self._get_video_url_list(url_map)
1481 if not video_url_list:
1485 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1488 for itag, video_real_url in video_url_list:
1490 video_extension = self._video_extensions.get(itag, 'flv')
1492 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1493 self._video_dimensions.get(itag, '???'),
1494 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
1498 'url': video_real_url,
1499 'uploader': video_uploader,
1500 'uploader_id': video_uploader_id,
1501 'upload_date': upload_date,
1502 'title': video_title,
1503 'ext': video_extension,
1504 'format': video_format,
1506 'thumbnail': video_thumbnail,
1507 'description': video_description,
1508 'player_url': player_url,
1509 'subtitles': video_subtitles,
1510 'duration': video_duration,
1511 'age_limit': 18 if age_gate else 0,
1512 'annotations': video_annotations,
1513 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1514 'view_count': view_count,
1515 'like_count': like_count,
1516 'dislike_count': dislike_count,
1520 class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
1521 IE_DESC = u'YouTube.com playlists'
1522 _VALID_URL = r"""(?:
1527 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1528 \? (?:.*?&)*? (?:p|a|list)=
1531 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
1534 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
1536 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
1537 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
1538 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
1539 IE_NAME = u'youtube:playlist'
1542 def suitable(cls, url):
1543 """Receives a URL and returns True if suitable for this IE."""
1544 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1546 def _real_initialize(self):
1549 def _ids_to_results(self, ids):
1550 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1553 def _extract_mix(self, playlist_id):
1554 # The mixes are generated from a a single video
1555 # the id of the playlist is just 'RD' + video_id
1556 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id)
1557 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
1558 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1559 get_element_by_attribute('class', 'title ', webpage))
1560 title = clean_html(title_span)
1561 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
1562 ids = orderedSet(re.findall(video_re, webpage))
1563 url_results = self._ids_to_results(ids)
1565 return self.playlist_result(url_results, playlist_id, title)
1567 def _real_extract(self, url):
1568 # Extract playlist id
1569 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1571 raise ExtractorError(u'Invalid URL: %s' % url)
1572 playlist_id = mobj.group(1) or mobj.group(2)
1574 # Check if it's a video-specific URL
1575 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1576 if 'v' in query_dict:
1577 video_id = query_dict['v'][0]
1578 if self._downloader.params.get('noplaylist'):
1579 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
1580 return self.url_result(video_id, 'Youtube', video_id=video_id)
1582 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1584 if len(playlist_id) == 13: # 'RD' + 11 characters for the video id
1585 # Mixes require a custom extraction process
1586 return self._extract_mix(playlist_id)
1588 # Extract the video ids from the playlist pages
1591 for page_num in itertools.count(1):
1592 url = self._TEMPLATE_URL % (playlist_id, page_num)
1593 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1594 matches = re.finditer(self._VIDEO_RE, page)
1595 # We remove the duplicates and the link with index 0
1596 # (it's not the first video of the playlist)
1597 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
1600 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1603 playlist_title = self._og_search_title(page)
1605 url_results = self._ids_to_results(ids)
1606 return self.playlist_result(url_results, playlist_id, playlist_title)
1609 class YoutubeChannelIE(InfoExtractor):
1610 IE_DESC = u'YouTube.com channels'
1611 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1612 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1613 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1614 IE_NAME = u'youtube:channel'
1616 def extract_videos_from_page(self, page):
1618 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1619 if mobj.group(1) not in ids_in_page:
1620 ids_in_page.append(mobj.group(1))
1623 def _real_extract(self, url):
1624 # Extract channel id
1625 mobj = re.match(self._VALID_URL, url)
1627 raise ExtractorError(u'Invalid URL: %s' % url)
1629 # Download channel page
1630 channel_id = mobj.group(1)
1632 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1633 channel_page = self._download_webpage(url, channel_id)
1634 if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
1635 autogenerated = True
1637 autogenerated = False
1640 # The videos are contained in a single page
1641 # the ajax pages can't be used, they are empty
1642 video_ids = self.extract_videos_from_page(channel_page)
1644 # Download all channel pages using the json-based channel_ajax query
1645 for pagenum in itertools.count(1):
1646 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1647 page = self._download_webpage(url, channel_id,
1648 u'Downloading page #%s' % pagenum)
1650 page = json.loads(page)
1652 ids_in_page = self.extract_videos_from_page(page['content_html'])
1653 video_ids.extend(ids_in_page)
1655 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1658 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1660 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1661 for video_id in video_ids]
1662 return self.playlist_result(url_entries, channel_id)
1665 class YoutubeUserIE(InfoExtractor):
1666 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
1667 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
1668 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1669 _GDATA_PAGE_SIZE = 50
1670 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
1671 IE_NAME = u'youtube:user'
1674 def suitable(cls, url):
1675 # Don't return True if the url can be extracted with other youtube
1676 # extractor, the regex would is too permissive and it would match.
1677 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1678 if any(ie.suitable(url) for ie in other_ies): return False
1679 else: return super(YoutubeUserIE, cls).suitable(url)
1681 def _real_extract(self, url):
1683 mobj = re.match(self._VALID_URL, url)
1685 raise ExtractorError(u'Invalid URL: %s' % url)
1687 username = mobj.group(1)
1689 # Download video ids using YouTube Data API. Result size per
1690 # query is limited (currently to 50 videos) so we need to query
1691 # page by page until there are no video ids - it means we got
1696 for pagenum in itertools.count(0):
1697 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1699 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1700 page = self._download_webpage(gdata_url, username,
1701 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1704 response = json.loads(page)
1705 except ValueError as err:
1706 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1707 if 'entry' not in response['feed']:
1708 # Number of videos is a multiple of self._MAX_RESULTS
1711 # Extract video identifiers
1713 for entry in response['feed']['entry']:
1714 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1715 video_ids.extend(ids_in_page)
1717 # A little optimization - if current page is not
1718 # "full", ie. does not contain PAGE_SIZE video ids then
1719 # we can assume that this page is the last one - there
1720 # are no more ids on further pages - no need to query
1723 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1727 self.url_result(video_id, 'Youtube', video_id=video_id)
1728 for video_id in video_ids]
1729 return self.playlist_result(url_results, playlist_title=username)
1732 class YoutubeSearchIE(SearchInfoExtractor):
1733 IE_DESC = u'YouTube.com searches'
1734 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1736 IE_NAME = u'youtube:search'
1737 _SEARCH_KEY = 'ytsearch'
1739 def report_download_page(self, query, pagenum):
1740 """Report attempt to download search page with given number."""
1741 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1743 def _get_n_results(self, query, n):
1744 """Get a specified number of results for a query"""
1750 while (50 * pagenum) < limit:
1751 self.report_download_page(query, pagenum+1)
1752 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1753 request = compat_urllib_request.Request(result_url)
1755 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1756 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1757 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1758 api_response = json.loads(data)['data']
1760 if not 'items' in api_response:
1761 raise ExtractorError(u'[youtube] No video results')
1763 new_ids = list(video['id'] for video in api_response['items'])
1764 video_ids += new_ids
1766 limit = min(n, api_response['totalItems'])
1769 if len(video_ids) > n:
1770 video_ids = video_ids[:n]
1771 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1772 for video_id in video_ids]
1773 return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor variant that returns the newest uploads first."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Same gdata endpoint as the parent class, with orderby=published added.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube shows: resolves a show page to its season playlists."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as a separate playlist.
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(seasons)))
        results = []
        for season in seasons:
            playlist_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(playlist_url, 'YoutubePlaylist'))
        return results
1796 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1798 Base class for extractors that fetch info from
1799 http://www.youtube.com/feed_ajax
1800 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1802 _LOGIN_REQUIRED = True
1803 # use action_load_personal_feed instead of action_load_system_feed
1804 _PERSONAL_FEED = False
1807 def _FEED_TEMPLATE(self):
1808 action = 'action_load_system_feed'
1809 if self._PERSONAL_FEED:
1810 action = 'action_load_personal_feed'
1811 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
1815 return u'youtube:%s' % self._FEED_NAME
1817 def _real_initialize(self):
1820 def _real_extract(self, url):
1823 for i in itertools.count(1):
1824 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1825 u'%s feed' % self._FEED_NAME,
1826 u'Downloading page %s' % i)
1827 info = json.loads(info)
1828 feed_html = info['feed_html']
1829 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
1830 ids = orderedSet(m.group(1) for m in m_ids)
1831 feed_entries.extend(
1832 self.url_result(video_id, 'Youtube', video_id=video_id)
1833 for video_id in ids)
1834 if info['paging'] is None:
1836 paging = info['paging']
1837 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Subscriptions feed extractor ("ytsubs" keyword); needs authentication."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Recommended-videos feed extractor ("ytrec" keyword); needs authentication."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Watch-later list extractor ("ytwatchlater" keyword); needs authentication."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is tied to the logged-in account, so the personal-feed
    # ajax action must be used.
    _PERSONAL_FEED = True
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Watch-history feed extractor ("ythistory" keyword); needs authentication."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Fix: this was a u'...' literal, unlike every sibling feed extractor's
    # raw-string _VALID_URL. The `\.` sequences only kept their backslash
    # because `\.` is not a recognized escape (a DeprecationWarning on modern
    # Pythons); the raw string yields the identical pattern explicitly.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # History is account-specific, so use the personal-feed ajax action.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the playlist that backs it;
        # extraction itself is delegated to YoutubePlaylistIE.
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1877 class YoutubeTruncatedURLIE(InfoExtractor):
1878 IE_NAME = 'youtube:truncated_url'
1879 IE_DESC = False # Do not list
1880 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1882 def _real_extract(self, url):
1883 raise ExtractorError(
1884 u'Did you forget to quote the URL? Remember that & is a meta '
1885 u'character in most shells, so you want to put the URL in quotes, '
1887 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1888 u' (or simply youtube-dl BaW_jenozKc ).',