16 from .common import InfoExtractor, SearchInfoExtractor
17 from .subtitles import SubtitlesInfoExtractor
24 compat_urllib_request,
# NOTE(review): this excerpt is an elided numbered listing; some original
# lines between the numbered ones are not visible here.
38 class YoutubeBaseInfoExtractor(InfoExtractor):
39 """Provide base functions for Youtube extractors"""
# Google account sign-in endpoint used by _login().
40 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
# URL fetched once to force English/US markup for later scraping.
41 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
# Age-verification confirmation endpoint used by _confirm_age().
42 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Key looked up in ~/.netrc for stored credentials.
43 _NETRC_MACHINE = 'youtube'
44 # If True it will raise an error if no login info is provided
45 _LOGIN_REQUIRED = False
47 def report_lang(self):
48 """Report attempt to set language."""
49 self.to_screen(u'Setting language')
51 def _set_language(self):
# Hit _LANG_URL so YouTube serves English/US pages to later requests.
# NOTE(review): the enclosing try: line is elided from this listing;
# the urlopen call below is presumably inside it — confirm in full file.
52 request = compat_urllib_request.Request(self._LANG_URL)
55 compat_urllib_request.urlopen(request).read()
# Best-effort: network failures only produce a warning, never abort.
56 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
57 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# Interior of _login() — its def line (and several statements) are elided
# from this listing.  Flow: fetch credentials, GET the login page to read
# the GALX token, POST the sign-in form, and warn (not fail) on errors.
62 (username, password) = self._get_login_info()
63 # No authentication to be performed
# If credentials are mandatory for this extractor, missing ones are fatal.
65 if self._LOGIN_REQUIRED:
66 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
# Fetch the login page to extract the anti-forgery GALX value.
69 request = compat_urllib_request.Request(self._LOGIN_URL)
71 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
72 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
73 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
76 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
77 login_page, u'Login GALX parameter')
# Sign-in form fields (several entries elided in this listing).
81 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
85 u'PersistentCookie': u'yes',
87 u'bgresponse': u'js_disabled',
88 u'checkConnection': u'',
89 u'checkedDomains': u'youtube',
94 u'signIn': u'Sign in',
96 u'service': u'youtube',
100 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
102 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
103 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
104 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
107 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the response still contains the sign-in form, the login was rejected.
108 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
109 self._downloader.report_warning(u'unable to log in: bad username or password')
111 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
112 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
116 def _confirm_age(self):
# POST the age-confirmation form; unlike _login/_set_language, a network
# failure here is fatal (raises ExtractorError).
119 'action_confirm': 'Confirm',
121 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
123 self.report_age_confirmation()
124 compat_urllib_request.urlopen(request).read().decode('utf-8')
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
126 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
129 def _real_initialize(self):
# One-time setup: set language, then log in; bail out early if either
# step fails (the intervening return lines are elided in this listing).
130 if self._downloader is None:
132 if not self._set_language():
134 if not self._login():
139 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
140 IE_DESC = u'YouTube.com'
141 _VALID_URL = r"""(?x)^
143 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
144 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
145 tube\.majestyc\.net/|
146 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
147 (?:.*?\#/)? # handle anchor (#/) redirect urls
148 (?: # the various things that can precede the ID:
149 (?:(?:v|embed|e)/) # v/ or embed/ or e/
150 |(?: # or the v= param in all its forms
151 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
152 (?:\?|\#!?) # the params delimiter ? or # or #!
153 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
157 |youtu\.be/ # just youtu.be/xxxx
159 )? # all until now is optional -> you can pass the naked ID
160 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
161 (?(1).+)? # if we found the ID, everything can follow
# Pattern used to unwrap redirect URLs (e.g. age-verification next_url).
163 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
164 # Listed in order of quality
165 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
166 # Apple HTTP Live Streaming
167 '96', '95', '94', '93', '92', '132', '151',
168 '85', '84', '102', '83', '101', '82', '100',
171 '138', '137', '248', '136', '247', '135', '246',
172 '245', '244', '134', '243', '133', '242', '160',
174 '141', '172', '140', '171', '139',
# Same itags, but free (webm) formats ranked ahead of same-quality mp4.
176 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
177 # Apple HTTP Live Streaming
178 '96', '95', '94', '93', '92', '132', '151',
180 '85', '102', '84', '101', '83', '100', '82',
182 '138', '248', '137', '247', '136', '246', '245',
183 '244', '135', '243', '134', '242', '133', '160',
185 '172', '141', '171', '140', '139',
# Maps a container name (requestable as a format) to its itags, best first.
187 _video_formats_map = {
188 'flv': ['35', '34', '6', '5'],
189 '3gp': ['36', '17', '13'],
190 'mp4': ['38', '37', '22', '18'],
191 'webm': ['46', '45', '44', '43'],
# itag -> file extension table (entries elided in this listing).
193 _video_extensions = {
215 # Apple HTTP Live Streaming
# itag -> human-readable resolution table (entries elided in this listing).
249 _video_dimensions = {
# Self-test fixtures consumed by the test harness.
331 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
332 u"file": u"BaW_jenozKc.mp4",
334 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
335 u"uploader": u"Philipp Hagemeister",
336 u"uploader_id": u"phihag",
337 u"upload_date": u"20121002",
338 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
342 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
343 u"file": u"UxxajLWwzqY.mp4",
344 u"note": u"Test generic use_cipher_signature video (#897)",
346 u"upload_date": u"20120506",
347 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
348 u"description": u"md5:5b292926389560516e384ac437c0ec07",
349 u"uploader": u"Icona Pop",
350 u"uploader_id": u"IconaPop"
354 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
355 u"file": u"07FYdnEawAQ.mp4",
356 u"note": u"Test VEVO video with age protection (#956)",
358 u"upload_date": u"20130703",
359 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
360 u"description": u"md5:64249768eec3bc4276236606ea996373",
361 u"uploader": u"justintimberlakeVEVO",
362 u"uploader_id": u"justintimberlakeVEVO"
366 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
367 u"file": u"yZIXLfi8CZQ.mp4",
368 u"note": u"Embed-only video (#1746)",
370 u"upload_date": u"20120608",
371 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
372 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
373 u"uploader": u"SET India",
374 u"uploader_id": u"setindia"
381 def suitable(cls, url):
382 """Receives a URL and returns True if suitable for this IE."""
383 if YoutubePlaylistIE.suitable(url): return False
384 return re.match(cls._VALID_URL, url) is not None
386 def __init__(self, *args, **kwargs):
387 super(YoutubeIE, self).__init__(*args, **kwargs)
# Per-instance cache of extracted signature functions, keyed by
# (player_url, signature_length) in _decrypt_signature().
388 self._player_cache = {}
390 def report_video_webpage_download(self, video_id):
391 """Report attempt to download video webpage."""
392 self.to_screen(u'%s: Downloading video webpage' % video_id)
394 def report_video_info_webpage_download(self, video_id):
395 """Report attempt to download video info webpage."""
396 self.to_screen(u'%s: Downloading video info webpage' % video_id)
398 def report_information_extraction(self, video_id):
399 """Report attempt to extract video information."""
400 self.to_screen(u'%s: Extracting video information' % video_id)
402 def report_unavailable_format(self, video_id, format):
403 """Report extracted video URL."""
404 self.to_screen(u'%s: Format %s not available' % (video_id, format))
406 def report_rtmp_download(self):
407 """Indicate the download will use the RTMP protocol."""
408 self.to_screen(u'RTMP download detected')
410 def _extract_signature_function(self, video_id, player_url, slen):
# Build (or load from disk cache) a Python function that deciphers a
# scrambled signature of length slen, by analysing the JS or SWF player.
# Several lines (try:/except:, return res) are elided in this listing.
411 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
413 player_type = id_m.group('ext')
414 player_id = id_m.group('id')
416 # Read from filesystem cache
# Cache key must be a safe filename component.
417 func_id = '%s_%s_%d' % (player_type, player_id, slen)
418 assert os.path.basename(func_id) == func_id
419 cache_dir = get_cachedir(self._downloader.params)
421 cache_enabled = cache_dir is not None
423 cache_fn = os.path.join(os.path.expanduser(cache_dir),
# Cached spec is just a permutation: output char i comes from input index
# cache_spec[i].
427 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
428 cache_spec = json.load(cachef)
429 return lambda s: u''.join(s[i] for i in cache_spec)
431 pass # No cache available
433 if player_type == 'js':
434 code = self._download_webpage(
435 player_url, video_id,
436 note=u'Downloading %s player %s' % (player_type, player_id),
437 errnote=u'Download of %s failed' % player_url)
438 res = self._parse_sig_js(code)
439 elif player_type == 'swf':
440 urlh = self._request_webpage(
441 player_url, video_id,
442 note=u'Downloading %s player %s' % (player_type, player_id),
443 errnote=u'Download of %s failed' % player_url)
445 res = self._parse_sig_swf(code)
447 assert False, 'Invalid player type %r' % player_type
# Probe the extracted function with 0..slen-1 to record the permutation,
# then persist it (cache write failures only warn).
451 test_string = u''.join(map(compat_chr, range(slen)))
452 cache_res = res(test_string)
453 cache_spec = [ord(c) for c in cache_res]
455 os.makedirs(os.path.dirname(cache_fn))
456 except OSError as ose:
457 if ose.errno != errno.EEXIST:
459 write_json_file(cache_spec, cache_fn)
461 tb = traceback.format_exc()
462 self._downloader.report_warning(
463 u'Writing cache to %r failed: %s' % (cache_fn, tb))
467 def _print_sig_code(self, func, slen):
# Debug aid: probe func with a known string and print equivalent Python
# slice code for _static_decrypt_signature.  Loop interior lines are
# elided in this listing.
468 def gen_sig_code(idxs):
469 def _genslice(start, end, step):
# Render s[start:end:step] with Python's shorthand where possible.
470 starts = u'' if start == 0 else str(start)
471 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
472 steps = u'' if step == 1 else (u':%d' % step)
473 return u's[%s%s%s]' % (starts, ends, steps)
476 start = '(Never used)' # Quelch pyflakes warnings - start will be
477 # set as soon as step is set
478 for i, prev in zip(idxs[1:], idxs[:-1]):
482 yield _genslice(start, prev, step)
# Consecutive indices begin (or extend) a slice run; isolated indices
# are emitted as single lookups.
485 if i - prev in [-1, 1]:
490 yield u's[%d]' % prev
494 yield _genslice(start, i, step)
496 test_string = u''.join(map(compat_chr, range(slen)))
497 cache_res = func(test_string)
498 cache_spec = [ord(c) for c in cache_res]
499 expr_code = u' + '.join(gen_sig_code(cache_spec))
500 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
501 self.to_screen(u'Extracted signature function:\n' + code)
503 def _parse_sig_js(self, jscode):
# Minimal JS interpreter: finds the signature function in the player JS
# and returns a Python callable emulating it.  Numerous lines (returns,
# match guards) are elided in this numbered listing.
504 funcname = self._search_regex(
505 r'signature=([a-zA-Z]+)', jscode,
506 u'Initial JS player signature function name')
# Map single-letter variable names to indices (player uses a..z locals).
511 return string.lowercase.index(varname)
513 def interpret_statement(stmt, local_vars, allow_recursion=20):
# Guard against pathological/cyclic player code.
514 if allow_recursion < 0:
515 raise ExtractorError(u'Recursion limit reached')
517 if stmt.startswith(u'var '):
518 stmt = stmt[len(u'var '):]
# Assignment: either x[i]=expr or x=expr.
519 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
520 r'=(?P<expr>.*)$', stmt)
522 if ass_m.groupdict().get('index'):
524 lvar = local_vars[ass_m.group('out')]
525 idx = interpret_expression(ass_m.group('index'),
526 local_vars, allow_recursion)
527 assert isinstance(idx, int)
530 expr = ass_m.group('expr')
533 local_vars[ass_m.group('out')] = val
535 expr = ass_m.group('expr')
536 elif stmt.startswith(u'return '):
538 expr = stmt[len(u'return '):]
540 raise ExtractorError(
541 u'Cannot determine left side of statement in %r' % stmt)
543 v = interpret_expression(expr, local_vars, allow_recursion)
546 def interpret_expression(expr, local_vars, allow_recursion):
# Plain variable reference.
551 return local_vars[expr]
# Member access: split/join/length/reverse/slice are the only methods
# the player's cipher code uses.
553 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
555 member = m.group('member')
556 val = local_vars[m.group('in')]
557 if member == 'split("")':
559 if member == 'join("")':
561 if member == 'length':
563 if member == 'reverse()':
565 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
567 idx = interpret_expression(
568 slice_m.group('idx'), local_vars, allow_recursion-1)
# Indexing: x[expr].
572 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
574 val = local_vars[m.group('in')]
575 idx = interpret_expression(m.group('idx'), local_vars,
# Binary modulo is the only arithmetic operator supported.
579 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
581 a = interpret_expression(m.group('a'),
582 local_vars, allow_recursion)
583 b = interpret_expression(m.group('b'),
584 local_vars, allow_recursion)
# Function call: lazily extract callee, then evaluate the arguments.
588 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
590 fname = m.group('func')
591 if fname not in functions:
592 functions[fname] = extract_function(fname)
593 argvals = [int(v) if v.isdigit() else local_vars[v]
594 for v in m.group('args').split(',')]
595 return functions[fname](argvals)
596 raise ExtractorError(u'Unsupported JS expression %r' % expr)
598 def extract_function(funcname):
# Locate the function body in the JS source and wrap it in a Python
# callable that interprets it statement by statement.
600 r'function ' + re.escape(funcname) +
601 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
603 argnames = func_m.group('args').split(',')
606 local_vars = dict(zip(argnames, args))
607 for stmt in func_m.group('code').split(';'):
608 res = interpret_statement(stmt, local_vars)
612 initial_function = extract_function(funcname)
# The returned lambda adapts a single string arg to the arg-list calling
# convention used internally.
613 return lambda s: initial_function([s])
615 def _parse_sig_swf(self, file_contents):
# Parse the SWF player, locate class SignatureDecipher in the embedded
# AVM2 (ABC) bytecode, and return a Python callable that emulates its
# 'decipher' method.  Many lines are elided in this numbered listing.
616 if file_contents[1:3] != b'WS':
617 raise ExtractorError(
618 u'Not an SWF file; header is %r' % file_contents[:3])
# 'CWS' header means the body (after the 8-byte header) is zlib-packed.
619 if file_contents[:1] == b'C':
620 content = zlib.decompress(file_contents[8:])
622 raise NotImplementedError(u'Unsupported compression format %r' %
# Iterate SWF tags: 16-bit (code<<6 | len), long tags carry a 32-bit len.
625 def extract_tags(content):
627 while pos < len(content):
628 header16 = struct.unpack('<H', content[pos:pos+2])[0]
630 tag_code = header16 >> 6
631 tag_len = header16 & 0x3f
633 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
635 assert pos+tag_len <= len(content)
636 yield (tag_code, content[pos:pos+tag_len])
640 for tag_code, tag in extract_tags(content)
# Skip flags + NUL-terminated name preceding the ABC data.
642 p = code_tag.index(b'\0', 4) + 1
643 code_reader = io.BytesIO(code_tag[p:])
645 # Parse ABC (AVM2 ByteCode)
# Variable-length u32: 7 bits per byte, high bit = continuation.
646 def read_int(reader=None):
654 b = struct.unpack('<B', buf)[0]
655 res = res | ((b & 0x7f) << shift)
661 def u30(reader=None):
662 res = read_int(reader)
663 assert res & 0xf0000000 == 0
# Signed 32-bit variant of the variable-length int.
667 def s32(reader=None):
669 if v & 0x80000000 != 0:
670 v = - ((v ^ 0xffffffff) + 1)
673 def read_string(reader=None):
677 resb = reader.read(slen)
678 assert len(resb) == slen
679 return resb.decode('utf-8')
681 def read_bytes(count, reader=None):
684 resb = reader.read(count)
685 assert len(resb) == count
688 def read_byte(reader=None):
689 resb = read_bytes(1, reader=reader)
690 res = struct.unpack('<B', resb)[0]
693 # minor_version + major_version
# Constant pool: ints, uints, doubles, strings, namespaces, ns sets,
# multinames.  Index 0 is implicit, hence range(1, count).
698 for _c in range(1, int_count):
701 for _c in range(1, uint_count):
704 read_bytes((double_count-1) * 8)
706 constant_strings = [u'']
707 for _c in range(1, string_count):
709 constant_strings.append(s)
710 namespace_count = u30()
711 for _c in range(1, namespace_count):
715 for _c in range(1, ns_set_count):
717 for _c2 in range(count):
719 multiname_count = u30()
# Trailing u30 field count per multiname kind (used to skip them).
728 0x0e: 2, # MultinameA
729 0x1b: 1, # MultinameL
730 0x1c: 1, # MultinameLA
733 for _c in range(1, multiname_count):
735 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
737 u30() # namespace_idx
739 multinames.append(constant_strings[name_idx])
741 multinames.append('[MULTINAME kind: %d]' % kind)
742 for _c2 in range(MULTINAME_SIZES[kind]):
# Method signatures: only the flags we need are retained.
747 MethodInfo = collections.namedtuple(
749 ['NEED_ARGUMENTS', 'NEED_REST'])
751 for method_id in range(method_count):
754 for _ in range(param_count):
756 u30() # name index (always 0 for youtube)
758 if flags & 0x08 != 0:
761 for c in range(option_count):
764 if flags & 0x80 != 0:
765 # Param names present
766 for _ in range(param_count):
768 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
769 method_infos.append(mi)
# Metadata section is parsed only to advance the reader.
772 metadata_count = u30()
773 for _c in range(metadata_count):
776 for _c2 in range(item_count):
780 def parse_traits_info():
# Decode one trait; records name->method_idx for Method/Getter/Setter
# traits, skips the rest (metadata skipped too).
781 trait_name_idx = u30()
782 kind_full = read_byte()
783 kind = kind_full & 0x0f
784 attrs = kind_full >> 4
786 if kind in [0x00, 0x06]: # Slot or Const
788 u30() # type_name_idx
792 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
795 methods[multinames[trait_name_idx]] = method_idx
796 elif kind == 0x04: # Class
799 elif kind == 0x05: # Function
802 methods[function_idx] = multinames[trait_name_idx]
804 raise ExtractorError(u'Unsupported trait kind %d' % kind)
806 if attrs & 0x4 != 0: # Metadata present
807 metadata_count = u30()
808 for _c3 in range(metadata_count):
809 u30() # metadata index
# Walk instance_info records to find the target class by name.
814 TARGET_CLASSNAME = u'SignatureDecipher'
815 searched_idx = multinames.index(TARGET_CLASSNAME)
816 searched_class_id = None
818 for class_id in range(class_count):
820 if name_idx == searched_idx:
821 # We found the class we're looking for!
822 searched_class_id = class_id
823 u30() # super_name idx
825 if flags & 0x08 != 0: # Protected namespace is present
826 u30() # protected_ns_idx
828 for _c2 in range(intrf_count):
832 for _c2 in range(trait_count):
835 if searched_class_id is None:
836 raise ExtractorError(u'Target class %r not found' %
# class_info records: collect the target class's method name/idx maps.
841 for class_id in range(class_count):
844 for _c2 in range(trait_count):
845 trait_methods = parse_traits_info()
846 if class_id == searched_class_id:
847 method_names.update(trait_methods.items())
848 method_idxs.update(dict(
850 for name, idx in trait_methods.items()))
# Scripts section is skipped (parsed for position only).
854 for _c in range(script_count):
857 for _c2 in range(trait_count):
# Method bodies: keep code + local_count for the methods we care about.
861 method_body_count = u30()
862 Method = collections.namedtuple('Method', ['code', 'local_count'])
864 for _c in range(method_body_count):
868 u30() # init_scope_depth
869 u30() # max_scope_depth
871 code = read_bytes(code_length)
872 if method_idx in method_idxs:
873 m = Method(code, local_count)
874 methods[method_idxs[method_idx]] = m
875 exception_count = u30()
876 for _c2 in range(exception_count):
883 for _c2 in range(trait_count):
886 assert p + code_reader.tell() == len(code_tag)
887 assert len(methods) == len(method_idxs)
889 method_pyfunctions = {}
891 def extract_function(func_name):
# Build (and memoize) a Python interpreter closure over the method's
# AVM2 bytecode; supports only the opcodes the cipher actually uses.
892 if func_name in method_pyfunctions:
893 return method_pyfunctions[func_name]
894 if func_name not in methods:
895 raise ExtractorError(u'Cannot find function %r' % func_name)
896 m = methods[func_name]
# Register 0 is 'this'; remaining registers hold args then locals.
899 registers = ['(this)'] + list(args) + [None] * m.local_count
901 coder = io.BytesIO(m.code)
903 opcode = struct.unpack('!B', coder.read(1))[0]
904 if opcode == 36: # pushbyte
905 v = struct.unpack('!B', coder.read(1))[0]
907 elif opcode == 44: # pushstring
909 stack.append(constant_strings[idx])
910 elif opcode == 48: # pushscope
911 # We don't implement the scope register, so we'll just
912 # ignore the popped value
914 elif opcode == 70: # callproperty
916 mname = multinames[index]
917 arg_count = u30(coder)
918 args = list(reversed(
919 [stack.pop() for _ in range(arg_count)]))
921 if mname == u'split':
922 assert len(args) == 1
923 assert isinstance(args[0], compat_str)
924 assert isinstance(obj, compat_str)
928 res = obj.split(args[0])
930 elif mname == u'slice':
931 assert len(args) == 1
932 assert isinstance(args[0], int)
933 assert isinstance(obj, list)
936 elif mname == u'join':
937 assert len(args) == 1
938 assert isinstance(args[0], compat_str)
939 assert isinstance(obj, list)
940 res = args[0].join(obj)
942 elif mname in method_pyfunctions:
943 stack.append(method_pyfunctions[mname](args))
945 raise NotImplementedError(
946 u'Unsupported property %r on %r'
948 elif opcode == 72: # returnvalue
951 elif opcode == 79: # callpropvoid
953 mname = multinames[index]
954 arg_count = u30(coder)
955 args = list(reversed(
956 [stack.pop() for _ in range(arg_count)]))
958 if mname == u'reverse':
# reverse() mutates in place; no value is pushed back.
959 assert isinstance(obj, list)
962 raise NotImplementedError(
963 u'Unsupported (void) property %r on %r'
965 elif opcode == 93: # findpropstrict
967 mname = multinames[index]
968 res = extract_function(mname)
970 elif opcode == 97: # setproperty
975 assert isinstance(obj, list)
976 assert isinstance(idx, int)
978 elif opcode == 98: # getlocal
980 stack.append(registers[index])
981 elif opcode == 99: # setlocal
984 registers[index] = value
985 elif opcode == 102: # getproperty
987 pname = multinames[index]
988 if pname == u'length':
990 assert isinstance(obj, list)
991 stack.append(len(obj))
992 else: # Assume attribute access
994 assert isinstance(idx, int)
996 assert isinstance(obj, list)
997 stack.append(obj[idx])
998 elif opcode == 128: # coerce
1000 elif opcode == 133: # coerce_s
1001 assert isinstance(stack[-1], (type(None), compat_str))
1002 elif opcode == 164: # modulo
1003 value2 = stack.pop()
1004 value1 = stack.pop()
1005 res = value1 % value2
1007 elif opcode == 208: # getlocal_0
1008 stack.append(registers[0])
1009 elif opcode == 209: # getlocal_1
1010 stack.append(registers[1])
1011 elif opcode == 210: # getlocal_2
1012 stack.append(registers[2])
1013 elif opcode == 211: # getlocal_3
1014 stack.append(registers[3])
1015 elif opcode == 214: # setlocal_2
1016 registers[2] = stack.pop()
1017 elif opcode == 215: # setlocal_3
1018 registers[3] = stack.pop()
1020 raise NotImplementedError(
1021 u'Unsupported opcode %d' % opcode)
1023 method_pyfunctions[func_name] = resfunc
# Entry point: the class's 'decipher' method, adapted to take one string.
1026 initial_function = extract_function(u'decipher')
1027 return lambda s: initial_function([s])
1029 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1030 """Turn the encrypted s field into a working signature"""
# Prefer dynamic extraction from the player (cached per (url, len(s)));
# fall back to the hard-coded static algorithms on any failure.  The
# try:/except and return lines are elided in this listing.
1032 if player_url is not None:
1033 if player_url.startswith(u'//'):
1034 player_url = u'https:' + player_url
1036 player_id = (player_url, len(s))
1037 if player_id not in self._player_cache:
1038 func = self._extract_signature_function(
1039 video_id, player_url, len(s)
1041 self._player_cache[player_id] = func
1042 func = self._player_cache[player_id]
1043 if self._downloader.params.get('youtube_print_sig_code'):
1044 self._print_sig_code(func, len(s))
1047 tb = traceback.format_exc()
1048 self._downloader.report_warning(
1049 u'Automatic signature extraction failed: ' + tb)
1051 self._downloader.report_warning(
1052 u'Warning: Falling back to static signature algorithm')
1054 return self._static_decrypt_signature(
1055 s, video_id, player_url, age_gate)
1057 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
# Hard-coded unscrambling recipes keyed on len(s); the selecting
# 'if len(s) == N:' guard lines are elided in this numbered listing.
1059 # The videos with age protection use another player, so the
1060 # algorithms can be different.
1062 return s[2:63] + s[82] + s[64:82] + s[63]
1065 return s[86:29:-1] + s[88] + s[28:5:-1]
1067 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1069 return s[84:27:-1] + s[86] + s[26:5:-1]
1071 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1073 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1075 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1077 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1079 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1081 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1083 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1085 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1087 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
1089 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1091 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1093 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
# Unknown length: let the user know a retry (new player) might succeed.
1096 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1098 def _get_available_subtitles(self, video_id, webpage):
# Query the timedtext list endpoint and build {lang_code: srt_url}.
# try:/return lines are elided in this listing.
1100 sub_list = self._download_webpage(
1101 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1102 video_id, note=False)
1103 except ExtractorError as err:
1104 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1106 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1111 params = compat_urllib_parse.urlencode({
1114 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
1115 'name': l[0].encode('utf-8'),
1117 url = u'http://www.youtube.com/api/timedtext?' + params
1118 sub_lang_list[lang] = url
1119 if not sub_lang_list:
1120 self._downloader.report_warning(u'video doesn\'t have subtitles')
1122 return sub_lang_list
1124 def _get_available_automatic_caption(self, video_id, webpage):
1125 """We need the webpage for getting the captions url, pass it as an
1126 argument to speed up the process."""
1127 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
1128 self.to_screen(u'%s: Looking for automatic captions' % video_id)
# The caption (tts) URL lives in the inline ytplayer.config JSON.
1129 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1130 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1132 self._downloader.report_warning(err_msg)
1134 player_config = json.loads(mobj.group(1))
1136 args = player_config[u'args']
1137 caption_url = args[u'ttsurl']
1138 timestamp = args[u'timestamp']
1139 # We get the available subtitles
1140 list_params = compat_urllib_parse.urlencode({
1145 list_url = caption_url + '&' + list_params
1146 caption_list = self._download_xml(list_url, video_id)
# Automatic captions exist only when the original track is ASR-generated.
1147 original_lang_node = caption_list.find('track')
1148 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
1149 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1151 original_lang = original_lang_node.attrib['lang_code']
# Build a translated-caption URL for every available target language.
1154 for lang_node in caption_list.findall('target'):
1155 sub_lang = lang_node.attrib['lang_code']
1156 params = compat_urllib_parse.urlencode({
1157 'lang': original_lang,
1163 sub_lang_list[sub_lang] = caption_url + '&' + params
1164 return sub_lang_list
1165 # An extractor error can be raise by the download process if there are
1166 # no automatic captions but there are subtitles
1167 except (KeyError, ExtractorError):
1168 self._downloader.report_warning(err_msg)
1171 def _print_formats(self, formats):
# Print one "itag : extension [dimensions] (note)" line per format.
# The enclosing for-loop line is elided in this listing.
1172 print('Available formats:')
1174 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1175 self._video_dimensions.get(x, '???'),
1176 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1178 def _extract_id(self, url):
# Pull the 11-character video ID (group 2 of _VALID_URL) out of url.
1179 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1181 raise ExtractorError(u'Invalid URL: %s' % url)
1182 video_id = mobj.group(2)
1185 def _get_video_url_list(self, url_map):
1187 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1188 with the requested formats.
1190 req_format = self._downloader.params.get('format', None)
1191 format_limit = self._downloader.params.get('format_limit', None)
1192 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
# --max-quality: drop everything better than the limit.
1193 if format_limit is not None and format_limit in available_formats:
1194 format_list = available_formats[available_formats.index(format_limit):]
1196 format_list = available_formats
1197 existing_formats = [x for x in format_list if x in url_map]
1198 if len(existing_formats) == 0:
1199 raise ExtractorError(u'no known formats available for video')
1200 if self._downloader.params.get('listformats', None):
1201 self._print_formats(existing_formats)
1203 if req_format is None or req_format == 'best':
1204 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1205 elif req_format == 'worst':
1206 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1207 elif req_format in ('-1', 'all'):
1208 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1210 # Specific formats. We pick the first in a slash-delimeted sequence.
1211 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1212 # available in the specified format. For example,
1213 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1214 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1215 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1216 req_formats = req_format.split('/')
1217 video_url_list = None
1218 for rf in req_formats:
1220 video_url_list = [(rf, url_map[rf])]
# Container names resolve through _video_formats_map, best itag first.
1222 if rf in self._video_formats_map:
1223 for srf in self._video_formats_map[rf]:
1225 video_url_list = [(srf, url_map[srf])]
1230 if video_url_list is None:
1231 raise ExtractorError(u'requested format not available')
1232 return video_url_list
1234 def _extract_from_m3u8(self, manifest_url, video_id):
# Download an HLS manifest and build {itag: variant_url} from its
# non-comment lines.  Some lines are elided in this listing.
1236 def _get_urls(_manifest):
1237 lines = _manifest.split('\n')
1238 urls = filter(lambda l: l and not l.startswith('#'),
1241 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1242 formats_urls = _get_urls(manifest)
1243 for format_url in formats_urls:
# The itag is encoded in the variant URL path as .../itag/<n>/...
1244 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1245 url_map[itag] = format_url
1248 def _extract_annotations(self, video_id):
1249 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1250 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1252 def _real_extract(self, url):
# Main extraction entry point.  NOTE(review): this method continues
# beyond the end of this excerpt, and many lines within are elided.
1253 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1254 mobj = re.search(self._NEXT_URL_RE, url)
1256 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1257 video_id = self._extract_id(url)
# Fetch the watch page (has_verified=1 skips some interstitials).
1260 self.report_video_webpage_download(video_id)
1261 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1262 request = compat_urllib_request.Request(url)
1264 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1265 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1266 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1268 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1270 # Attempt to extract SWF player URL
1271 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1272 if mobj is not None:
# Un-escape the JS-escaped slashes in the SWF URL.
1273 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1278 self.report_video_info_webpage_download(video_id)
1279 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1280 self.report_age_confirmation()
1282 # We simulate the access to the video from www.youtube.com/v/{video_id}
1283 # this can be viewed without login into Youtube
1284 data = compat_urllib_parse.urlencode({'video_id': video_id,
1285 'el': 'player_embedded',
1288 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1292 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1293 video_info_webpage = self._download_webpage(video_info_url, video_id,
1295 errnote='unable to download video info webpage')
1296 video_info = compat_parse_qs(video_info_webpage)
# Non-age-gated path: try several 'el' variants until one yields a token.
1299 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1300 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1301 % (video_id, el_type))
1302 video_info_webpage = self._download_webpage(video_info_url, video_id,
1304 errnote='unable to download video info webpage')
1305 video_info = compat_parse_qs(video_info_webpage)
1306 if 'token' in video_info:
1308 if 'token' not in video_info:
1309 if 'reason' in video_info:
1310 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1312 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1314 if 'view_count' in video_info:
1315 view_count = int(video_info['view_count'][0])
1319 # Check for "rental" videos
1320 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1321 raise ExtractorError(u'"rental" videos not supported')
1323 # Start extracting information
1324 self.report_information_extraction(video_id)
# uploader (mandatory), uploader id, title, thumbnail, upload date,
# description — each extracted best-effort below.
1327 if 'author' not in video_info:
1328 raise ExtractorError(u'Unable to extract uploader name')
1329 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1332 video_uploader_id = None
1333 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1334 if mobj is not None:
1335 video_uploader_id = mobj.group(1)
1337 self._downloader.report_warning(u'unable to extract uploader nickname')
1340 if 'title' in video_info:
1341 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1343 self._downloader.report_warning(u'Unable to extract video title')
1347 # We try first to get a high quality image:
1348 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1349 video_webpage, re.DOTALL)
1350 if m_thumb is not None:
1351 video_thumbnail = m_thumb.group(1)
1352 elif 'thumbnail_url' not in video_info:
1353 self._downloader.report_warning(u'unable to extract video thumbnail')
1354 video_thumbnail = None
1355 else: # don't panic if we can't find it
1356 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1360 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1361 if mobj is not None:
1362 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1363 upload_date = unified_strdate(upload_date)
1366 video_description = get_element_by_id("eow-description", video_webpage)
1367 if video_description:
1368 video_description = clean_html(video_description)
1370 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1372 video_description = unescapeHTML(fd_mobj.group(1))
1374 video_description = u''
1377 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1379 if self._downloader.params.get('listsubtitles', False):
1380 self._list_available_subtitles(video_id, video_webpage)
1383 if 'length_seconds' not in video_info:
1384 self._downloader.report_warning(u'unable to extract video duration')
1387 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1390 video_annotations = None
1391 if self._downloader.params.get('writeannotations', False):
1392 video_annotations = self._extract_annotations(video_id)
1394 # Decide which formats to download
1397 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1399 raise ValueError('Could not find vevo ID')
1400 info = json.loads(mobj.group(1))
1402 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1403 # this signatures are encrypted
1404 if 'url_encoded_fmt_stream_map' not in args:
1405 raise ValueError(u'No stream_map present') # caught below
1406 re_signature = re.compile(r'[&,]s=')
1407 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1409 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1410 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1411 m_s = re_signature.search(args.get('adaptive_fmts', u''))
1413 if 'adaptive_fmts' in video_info:
1414 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1416 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1420 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1421 self.report_rtmp_download()
1422 video_url_list = [(None, video_info['conn'][0])]
1423 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1424 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1425 if 'rtmpe%3Dyes' in encoded_url_map:
1426 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1428 for url_data_str in encoded_url_map.split(','):
1429 url_data = compat_parse_qs(url_data_str)
1430 if 'itag' in url_data and 'url' in url_data:
1431 url = url_data['url'][0]
1432 if 'sig' in url_data:
1433 url += '&signature=' + url_data['sig'][0]
1434 elif 's' in url_data:
1435 encrypted_sig = url_data['s'][0]
1436 if self._downloader.params.get('verbose'):
1438 if player_url is None:
1439 player_version = 'unknown'
1441 player_version = self._search_regex(
1442 r'-(.+)\.swf$', player_url,
1443 u'flash player', fatal=False)
1444 player_desc = 'flash player %s' % player_version
1446 player_version = self._search_regex(
1447 r'html5player-(.+?)\.js', video_webpage,
1448 'html5 player', fatal=False)
1449 player_desc = u'html5 player %s' % player_version
1451 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1452 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1453 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1456 jsplayer_url_json = self._search_regex(
1457 r'"assets":.+?"js":\s*("[^"]+")',
1458 video_webpage, u'JS player URL')
1459 player_url = json.loads(jsplayer_url_json)
1461 signature = self._decrypt_signature(
1462 encrypted_sig, video_id, player_url, age_gate)
1463 url += '&signature=' + signature
1464 if 'ratebypass' not in url:
1465 url += '&ratebypass=yes'
1466 url_map[url_data['itag'][0]] = url
1467 video_url_list = self._get_video_url_list(url_map)
1468 if not video_url_list:
1470 elif video_info.get('hlsvp'):
1471 manifest_url = video_info['hlsvp'][0]
1472 url_map = self._extract_from_m3u8(manifest_url, video_id)
1473 video_url_list = self._get_video_url_list(url_map)
1474 if not video_url_list:
1478 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1481 for itag, video_real_url in video_url_list:
1483 video_extension = self._video_extensions.get(itag, 'flv')
1485 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1486 self._video_dimensions.get(itag, '???'),
1487 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
1491 'url': video_real_url,
1492 'uploader': video_uploader,
1493 'uploader_id': video_uploader_id,
1494 'upload_date': upload_date,
1495 'title': video_title,
1496 'ext': video_extension,
1497 'format': video_format,
1499 'thumbnail': video_thumbnail,
1500 'description': video_description,
1501 'player_url': player_url,
1502 'subtitles': video_subtitles,
1503 'duration': video_duration,
1504 'age_limit': 18 if age_gate else 0,
1505 'annotations': video_annotations,
1506 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1507 'view_count': view_count,
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all videos of a YouTube playlist as a playlist result.

    Matches playlist/course/artist URLs as well as watch URLs that carry a
    ``list=`` parameter; bare playlist IDs (PL/EC/UU/FL prefixes) also match.
    """
    IE_DESC = u'YouTube.com playlists'
    # NOTE(review): interior alternation lines of this verbose regex were
    # missing from the damaged listing; restored from upstream youtube-dl
    # history — verify against a known-good checkout.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose (re.VERBOSE) pattern, so the default
        # InfoExtractor.suitable() cannot be used as-is.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL: a watch URL with both v= and
        # list= either downloads just the video (--no-playlist) or the list.
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            # Stop when the page has no "next" link.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        playlist_title = self._og_search_title(page)

        url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                       for vid_id in ids]
        return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos of a YouTube channel as a playlist result."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the de-duplicated video ids found in *page*, in order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels serve all their videos on one page and
        # return empty ajax pages, so they need special-casing.
        if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
            autogenerated = True
        else:
            autogenerated = False

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The widget html carries the load-more marker while more
                # pages remain.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the GData API, paging
    _GDATA_PAGE_SIZE ids at a time until the feed runs out."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors: the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            ids_in_page = []
            for entry in response['feed']['entry']:
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids]
        return self.playlist_result(url_results, playlist_title=username)
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube via the GData videos API ("ytsearchN:query")."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    # NOTE(review): this cap was missing from the damaged listing; restored
    # from upstream youtube-dl history — verify.
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have
        # enough ids or the reported totalItems is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same as YoutubeSearchIE but orders results by publication date."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    # Identical API query plus orderby=published.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
class YoutubeShowIE(InfoExtractor):
    """Resolve a /show/ page into one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # NOTE(review): the default paging step was missing from the damaged
    # listing; restored from upstream history (YoutubeHistoryIE overrides
    # it per-request) — verify.
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # A null 'paging' field marks the last page of the feed.
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Authenticated subscriptions feed ("ytsubs" / ":ytsubscriptions")."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Authenticated recommended-videos feed ("ytrec")."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Authenticated watch-later list ("ytwatchlater"); personal feed."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is per-account, so use the personal-feed ajax action.
    _PERSONAL_FEED = True
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Authenticated watch history ("ythistory"); personal feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Was a u'' literal with raw regex backslashes; made a raw string so the
    # escapes are unambiguous (pattern itself unchanged).
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'

    def _real_extract(self, url):
        # The history feed's paging step is dynamic: scrape it from the page
        # before delegating to the generic feed extractor.
        webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History')
        data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging')
        # The step is actually a ridiculously big number (like 1374343569725646)
        self._PAGING_STEP = int(data_paging)
        return super(YoutubeHistoryIE, self)._real_extract(url)
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds its backing playlist id in a list= param;
        # hand off to YoutubePlaylistIE for the actual extraction.
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1855 class YoutubeTruncatedURLIE(InfoExtractor):
1856 IE_NAME = 'youtube:truncated_url'
1857 IE_DESC = False # Do not list
1858 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1860 def _real_extract(self, url):
1861 raise ExtractorError(
1862 u'Did you forget to quote the URL? Remember that & is a meta '
1863 u'character in most shells, so you want to put the URL in quotes, '
1865 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1866 u' (or simply youtube-dl BaW_jenozKc ).',