14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
39 class YoutubeBaseInfoExtractor(InfoExtractor):
40 """Provide base functions for Youtube extractors"""
41 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
42 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
43 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
44 _NETRC_MACHINE = 'youtube'
45 # If True it will raise an error if no login info is provided
46 _LOGIN_REQUIRED = False
48 def report_lang(self):
49 """Report attempt to set language."""
50 self.to_screen(u'Setting language')
52 def _set_language(self):
53 request = compat_urllib_request.Request(self._LANG_URL)
56 compat_urllib_request.urlopen(request).read()
57 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
58 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
63 (username, password) = self._get_login_info()
64 # No authentication to be performed
66 if self._LOGIN_REQUIRED:
67 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
70 request = compat_urllib_request.Request(self._LOGIN_URL)
72 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
73 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
74 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
79 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
82 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
88 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
92 u'PersistentCookie': u'yes',
94 u'bgresponse': u'js_disabled',
95 u'checkConnection': u'',
96 u'checkedDomains': u'youtube',
102 u'signIn': u'Sign in',
104 u'service': u'youtube',
108 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
110 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
111 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
112 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
115 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
116 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
117 self._downloader.report_warning(u'unable to log in: bad username or password')
119 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
120 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
124 def _confirm_age(self):
127 'action_confirm': 'Confirm',
129 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
131 self.report_age_confirmation()
132 compat_urllib_request.urlopen(request).read().decode('utf-8')
133 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
134 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
137 def _real_initialize(self):
138 if self._downloader is None:
140 if not self._set_language():
142 if not self._login():
147 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
148 IE_DESC = u'YouTube.com'
151 (?:https?://)? # http(s):// (optional)
152 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
153 tube\.majestyc\.net/|
154 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
155 (?:.*?\#/)? # handle anchor (#/) redirect urls
156 (?: # the various things that can precede the ID:
157 (?:(?:v|embed|e)/) # v/ or embed/ or e/
158 |(?: # or the v= param in all its forms
159 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
160 (?:\?|\#!?) # the params delimiter ? or # or #!
161 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
165 |youtu\.be/ # just youtu.be/xxxx
167 )? # all until now is optional -> you can pass the naked ID
168 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
169 (?(1).+)? # if we found the ID, everything can follow
171 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
172 # Listed in order of quality
173 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
174 # Apple HTTP Live Streaming
175 '96', '95', '94', '93', '92', '132', '151',
177 '85', '84', '102', '83', '101', '82', '100',
179 '138', '137', '248', '136', '247', '135', '246',
180 '245', '244', '134', '243', '133', '242', '160',
182 '141', '172', '140', '171', '139',
184 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
185 # Apple HTTP Live Streaming
186 '96', '95', '94', '93', '92', '132', '151',
188 '85', '102', '84', '101', '83', '100', '82',
190 '138', '248', '137', '247', '136', '246', '245',
191 '244', '135', '243', '134', '242', '133', '160',
193 '172', '141', '171', '140', '139',
195 _video_formats_map = {
196 'flv': ['35', '34', '6', '5'],
197 '3gp': ['36', '17', '13'],
198 'mp4': ['38', '37', '22', '18'],
199 'webm': ['46', '45', '44', '43'],
201 _video_extensions = {
223 # Apple HTTP Live Streaming
257 _video_dimensions = {
339 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
340 u"file": u"BaW_jenozKc.mp4",
342 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
343 u"uploader": u"Philipp Hagemeister",
344 u"uploader_id": u"phihag",
345 u"upload_date": u"20121002",
346 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
350 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
351 u"file": u"1ltcDfZMA3U.flv",
352 u"note": u"Test VEVO video (#897)",
354 u"upload_date": u"20070518",
355 u"title": u"Maps - It Will Find You",
356 u"description": u"Music video by Maps performing It Will Find You.",
357 u"uploader": u"MuteUSA",
358 u"uploader_id": u"MuteUSA"
362 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
363 u"file": u"UxxajLWwzqY.mp4",
364 u"note": u"Test generic use_cipher_signature video (#897)",
366 u"upload_date": u"20120506",
367 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
368 u"description": u"md5:5b292926389560516e384ac437c0ec07",
369 u"uploader": u"Icona Pop",
370 u"uploader_id": u"IconaPop"
374 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
375 u"file": u"07FYdnEawAQ.mp4",
376 u"note": u"Test VEVO video with age protection (#956)",
378 u"upload_date": u"20130703",
379 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
380 u"description": u"md5:64249768eec3bc4276236606ea996373",
381 u"uploader": u"justintimberlakeVEVO",
382 u"uploader_id": u"justintimberlakeVEVO"
389 def suitable(cls, url):
390 """Receives a URL and returns True if suitable for this IE."""
391 if YoutubePlaylistIE.suitable(url): return False
392 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
394 def __init__(self, *args, **kwargs):
395 super(YoutubeIE, self).__init__(*args, **kwargs)
396 self._player_cache = {}
398 def report_video_webpage_download(self, video_id):
399 """Report attempt to download video webpage."""
400 self.to_screen(u'%s: Downloading video webpage' % video_id)
402 def report_video_info_webpage_download(self, video_id):
403 """Report attempt to download video info webpage."""
404 self.to_screen(u'%s: Downloading video info webpage' % video_id)
406 def report_information_extraction(self, video_id):
407 """Report attempt to extract video information."""
408 self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for this video."""
        # NOTE: 'format' shadows the builtin of the same name; the parameter
        # name is kept unchanged for compatibility with existing callers.
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
414 def report_rtmp_download(self):
415 """Indicate the download will use the RTMP protocol."""
416 self.to_screen(u'RTMP download detected')
418 def _extract_signature_function(self, video_id, player_url, slen):
419 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
421 player_type = id_m.group('ext')
422 player_id = id_m.group('id')
424 # Read from filesystem cache
425 func_id = '%s_%s_%d' % (player_type, player_id, slen)
426 assert os.path.basename(func_id) == func_id
427 cache_dir = get_cachedir(self._downloader.params)
429 cache_enabled = cache_dir is not None
431 cache_fn = os.path.join(os.path.expanduser(cache_dir),
435 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
436 cache_spec = json.load(cachef)
437 return lambda s: u''.join(s[i] for i in cache_spec)
439 pass # No cache available
441 if player_type == 'js':
442 code = self._download_webpage(
443 player_url, video_id,
444 note=u'Downloading %s player %s' % (player_type, player_id),
445 errnote=u'Download of %s failed' % player_url)
446 res = self._parse_sig_js(code)
447 elif player_type == 'swf':
448 urlh = self._request_webpage(
449 player_url, video_id,
450 note=u'Downloading %s player %s' % (player_type, player_id),
451 errnote=u'Download of %s failed' % player_url)
453 res = self._parse_sig_swf(code)
455 assert False, 'Invalid player type %r' % player_type
459 test_string = u''.join(map(compat_chr, range(slen)))
460 cache_res = res(test_string)
461 cache_spec = [ord(c) for c in cache_res]
463 os.makedirs(os.path.dirname(cache_fn))
464 except OSError as ose:
465 if ose.errno != errno.EEXIST:
467 write_json_file(cache_spec, cache_fn)
469 tb = traceback.format_exc()
470 self._downloader.report_warning(
471 u'Writing cache to %r failed: %s' % (cache_fn, tb))
475 def _print_sig_code(self, func, slen):
476 def gen_sig_code(idxs):
477 def _genslice(start, end, step):
478 starts = u'' if start == 0 else str(start)
479 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
480 steps = u'' if step == 1 else (u':%d' % step)
481 return u's[%s%s%s]' % (starts, ends, steps)
484 start = '(Never used)' # Quelch pyflakes warnings - start will be
485 # set as soon as step is set
486 for i, prev in zip(idxs[1:], idxs[:-1]):
490 yield _genslice(start, prev, step)
493 if i - prev in [-1, 1]:
498 yield u's[%d]' % prev
502 yield _genslice(start, i, step)
504 test_string = u''.join(map(compat_chr, range(slen)))
505 cache_res = func(test_string)
506 cache_spec = [ord(c) for c in cache_res]
507 expr_code = u' + '.join(gen_sig_code(cache_spec))
508 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
509 self.to_screen(u'Extracted signature function:\n' + code)
511 def _parse_sig_js(self, jscode):
512 funcname = self._search_regex(
513 r'signature=([a-zA-Z]+)', jscode,
514 u'Initial JS player signature function name')
519 return string.lowercase.index(varname)
521 def interpret_statement(stmt, local_vars, allow_recursion=20):
522 if allow_recursion < 0:
523 raise ExtractorError(u'Recursion limit reached')
525 if stmt.startswith(u'var '):
526 stmt = stmt[len(u'var '):]
527 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
528 r'=(?P<expr>.*)$', stmt)
530 if ass_m.groupdict().get('index'):
532 lvar = local_vars[ass_m.group('out')]
533 idx = interpret_expression(ass_m.group('index'),
534 local_vars, allow_recursion)
535 assert isinstance(idx, int)
538 expr = ass_m.group('expr')
541 local_vars[ass_m.group('out')] = val
543 expr = ass_m.group('expr')
544 elif stmt.startswith(u'return '):
546 expr = stmt[len(u'return '):]
548 raise ExtractorError(
549 u'Cannot determine left side of statement in %r' % stmt)
551 v = interpret_expression(expr, local_vars, allow_recursion)
554 def interpret_expression(expr, local_vars, allow_recursion):
559 return local_vars[expr]
561 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
563 member = m.group('member')
564 val = local_vars[m.group('in')]
565 if member == 'split("")':
567 if member == 'join("")':
569 if member == 'length':
571 if member == 'reverse()':
573 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
575 idx = interpret_expression(
576 slice_m.group('idx'), local_vars, allow_recursion-1)
580 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
582 val = local_vars[m.group('in')]
583 idx = interpret_expression(m.group('idx'), local_vars,
587 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
589 a = interpret_expression(m.group('a'),
590 local_vars, allow_recursion)
591 b = interpret_expression(m.group('b'),
592 local_vars, allow_recursion)
596 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
598 fname = m.group('func')
599 if fname not in functions:
600 functions[fname] = extract_function(fname)
601 argvals = [int(v) if v.isdigit() else local_vars[v]
602 for v in m.group('args').split(',')]
603 return functions[fname](argvals)
604 raise ExtractorError(u'Unsupported JS expression %r' % expr)
606 def extract_function(funcname):
608 r'function ' + re.escape(funcname) +
609 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
611 argnames = func_m.group('args').split(',')
614 local_vars = dict(zip(argnames, args))
615 for stmt in func_m.group('code').split(';'):
616 res = interpret_statement(stmt, local_vars)
620 initial_function = extract_function(funcname)
621 return lambda s: initial_function([s])
623 def _parse_sig_swf(self, file_contents):
624 if file_contents[1:3] != b'WS':
625 raise ExtractorError(
626 u'Not an SWF file; header is %r' % file_contents[:3])
627 if file_contents[:1] == b'C':
628 content = zlib.decompress(file_contents[8:])
630 raise NotImplementedError(u'Unsupported compression format %r' %
633 def extract_tags(content):
635 while pos < len(content):
636 header16 = struct.unpack('<H', content[pos:pos+2])[0]
638 tag_code = header16 >> 6
639 tag_len = header16 & 0x3f
641 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
643 assert pos+tag_len <= len(content)
644 yield (tag_code, content[pos:pos+tag_len])
648 for tag_code, tag in extract_tags(content)
650 p = code_tag.index(b'\0', 4) + 1
651 code_reader = io.BytesIO(code_tag[p:])
653 # Parse ABC (AVM2 ByteCode)
654 def read_int(reader=None):
662 b = struct.unpack('<B', buf)[0]
663 res = res | ((b & 0x7f) << shift)
669 def u30(reader=None):
670 res = read_int(reader)
671 assert res & 0xf0000000 == 0
675 def s32(reader=None):
677 if v & 0x80000000 != 0:
678 v = - ((v ^ 0xffffffff) + 1)
681 def read_string(reader=None):
685 resb = reader.read(slen)
686 assert len(resb) == slen
687 return resb.decode('utf-8')
689 def read_bytes(count, reader=None):
692 resb = reader.read(count)
693 assert len(resb) == count
696 def read_byte(reader=None):
697 resb = read_bytes(1, reader=reader)
698 res = struct.unpack('<B', resb)[0]
701 # minor_version + major_version
706 for _c in range(1, int_count):
709 for _c in range(1, uint_count):
712 read_bytes((double_count-1) * 8)
714 constant_strings = [u'']
715 for _c in range(1, string_count):
717 constant_strings.append(s)
718 namespace_count = u30()
719 for _c in range(1, namespace_count):
723 for _c in range(1, ns_set_count):
725 for _c2 in range(count):
727 multiname_count = u30()
736 0x0e: 2, # MultinameA
737 0x1b: 1, # MultinameL
738 0x1c: 1, # MultinameLA
741 for _c in range(1, multiname_count):
743 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
745 u30() # namespace_idx
747 multinames.append(constant_strings[name_idx])
749 multinames.append('[MULTINAME kind: %d]' % kind)
750 for _c2 in range(MULTINAME_SIZES[kind]):
755 MethodInfo = collections.namedtuple(
757 ['NEED_ARGUMENTS', 'NEED_REST'])
759 for method_id in range(method_count):
762 for _ in range(param_count):
764 u30() # name index (always 0 for youtube)
766 if flags & 0x08 != 0:
769 for c in range(option_count):
772 if flags & 0x80 != 0:
773 # Param names present
774 for _ in range(param_count):
776 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
777 method_infos.append(mi)
780 metadata_count = u30()
781 for _c in range(metadata_count):
784 for _c2 in range(item_count):
788 def parse_traits_info():
789 trait_name_idx = u30()
790 kind_full = read_byte()
791 kind = kind_full & 0x0f
792 attrs = kind_full >> 4
794 if kind in [0x00, 0x06]: # Slot or Const
796 u30() # type_name_idx
800 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
803 methods[multinames[trait_name_idx]] = method_idx
804 elif kind == 0x04: # Class
807 elif kind == 0x05: # Function
810 methods[function_idx] = multinames[trait_name_idx]
812 raise ExtractorError(u'Unsupported trait kind %d' % kind)
814 if attrs & 0x4 != 0: # Metadata present
815 metadata_count = u30()
816 for _c3 in range(metadata_count):
817 u30() # metadata index
822 TARGET_CLASSNAME = u'SignatureDecipher'
823 searched_idx = multinames.index(TARGET_CLASSNAME)
824 searched_class_id = None
826 for class_id in range(class_count):
828 if name_idx == searched_idx:
829 # We found the class we're looking for!
830 searched_class_id = class_id
831 u30() # super_name idx
833 if flags & 0x08 != 0: # Protected namespace is present
834 u30() # protected_ns_idx
836 for _c2 in range(intrf_count):
840 for _c2 in range(trait_count):
843 if searched_class_id is None:
844 raise ExtractorError(u'Target class %r not found' %
849 for class_id in range(class_count):
852 for _c2 in range(trait_count):
853 trait_methods = parse_traits_info()
854 if class_id == searched_class_id:
855 method_names.update(trait_methods.items())
856 method_idxs.update(dict(
858 for name, idx in trait_methods.items()))
862 for _c in range(script_count):
865 for _c2 in range(trait_count):
869 method_body_count = u30()
870 Method = collections.namedtuple('Method', ['code', 'local_count'])
872 for _c in range(method_body_count):
876 u30() # init_scope_depth
877 u30() # max_scope_depth
879 code = read_bytes(code_length)
880 if method_idx in method_idxs:
881 m = Method(code, local_count)
882 methods[method_idxs[method_idx]] = m
883 exception_count = u30()
884 for _c2 in range(exception_count):
891 for _c2 in range(trait_count):
894 assert p + code_reader.tell() == len(code_tag)
895 assert len(methods) == len(method_idxs)
897 method_pyfunctions = {}
899 def extract_function(func_name):
900 if func_name in method_pyfunctions:
901 return method_pyfunctions[func_name]
902 if func_name not in methods:
903 raise ExtractorError(u'Cannot find function %r' % func_name)
904 m = methods[func_name]
907 registers = ['(this)'] + list(args) + [None] * m.local_count
909 coder = io.BytesIO(m.code)
911 opcode = struct.unpack('!B', coder.read(1))[0]
912 if opcode == 36: # pushbyte
913 v = struct.unpack('!B', coder.read(1))[0]
915 elif opcode == 44: # pushstring
917 stack.append(constant_strings[idx])
918 elif opcode == 48: # pushscope
919 # We don't implement the scope register, so we'll just
920 # ignore the popped value
922 elif opcode == 70: # callproperty
924 mname = multinames[index]
925 arg_count = u30(coder)
926 args = list(reversed(
927 [stack.pop() for _ in range(arg_count)]))
929 if mname == u'split':
930 assert len(args) == 1
931 assert isinstance(args[0], compat_str)
932 assert isinstance(obj, compat_str)
936 res = obj.split(args[0])
938 elif mname == u'slice':
939 assert len(args) == 1
940 assert isinstance(args[0], int)
941 assert isinstance(obj, list)
944 elif mname == u'join':
945 assert len(args) == 1
946 assert isinstance(args[0], compat_str)
947 assert isinstance(obj, list)
948 res = args[0].join(obj)
950 elif mname in method_pyfunctions:
951 stack.append(method_pyfunctions[mname](args))
953 raise NotImplementedError(
954 u'Unsupported property %r on %r'
956 elif opcode == 72: # returnvalue
959 elif opcode == 79: # callpropvoid
961 mname = multinames[index]
962 arg_count = u30(coder)
963 args = list(reversed(
964 [stack.pop() for _ in range(arg_count)]))
966 if mname == u'reverse':
967 assert isinstance(obj, list)
970 raise NotImplementedError(
971 u'Unsupported (void) property %r on %r'
973 elif opcode == 93: # findpropstrict
975 mname = multinames[index]
976 res = extract_function(mname)
978 elif opcode == 97: # setproperty
983 assert isinstance(obj, list)
984 assert isinstance(idx, int)
986 elif opcode == 98: # getlocal
988 stack.append(registers[index])
989 elif opcode == 99: # setlocal
992 registers[index] = value
993 elif opcode == 102: # getproperty
995 pname = multinames[index]
996 if pname == u'length':
998 assert isinstance(obj, list)
999 stack.append(len(obj))
1000 else: # Assume attribute access
1002 assert isinstance(idx, int)
1004 assert isinstance(obj, list)
1005 stack.append(obj[idx])
1006 elif opcode == 128: # coerce
1008 elif opcode == 133: # coerce_s
1009 assert isinstance(stack[-1], (type(None), compat_str))
1010 elif opcode == 164: # modulo
1011 value2 = stack.pop()
1012 value1 = stack.pop()
1013 res = value1 % value2
1015 elif opcode == 208: # getlocal_0
1016 stack.append(registers[0])
1017 elif opcode == 209: # getlocal_1
1018 stack.append(registers[1])
1019 elif opcode == 210: # getlocal_2
1020 stack.append(registers[2])
1021 elif opcode == 211: # getlocal_3
1022 stack.append(registers[3])
1023 elif opcode == 214: # setlocal_2
1024 registers[2] = stack.pop()
1025 elif opcode == 215: # setlocal_3
1026 registers[3] = stack.pop()
1028 raise NotImplementedError(
1029 u'Unsupported opcode %d' % opcode)
1031 method_pyfunctions[func_name] = resfunc
1034 initial_function = extract_function(u'decipher')
1035 return lambda s: initial_function([s])
1037 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1038 """Turn the encrypted s field into a working signature"""
1040 if player_url is not None:
1042 player_id = (player_url, len(s))
1043 if player_id not in self._player_cache:
1044 func = self._extract_signature_function(
1045 video_id, player_url, len(s)
1047 self._player_cache[player_id] = func
1048 func = self._player_cache[player_id]
1049 if self._downloader.params.get('youtube_print_sig_code'):
1050 self._print_sig_code(func, len(s))
1053 tb = traceback.format_exc()
1054 self._downloader.report_warning(
1055 u'Automatic signature extraction failed: ' + tb)
1057 self._downloader.report_warning(
1058 u'Warning: Falling back to static signature algorithm')
1060 return self._static_decrypt_signature(
1061 s, video_id, player_url, age_gate)
1063 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1065 # The videos with age protection use another player, so the
1066 # algorithms can be different.
1068 return s[2:63] + s[82] + s[64:82] + s[63]
1071 return s[86:29:-1] + s[88] + s[28:5:-1]
1073 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1075 return s[84:27:-1] + s[86] + s[26:5:-1]
1077 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1079 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1081 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1083 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1085 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1087 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1089 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1091 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1093 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
1095 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1097 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1099 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1102 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1104 def _get_available_subtitles(self, video_id):
1106 sub_list = self._download_webpage(
1107 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1108 video_id, note=False)
1109 except ExtractorError as err:
1110 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1112 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1117 params = compat_urllib_parse.urlencode({
1120 'fmt': self._downloader.params.get('subtitlesformat'),
1123 url = u'http://www.youtube.com/api/timedtext?' + params
1124 sub_lang_list[lang] = url
1125 if not sub_lang_list:
1126 self._downloader.report_warning(u'video doesn\'t have subtitles')
1128 return sub_lang_list
1130 def _get_available_automatic_caption(self, video_id, webpage):
1131 """We need the webpage for getting the captions url, pass it as an
1132 argument to speed up the process."""
1133 sub_format = self._downloader.params.get('subtitlesformat')
1134 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1135 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1136 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1138 self._downloader.report_warning(err_msg)
1140 player_config = json.loads(mobj.group(1))
1142 args = player_config[u'args']
1143 caption_url = args[u'ttsurl']
1144 timestamp = args[u'timestamp']
1145 # We get the available subtitles
1146 list_params = compat_urllib_parse.urlencode({
1151 list_url = caption_url + '&' + list_params
1152 list_page = self._download_webpage(list_url, video_id)
1153 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1154 original_lang_node = caption_list.find('track')
1155 if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' :
1156 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1158 original_lang = original_lang_node.attrib['lang_code']
1161 for lang_node in caption_list.findall('target'):
1162 sub_lang = lang_node.attrib['lang_code']
1163 params = compat_urllib_parse.urlencode({
1164 'lang': original_lang,
1170 sub_lang_list[sub_lang] = caption_url + '&' + params
1171 return sub_lang_list
1172 # An extractor error can be raise by the download process if there are
1173 # no automatic captions but there are subtitles
1174 except (KeyError, ExtractorError):
1175 self._downloader.report_warning(err_msg)
1178 def _print_formats(self, formats):
1179 print('Available formats:')
1181 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1182 self._video_dimensions.get(x, '???'),
1183 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1185 def _extract_id(self, url):
1186 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1188 raise ExtractorError(u'Invalid URL: %s' % url)
1189 video_id = mobj.group(2)
1192 def _get_video_url_list(self, url_map):
1194 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1195 with the requested formats.
1197 req_format = self._downloader.params.get('format', None)
1198 format_limit = self._downloader.params.get('format_limit', None)
1199 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1200 if format_limit is not None and format_limit in available_formats:
1201 format_list = available_formats[available_formats.index(format_limit):]
1203 format_list = available_formats
1204 existing_formats = [x for x in format_list if x in url_map]
1205 if len(existing_formats) == 0:
1206 raise ExtractorError(u'no known formats available for video')
1207 if self._downloader.params.get('listformats', None):
1208 self._print_formats(existing_formats)
1210 if req_format is None or req_format == 'best':
1211 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1212 elif req_format == 'worst':
1213 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1214 elif req_format in ('-1', 'all'):
1215 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1217 # Specific formats. We pick the first in a slash-delimeted sequence.
1218 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1219 # available in the specified format. For example,
1220 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1221 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1222 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1223 req_formats = req_format.split('/')
1224 video_url_list = None
1225 for rf in req_formats:
1227 video_url_list = [(rf, url_map[rf])]
1229 if rf in self._video_formats_map:
1230 for srf in self._video_formats_map[rf]:
1232 video_url_list = [(srf, url_map[srf])]
1237 if video_url_list is None:
1238 raise ExtractorError(u'requested format not available')
1239 return video_url_list
1241 def _extract_from_m3u8(self, manifest_url, video_id):
1243 def _get_urls(_manifest):
1244 lines = _manifest.split('\n')
1245 urls = filter(lambda l: l and not l.startswith('#'),
1248 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1249 formats_urls = _get_urls(manifest)
1250 for format_url in formats_urls:
1251 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1252 url_map[itag] = format_url
1255 def _extract_annotations(self, video_id):
1256 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1257 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1259 def _real_extract(self, url):
1260 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1261 mobj = re.search(self._NEXT_URL_RE, url)
1263 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1264 video_id = self._extract_id(url)
1267 self.report_video_webpage_download(video_id)
1268 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1269 request = compat_urllib_request.Request(url)
1271 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1272 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1273 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1275 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1277 # Attempt to extract SWF player URL
1278 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1279 if mobj is not None:
1280 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1285 self.report_video_info_webpage_download(video_id)
1286 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1287 self.report_age_confirmation()
1289 # We simulate the access to the video from www.youtube.com/v/{video_id}
1290 # this can be viewed without login into Youtube
1291 data = compat_urllib_parse.urlencode({'video_id': video_id,
1295 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1299 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1300 video_info_webpage = self._download_webpage(video_info_url, video_id,
1302 errnote='unable to download video info webpage')
1303 video_info = compat_parse_qs(video_info_webpage)
1306 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1307 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1308 % (video_id, el_type))
1309 video_info_webpage = self._download_webpage(video_info_url, video_id,
1311 errnote='unable to download video info webpage')
1312 video_info = compat_parse_qs(video_info_webpage)
1313 if 'token' in video_info:
1315 if 'token' not in video_info:
1316 if 'reason' in video_info:
1317 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1319 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1321 # Check for "rental" videos
1322 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1323 raise ExtractorError(u'"rental" videos not supported')
1325 # Start extracting information
1326 self.report_information_extraction(video_id)
1329 if 'author' not in video_info:
1330 raise ExtractorError(u'Unable to extract uploader name')
1331 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1334 video_uploader_id = None
1335 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1336 if mobj is not None:
1337 video_uploader_id = mobj.group(1)
1339 self._downloader.report_warning(u'unable to extract uploader nickname')
1342 if 'title' in video_info:
1343 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1345 self._downloader.report_warning(u'Unable to extract video title')
1349 # We try first to get a high quality image:
1350 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1351 video_webpage, re.DOTALL)
1352 if m_thumb is not None:
1353 video_thumbnail = m_thumb.group(1)
1354 elif 'thumbnail_url' not in video_info:
1355 self._downloader.report_warning(u'unable to extract video thumbnail')
1356 video_thumbnail = None
1357 else: # don't panic if we can't find it
1358 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1362 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1363 if mobj is not None:
1364 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1365 upload_date = unified_strdate(upload_date)
1368 video_description = get_element_by_id("eow-description", video_webpage)
1369 if video_description:
1370 video_description = clean_html(video_description)
1372 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1374 video_description = unescapeHTML(fd_mobj.group(1))
1376 video_description = u''
1379 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1381 if self._downloader.params.get('listsubtitles', False):
1382 self._list_available_subtitles(video_id, video_webpage)
1385 if 'length_seconds' not in video_info:
1386 self._downloader.report_warning(u'unable to extract video duration')
1389 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1392 video_annotations = None
1393 if self._downloader.params.get('writeannotations', False):
1394 video_annotations = self._extract_annotations(video_id)
1396 # Decide which formats to download
1399 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1401 raise ValueError('Could not find vevo ID')
1402 info = json.loads(mobj.group(1))
1404 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1405 # this signatures are encrypted
1406 if 'url_encoded_fmt_stream_map' not in args:
1407 raise ValueError(u'No stream_map present') # caught below
1408 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1410 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1411 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1412 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1414 if 'url_encoded_fmt_stream_map' in video_info:
1415 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1417 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1418 elif 'adaptive_fmts' in video_info:
1419 if 'url_encoded_fmt_stream_map' in video_info:
1420 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1422 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1426 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1427 self.report_rtmp_download()
1428 video_url_list = [(None, video_info['conn'][0])]
1429 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1430 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1431 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1433 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1434 url_data = compat_parse_qs(url_data_str)
1435 if 'itag' in url_data and 'url' in url_data:
1436 url = url_data['url'][0]
1437 if 'sig' in url_data:
1438 url += '&signature=' + url_data['sig'][0]
1439 elif 's' in url_data:
1440 encrypted_sig = url_data['s'][0]
1441 if self._downloader.params.get('verbose'):
1443 if player_url is None:
1444 player_version = 'unknown'
1446 player_version = self._search_regex(
1447 r'-(.+)\.swf$', player_url,
1448 u'flash player', fatal=False)
1449 player_desc = 'flash player %s' % player_version
1451 player_version = self._search_regex(
1452 r'html5player-(.+?)\.js', video_webpage,
1453 'html5 player', fatal=False)
1454 player_desc = u'html5 player %s' % player_version
1456 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1457 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1458 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1461 jsplayer_url_json = self._search_regex(
1462 r'"assets":.+?"js":\s*("[^"]+")',
1463 video_webpage, u'JS player URL')
1464 player_url = json.loads(jsplayer_url_json)
1466 signature = self._decrypt_signature(
1467 encrypted_sig, video_id, player_url, age_gate)
1468 url += '&signature=' + signature
1469 if 'ratebypass' not in url:
1470 url += '&ratebypass=yes'
1471 url_map[url_data['itag'][0]] = url
1472 video_url_list = self._get_video_url_list(url_map)
1473 if not video_url_list:
1475 elif video_info.get('hlsvp'):
1476 manifest_url = video_info['hlsvp'][0]
1477 url_map = self._extract_from_m3u8(manifest_url, video_id)
1478 video_url_list = self._get_video_url_list(url_map)
1479 if not video_url_list:
1483 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1486 for format_param, video_real_url in video_url_list:
1488 video_extension = self._video_extensions.get(format_param, 'flv')
1490 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1491 self._video_dimensions.get(format_param, '???'),
1492 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1496 'url': video_real_url,
1497 'uploader': video_uploader,
1498 'uploader_id': video_uploader_id,
1499 'upload_date': upload_date,
1500 'title': video_title,
1501 'ext': video_extension,
1502 'format': video_format,
1503 'thumbnail': video_thumbnail,
1504 'description': video_description,
1505 'player_url': player_url,
1506 'subtitles': video_subtitles,
1507 'duration': video_duration,
1508 'age_limit': 18 if age_gate else 0,
1509 'annotations': video_annotations
class YoutubePlaylistIE(InfoExtractor):
    """Extract all videos of a YouTube playlist via the GData API.

    Also matches watch URLs that carry a ``list=`` parameter; with
    --no-playlist those are downloaded as a single video instead.
    """
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    # Maximum page size accepted by the GData API.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE is mandatory
        # here; the default suitable() would not pass the flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
            else:
                self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Download playlist videos from API
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The GData API rejects start indices of 1000 and above.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                # Keep the playlist position so we can restore ordering below.
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Sort by playlist position, then keep just the watch URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all video ids of a YouTube channel by paging through its
    HTML video list and the JSON-based c4_browse_ajax continuation."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present in a page whenever another page of videos exists.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from an HTML fragment,
        in order of first appearance."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The widget HTML repeats the indicator while more pages remain.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # 'video_id' instead of 'id': don't shadow the builtin.
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The uploads feed returns at most this many entries per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match them too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            ids_in_page = []
            for entry in response['feed']['entry']:
                # The GData entry id is a URL; the video id is its last segment.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube via the GData JSON-C API ("ytsearch" keyword)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # 'limit' shrinks to the API's totalItems once known, so we stop
        # paging as soon as either n or the real result count is reached.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube') for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Expand a YouTube show page into one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        show_name = match.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            season_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(season_url, 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Offset added per page when requesting the next chunk of the feed.
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for the feed; '%s' placeholder takes the paging offset."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account, so a login is required up front.
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # orderedSet removes duplicates while keeping first-seen order.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            if info['paging'] is None:
                # The server signals the last page with a null paging token.
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # feed_name value sent to the feed_ajax endpoint by the base class.
    _FEED_NAME = 'subscriptions'
    # Title attached to the resulting playlist.
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # feed_name value sent to the feed_ajax endpoint by the base class.
    _FEED_NAME = 'recommended'
    # Title attached to the resulting playlist.
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    # feed_name value sent to the feed_ajax endpoint by the base class.
    _FEED_NAME = 'watch_later'
    # Title attached to the resulting playlist.
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is account-specific, so use the personal-feed action.
    # NOTE(review): sibling feed classes may also override _PAGING_STEP here;
    # confirm whether an override was dropped from this copy.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of an ordinary playlist; hand the
        # id over to YoutubePlaylistIE instead of scraping the page ourselves.
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1845 class YoutubeTruncatedURLIE(InfoExtractor):
1846 IE_NAME = 'youtube:truncated_url'
1847 IE_DESC = False # Do not list
1848 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1850 def _real_extract(self, url):
1851 raise ExtractorError(
1852 u'Did you forget to quote the URL? Remember that & is a meta '
1853 u'character in most shells, so you want to put the URL in quotes, '
1855 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1856 u' (or simply youtube-dl BaW_jenozKc ).',