14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
39 class YoutubeBaseInfoExtractor(InfoExtractor):
40 """Provide base functions for Youtube extractors"""
41 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
42 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
43 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
44 _NETRC_MACHINE = 'youtube'
45 # If True it will raise an error if no login info is provided
46 _LOGIN_REQUIRED = False
def report_lang(self):
    """Announce on screen that the extractor is switching the site language."""
    self.to_screen(u'Setting language')
52 def _set_language(self):
53 request = compat_urllib_request.Request(self._LANG_URL)
56 compat_urllib_request.urlopen(request).read()
57 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
58 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
63 (username, password) = self._get_login_info()
64 # No authentication to be performed
66 if self._LOGIN_REQUIRED:
67 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
70 request = compat_urllib_request.Request(self._LOGIN_URL)
72 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
73 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
74 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
77 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
78 login_page, u'Login GALX parameter')
82 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
86 u'PersistentCookie': u'yes',
88 u'bgresponse': u'js_disabled',
89 u'checkConnection': u'',
90 u'checkedDomains': u'youtube',
95 u'signIn': u'Sign in',
97 u'service': u'youtube',
101 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
103 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
104 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
105 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
108 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
109 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
110 self._downloader.report_warning(u'unable to log in: bad username or password')
112 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
113 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
117 def _confirm_age(self):
120 'action_confirm': 'Confirm',
122 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
124 self.report_age_confirmation()
125 compat_urllib_request.urlopen(request).read().decode('utf-8')
126 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
130 def _real_initialize(self):
131 if self._downloader is None:
133 if not self._set_language():
135 if not self._login():
140 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
141 IE_DESC = u'YouTube.com'
144 (?:https?://)? # http(s):// (optional)
145 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
146 tube\.majestyc\.net/|
147 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
148 (?:.*?\#/)? # handle anchor (#/) redirect urls
149 (?: # the various things that can precede the ID:
150 (?:(?:v|embed|e)/) # v/ or embed/ or e/
151 |(?: # or the v= param in all its forms
152 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
153 (?:\?|\#!?) # the params delimiter ? or # or #!
154 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
158 |youtu\.be/ # just youtu.be/xxxx
160 )? # all until now is optional -> you can pass the naked ID
161 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
162 (?(1).+)? # if we found the ID, everything can follow
164 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
165 # Listed in order of quality
166 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
167 # Apple HTTP Live Streaming
168 '96', '95', '94', '93', '92', '132', '151',
170 '85', '84', '102', '83', '101', '82', '100',
172 '138', '137', '248', '136', '247', '135', '246',
173 '245', '244', '134', '243', '133', '242', '160',
175 '141', '172', '140', '171', '139',
177 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
178 # Apple HTTP Live Streaming
179 '96', '95', '94', '93', '92', '132', '151',
181 '85', '102', '84', '101', '83', '100', '82',
183 '138', '248', '137', '247', '136', '246', '245',
184 '244', '135', '243', '134', '242', '133', '160',
186 '172', '141', '171', '140', '139',
188 _video_formats_map = {
189 'flv': ['35', '34', '6', '5'],
190 '3gp': ['36', '17', '13'],
191 'mp4': ['38', '37', '22', '18'],
192 'webm': ['46', '45', '44', '43'],
194 _video_extensions = {
216 # Apple HTTP Live Streaming
250 _video_dimensions = {
332 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
333 u"file": u"BaW_jenozKc.mp4",
335 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
336 u"uploader": u"Philipp Hagemeister",
337 u"uploader_id": u"phihag",
338 u"upload_date": u"20121002",
339 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
343 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
344 u"file": u"1ltcDfZMA3U.mp4",
345 u"note": u"Test VEVO video (#897)",
347 u"upload_date": u"20070518",
348 u"title": u"Maps - It Will Find You",
349 u"description": u"Music video by Maps performing It Will Find You.",
350 u"uploader": u"MuteUSA",
351 u"uploader_id": u"MuteUSA"
355 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
356 u"file": u"UxxajLWwzqY.mp4",
357 u"note": u"Test generic use_cipher_signature video (#897)",
359 u"upload_date": u"20120506",
360 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
361 u"description": u"md5:5b292926389560516e384ac437c0ec07",
362 u"uploader": u"Icona Pop",
363 u"uploader_id": u"IconaPop"
367 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
368 u"file": u"07FYdnEawAQ.mp4",
369 u"note": u"Test VEVO video with age protection (#956)",
371 u"upload_date": u"20130703",
372 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
373 u"description": u"md5:64249768eec3bc4276236606ea996373",
374 u"uploader": u"justintimberlakeVEVO",
375 u"uploader_id": u"justintimberlakeVEVO"
def suitable(cls, url):
    """Return True when this IE (and not the playlist IE) should handle *url*."""
    # Playlist URLs are deliberately delegated to YoutubePlaylistIE.
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def __init__(self, *args, **kwargs):
    """Set up the extractor and an empty per-player signature-function cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps (player_url, signature_length) -> decryption function.
    self._player_cache = {}
def report_video_webpage_download(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the get_video_info page for *video_id*."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested itag *format* is not offered for *video_id*."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Announce that the download will go over the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
411 def _extract_signature_function(self, video_id, player_url, slen):
412 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
414 player_type = id_m.group('ext')
415 player_id = id_m.group('id')
417 # Read from filesystem cache
418 func_id = '%s_%s_%d' % (player_type, player_id, slen)
419 assert os.path.basename(func_id) == func_id
420 cache_dir = get_cachedir(self._downloader.params)
422 cache_enabled = cache_dir is not None
424 cache_fn = os.path.join(os.path.expanduser(cache_dir),
428 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
429 cache_spec = json.load(cachef)
430 return lambda s: u''.join(s[i] for i in cache_spec)
432 pass # No cache available
434 if player_type == 'js':
435 code = self._download_webpage(
436 player_url, video_id,
437 note=u'Downloading %s player %s' % (player_type, player_id),
438 errnote=u'Download of %s failed' % player_url)
439 res = self._parse_sig_js(code)
440 elif player_type == 'swf':
441 urlh = self._request_webpage(
442 player_url, video_id,
443 note=u'Downloading %s player %s' % (player_type, player_id),
444 errnote=u'Download of %s failed' % player_url)
446 res = self._parse_sig_swf(code)
448 assert False, 'Invalid player type %r' % player_type
452 test_string = u''.join(map(compat_chr, range(slen)))
453 cache_res = res(test_string)
454 cache_spec = [ord(c) for c in cache_res]
456 os.makedirs(os.path.dirname(cache_fn))
457 except OSError as ose:
458 if ose.errno != errno.EEXIST:
460 write_json_file(cache_spec, cache_fn)
462 tb = traceback.format_exc()
463 self._downloader.report_warning(
464 u'Writing cache to %r failed: %s' % (cache_fn, tb))
468 def _print_sig_code(self, func, slen):
469 def gen_sig_code(idxs):
470 def _genslice(start, end, step):
471 starts = u'' if start == 0 else str(start)
472 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
473 steps = u'' if step == 1 else (u':%d' % step)
474 return u's[%s%s%s]' % (starts, ends, steps)
477 start = '(Never used)' # Quelch pyflakes warnings - start will be
478 # set as soon as step is set
479 for i, prev in zip(idxs[1:], idxs[:-1]):
483 yield _genslice(start, prev, step)
486 if i - prev in [-1, 1]:
491 yield u's[%d]' % prev
495 yield _genslice(start, i, step)
497 test_string = u''.join(map(compat_chr, range(slen)))
498 cache_res = func(test_string)
499 cache_spec = [ord(c) for c in cache_res]
500 expr_code = u' + '.join(gen_sig_code(cache_spec))
501 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
502 self.to_screen(u'Extracted signature function:\n' + code)
504 def _parse_sig_js(self, jscode):
505 funcname = self._search_regex(
506 r'signature=([a-zA-Z]+)', jscode,
507 u'Initial JS player signature function name')
512 return string.lowercase.index(varname)
514 def interpret_statement(stmt, local_vars, allow_recursion=20):
515 if allow_recursion < 0:
516 raise ExtractorError(u'Recursion limit reached')
518 if stmt.startswith(u'var '):
519 stmt = stmt[len(u'var '):]
520 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
521 r'=(?P<expr>.*)$', stmt)
523 if ass_m.groupdict().get('index'):
525 lvar = local_vars[ass_m.group('out')]
526 idx = interpret_expression(ass_m.group('index'),
527 local_vars, allow_recursion)
528 assert isinstance(idx, int)
531 expr = ass_m.group('expr')
534 local_vars[ass_m.group('out')] = val
536 expr = ass_m.group('expr')
537 elif stmt.startswith(u'return '):
539 expr = stmt[len(u'return '):]
541 raise ExtractorError(
542 u'Cannot determine left side of statement in %r' % stmt)
544 v = interpret_expression(expr, local_vars, allow_recursion)
547 def interpret_expression(expr, local_vars, allow_recursion):
552 return local_vars[expr]
554 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
556 member = m.group('member')
557 val = local_vars[m.group('in')]
558 if member == 'split("")':
560 if member == 'join("")':
562 if member == 'length':
564 if member == 'reverse()':
566 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
568 idx = interpret_expression(
569 slice_m.group('idx'), local_vars, allow_recursion-1)
573 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
575 val = local_vars[m.group('in')]
576 idx = interpret_expression(m.group('idx'), local_vars,
580 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
582 a = interpret_expression(m.group('a'),
583 local_vars, allow_recursion)
584 b = interpret_expression(m.group('b'),
585 local_vars, allow_recursion)
589 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
591 fname = m.group('func')
592 if fname not in functions:
593 functions[fname] = extract_function(fname)
594 argvals = [int(v) if v.isdigit() else local_vars[v]
595 for v in m.group('args').split(',')]
596 return functions[fname](argvals)
597 raise ExtractorError(u'Unsupported JS expression %r' % expr)
599 def extract_function(funcname):
601 r'function ' + re.escape(funcname) +
602 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
604 argnames = func_m.group('args').split(',')
607 local_vars = dict(zip(argnames, args))
608 for stmt in func_m.group('code').split(';'):
609 res = interpret_statement(stmt, local_vars)
613 initial_function = extract_function(funcname)
614 return lambda s: initial_function([s])
616 def _parse_sig_swf(self, file_contents):
617 if file_contents[1:3] != b'WS':
618 raise ExtractorError(
619 u'Not an SWF file; header is %r' % file_contents[:3])
620 if file_contents[:1] == b'C':
621 content = zlib.decompress(file_contents[8:])
623 raise NotImplementedError(u'Unsupported compression format %r' %
626 def extract_tags(content):
628 while pos < len(content):
629 header16 = struct.unpack('<H', content[pos:pos+2])[0]
631 tag_code = header16 >> 6
632 tag_len = header16 & 0x3f
634 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
636 assert pos+tag_len <= len(content)
637 yield (tag_code, content[pos:pos+tag_len])
641 for tag_code, tag in extract_tags(content)
643 p = code_tag.index(b'\0', 4) + 1
644 code_reader = io.BytesIO(code_tag[p:])
646 # Parse ABC (AVM2 ByteCode)
647 def read_int(reader=None):
655 b = struct.unpack('<B', buf)[0]
656 res = res | ((b & 0x7f) << shift)
662 def u30(reader=None):
663 res = read_int(reader)
664 assert res & 0xf0000000 == 0
668 def s32(reader=None):
670 if v & 0x80000000 != 0:
671 v = - ((v ^ 0xffffffff) + 1)
674 def read_string(reader=None):
678 resb = reader.read(slen)
679 assert len(resb) == slen
680 return resb.decode('utf-8')
682 def read_bytes(count, reader=None):
685 resb = reader.read(count)
686 assert len(resb) == count
689 def read_byte(reader=None):
690 resb = read_bytes(1, reader=reader)
691 res = struct.unpack('<B', resb)[0]
694 # minor_version + major_version
699 for _c in range(1, int_count):
702 for _c in range(1, uint_count):
705 read_bytes((double_count-1) * 8)
707 constant_strings = [u'']
708 for _c in range(1, string_count):
710 constant_strings.append(s)
711 namespace_count = u30()
712 for _c in range(1, namespace_count):
716 for _c in range(1, ns_set_count):
718 for _c2 in range(count):
720 multiname_count = u30()
729 0x0e: 2, # MultinameA
730 0x1b: 1, # MultinameL
731 0x1c: 1, # MultinameLA
734 for _c in range(1, multiname_count):
736 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
738 u30() # namespace_idx
740 multinames.append(constant_strings[name_idx])
742 multinames.append('[MULTINAME kind: %d]' % kind)
743 for _c2 in range(MULTINAME_SIZES[kind]):
748 MethodInfo = collections.namedtuple(
750 ['NEED_ARGUMENTS', 'NEED_REST'])
752 for method_id in range(method_count):
755 for _ in range(param_count):
757 u30() # name index (always 0 for youtube)
759 if flags & 0x08 != 0:
762 for c in range(option_count):
765 if flags & 0x80 != 0:
766 # Param names present
767 for _ in range(param_count):
769 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
770 method_infos.append(mi)
773 metadata_count = u30()
774 for _c in range(metadata_count):
777 for _c2 in range(item_count):
781 def parse_traits_info():
782 trait_name_idx = u30()
783 kind_full = read_byte()
784 kind = kind_full & 0x0f
785 attrs = kind_full >> 4
787 if kind in [0x00, 0x06]: # Slot or Const
789 u30() # type_name_idx
793 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
796 methods[multinames[trait_name_idx]] = method_idx
797 elif kind == 0x04: # Class
800 elif kind == 0x05: # Function
803 methods[function_idx] = multinames[trait_name_idx]
805 raise ExtractorError(u'Unsupported trait kind %d' % kind)
807 if attrs & 0x4 != 0: # Metadata present
808 metadata_count = u30()
809 for _c3 in range(metadata_count):
810 u30() # metadata index
815 TARGET_CLASSNAME = u'SignatureDecipher'
816 searched_idx = multinames.index(TARGET_CLASSNAME)
817 searched_class_id = None
819 for class_id in range(class_count):
821 if name_idx == searched_idx:
822 # We found the class we're looking for!
823 searched_class_id = class_id
824 u30() # super_name idx
826 if flags & 0x08 != 0: # Protected namespace is present
827 u30() # protected_ns_idx
829 for _c2 in range(intrf_count):
833 for _c2 in range(trait_count):
836 if searched_class_id is None:
837 raise ExtractorError(u'Target class %r not found' %
842 for class_id in range(class_count):
845 for _c2 in range(trait_count):
846 trait_methods = parse_traits_info()
847 if class_id == searched_class_id:
848 method_names.update(trait_methods.items())
849 method_idxs.update(dict(
851 for name, idx in trait_methods.items()))
855 for _c in range(script_count):
858 for _c2 in range(trait_count):
862 method_body_count = u30()
863 Method = collections.namedtuple('Method', ['code', 'local_count'])
865 for _c in range(method_body_count):
869 u30() # init_scope_depth
870 u30() # max_scope_depth
872 code = read_bytes(code_length)
873 if method_idx in method_idxs:
874 m = Method(code, local_count)
875 methods[method_idxs[method_idx]] = m
876 exception_count = u30()
877 for _c2 in range(exception_count):
884 for _c2 in range(trait_count):
887 assert p + code_reader.tell() == len(code_tag)
888 assert len(methods) == len(method_idxs)
890 method_pyfunctions = {}
892 def extract_function(func_name):
893 if func_name in method_pyfunctions:
894 return method_pyfunctions[func_name]
895 if func_name not in methods:
896 raise ExtractorError(u'Cannot find function %r' % func_name)
897 m = methods[func_name]
900 registers = ['(this)'] + list(args) + [None] * m.local_count
902 coder = io.BytesIO(m.code)
904 opcode = struct.unpack('!B', coder.read(1))[0]
905 if opcode == 36: # pushbyte
906 v = struct.unpack('!B', coder.read(1))[0]
908 elif opcode == 44: # pushstring
910 stack.append(constant_strings[idx])
911 elif opcode == 48: # pushscope
912 # We don't implement the scope register, so we'll just
913 # ignore the popped value
915 elif opcode == 70: # callproperty
917 mname = multinames[index]
918 arg_count = u30(coder)
919 args = list(reversed(
920 [stack.pop() for _ in range(arg_count)]))
922 if mname == u'split':
923 assert len(args) == 1
924 assert isinstance(args[0], compat_str)
925 assert isinstance(obj, compat_str)
929 res = obj.split(args[0])
931 elif mname == u'slice':
932 assert len(args) == 1
933 assert isinstance(args[0], int)
934 assert isinstance(obj, list)
937 elif mname == u'join':
938 assert len(args) == 1
939 assert isinstance(args[0], compat_str)
940 assert isinstance(obj, list)
941 res = args[0].join(obj)
943 elif mname in method_pyfunctions:
944 stack.append(method_pyfunctions[mname](args))
946 raise NotImplementedError(
947 u'Unsupported property %r on %r'
949 elif opcode == 72: # returnvalue
952 elif opcode == 79: # callpropvoid
954 mname = multinames[index]
955 arg_count = u30(coder)
956 args = list(reversed(
957 [stack.pop() for _ in range(arg_count)]))
959 if mname == u'reverse':
960 assert isinstance(obj, list)
963 raise NotImplementedError(
964 u'Unsupported (void) property %r on %r'
966 elif opcode == 93: # findpropstrict
968 mname = multinames[index]
969 res = extract_function(mname)
971 elif opcode == 97: # setproperty
976 assert isinstance(obj, list)
977 assert isinstance(idx, int)
979 elif opcode == 98: # getlocal
981 stack.append(registers[index])
982 elif opcode == 99: # setlocal
985 registers[index] = value
986 elif opcode == 102: # getproperty
988 pname = multinames[index]
989 if pname == u'length':
991 assert isinstance(obj, list)
992 stack.append(len(obj))
993 else: # Assume attribute access
995 assert isinstance(idx, int)
997 assert isinstance(obj, list)
998 stack.append(obj[idx])
999 elif opcode == 128: # coerce
1001 elif opcode == 133: # coerce_s
1002 assert isinstance(stack[-1], (type(None), compat_str))
1003 elif opcode == 164: # modulo
1004 value2 = stack.pop()
1005 value1 = stack.pop()
1006 res = value1 % value2
1008 elif opcode == 208: # getlocal_0
1009 stack.append(registers[0])
1010 elif opcode == 209: # getlocal_1
1011 stack.append(registers[1])
1012 elif opcode == 210: # getlocal_2
1013 stack.append(registers[2])
1014 elif opcode == 211: # getlocal_3
1015 stack.append(registers[3])
1016 elif opcode == 214: # setlocal_2
1017 registers[2] = stack.pop()
1018 elif opcode == 215: # setlocal_3
1019 registers[3] = stack.pop()
1021 raise NotImplementedError(
1022 u'Unsupported opcode %d' % opcode)
1024 method_pyfunctions[func_name] = resfunc
1027 initial_function = extract_function(u'decipher')
1028 return lambda s: initial_function([s])
1030 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1031 """Turn the encrypted s field into a working signature"""
1033 if player_url is not None:
1035 player_id = (player_url, len(s))
1036 if player_id not in self._player_cache:
1037 func = self._extract_signature_function(
1038 video_id, player_url, len(s)
1040 self._player_cache[player_id] = func
1041 func = self._player_cache[player_id]
1042 if self._downloader.params.get('youtube_print_sig_code'):
1043 self._print_sig_code(func, len(s))
1046 tb = traceback.format_exc()
1047 self._downloader.report_warning(
1048 u'Automatic signature extraction failed: ' + tb)
1050 self._downloader.report_warning(
1051 u'Warning: Falling back to static signature algorithm')
1053 return self._static_decrypt_signature(
1054 s, video_id, player_url, age_gate)
1056 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1058 # The videos with age protection use another player, so the
1059 # algorithms can be different.
1061 return s[2:63] + s[82] + s[64:82] + s[63]
1064 return s[86:29:-1] + s[88] + s[28:5:-1]
1066 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1068 return s[84:27:-1] + s[86] + s[26:5:-1]
1070 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1072 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1074 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1076 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1078 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1080 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1082 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1084 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1086 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
1088 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1090 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1092 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1095 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1097 def _get_available_subtitles(self, video_id):
1099 sub_list = self._download_webpage(
1100 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1101 video_id, note=False)
1102 except ExtractorError as err:
1103 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1105 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1110 params = compat_urllib_parse.urlencode({
1113 'fmt': self._downloader.params.get('subtitlesformat'),
1116 url = u'http://www.youtube.com/api/timedtext?' + params
1117 sub_lang_list[lang] = url
1118 if not sub_lang_list:
1119 self._downloader.report_warning(u'video doesn\'t have subtitles')
1121 return sub_lang_list
1123 def _get_available_automatic_caption(self, video_id, webpage):
1124 """We need the webpage for getting the captions url, pass it as an
1125 argument to speed up the process."""
1126 sub_format = self._downloader.params.get('subtitlesformat')
1127 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1128 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1129 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1131 self._downloader.report_warning(err_msg)
1133 player_config = json.loads(mobj.group(1))
1135 args = player_config[u'args']
1136 caption_url = args[u'ttsurl']
1137 timestamp = args[u'timestamp']
1138 # We get the available subtitles
1139 list_params = compat_urllib_parse.urlencode({
1144 list_url = caption_url + '&' + list_params
1145 list_page = self._download_webpage(list_url, video_id)
1146 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1147 original_lang_node = caption_list.find('track')
1148 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
1149 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1151 original_lang = original_lang_node.attrib['lang_code']
1154 for lang_node in caption_list.findall('target'):
1155 sub_lang = lang_node.attrib['lang_code']
1156 params = compat_urllib_parse.urlencode({
1157 'lang': original_lang,
1163 sub_lang_list[sub_lang] = caption_url + '&' + params
1164 return sub_lang_list
1165 # An extractor error can be raise by the download process if there are
1166 # no automatic captions but there are subtitles
1167 except (KeyError, ExtractorError):
1168 self._downloader.report_warning(err_msg)
1171 def _print_formats(self, formats):
1172 print('Available formats:')
1174 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1175 self._video_dimensions.get(x, '???'),
1176 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1178 def _extract_id(self, url):
1179 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1181 raise ExtractorError(u'Invalid URL: %s' % url)
1182 video_id = mobj.group(2)
1185 def _get_video_url_list(self, url_map):
1187 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1188 with the requested formats.
1190 req_format = self._downloader.params.get('format', None)
1191 format_limit = self._downloader.params.get('format_limit', None)
1192 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1193 if format_limit is not None and format_limit in available_formats:
1194 format_list = available_formats[available_formats.index(format_limit):]
1196 format_list = available_formats
1197 existing_formats = [x for x in format_list if x in url_map]
1198 if len(existing_formats) == 0:
1199 raise ExtractorError(u'no known formats available for video')
1200 if self._downloader.params.get('listformats', None):
1201 self._print_formats(existing_formats)
1203 if req_format is None or req_format == 'best':
1204 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1205 elif req_format == 'worst':
1206 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1207 elif req_format in ('-1', 'all'):
1208 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1210 # Specific formats. We pick the first in a slash-delimeted sequence.
1211 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1212 # available in the specified format. For example,
1213 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1214 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1215 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1216 req_formats = req_format.split('/')
1217 video_url_list = None
1218 for rf in req_formats:
1220 video_url_list = [(rf, url_map[rf])]
1222 if rf in self._video_formats_map:
1223 for srf in self._video_formats_map[rf]:
1225 video_url_list = [(srf, url_map[srf])]
1230 if video_url_list is None:
1231 raise ExtractorError(u'requested format not available')
1232 return video_url_list
1234 def _extract_from_m3u8(self, manifest_url, video_id):
1236 def _get_urls(_manifest):
1237 lines = _manifest.split('\n')
1238 urls = filter(lambda l: l and not l.startswith('#'),
1241 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1242 formats_urls = _get_urls(manifest)
1243 for format_url in formats_urls:
1244 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1245 url_map[itag] = format_url
1248 def _extract_annotations(self, video_id):
1249 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1250 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1252 def _real_extract(self, url):
1253 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1254 mobj = re.search(self._NEXT_URL_RE, url)
1256 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1257 video_id = self._extract_id(url)
1260 self.report_video_webpage_download(video_id)
1261 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1262 request = compat_urllib_request.Request(url)
1264 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1265 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1266 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1268 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1270 # Attempt to extract SWF player URL
1271 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1272 if mobj is not None:
1273 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1278 self.report_video_info_webpage_download(video_id)
1279 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1280 self.report_age_confirmation()
1282 # We simulate the access to the video from www.youtube.com/v/{video_id}
1283 # this can be viewed without login into Youtube
1284 data = compat_urllib_parse.urlencode({'video_id': video_id,
1288 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1292 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1293 video_info_webpage = self._download_webpage(video_info_url, video_id,
1295 errnote='unable to download video info webpage')
1296 video_info = compat_parse_qs(video_info_webpage)
1299 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1300 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1301 % (video_id, el_type))
1302 video_info_webpage = self._download_webpage(video_info_url, video_id,
1304 errnote='unable to download video info webpage')
1305 video_info = compat_parse_qs(video_info_webpage)
1306 if 'token' in video_info:
1308 if 'token' not in video_info:
1309 if 'reason' in video_info:
1310 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1312 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1314 # Check for "rental" videos
1315 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1316 raise ExtractorError(u'"rental" videos not supported')
1318 # Start extracting information
1319 self.report_information_extraction(video_id)
1322 if 'author' not in video_info:
1323 raise ExtractorError(u'Unable to extract uploader name')
1324 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1327 video_uploader_id = None
1328 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1329 if mobj is not None:
1330 video_uploader_id = mobj.group(1)
1332 self._downloader.report_warning(u'unable to extract uploader nickname')
1335 if 'title' in video_info:
1336 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1338 self._downloader.report_warning(u'Unable to extract video title')
1342 # We try first to get a high quality image:
1343 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1344 video_webpage, re.DOTALL)
1345 if m_thumb is not None:
1346 video_thumbnail = m_thumb.group(1)
1347 elif 'thumbnail_url' not in video_info:
1348 self._downloader.report_warning(u'unable to extract video thumbnail')
1349 video_thumbnail = None
1350 else: # don't panic if we can't find it
1351 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1355 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1356 if mobj is not None:
1357 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1358 upload_date = unified_strdate(upload_date)
1361 video_description = get_element_by_id("eow-description", video_webpage)
1362 if video_description:
1363 video_description = clean_html(video_description)
1365 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1367 video_description = unescapeHTML(fd_mobj.group(1))
1369 video_description = u''
1372 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1374 if self._downloader.params.get('listsubtitles', False):
1375 self._list_available_subtitles(video_id, video_webpage)
1378 if 'length_seconds' not in video_info:
1379 self._downloader.report_warning(u'unable to extract video duration')
1382 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1385 video_annotations = None
1386 if self._downloader.params.get('writeannotations', False):
1387 video_annotations = self._extract_annotations(video_id)
1389 # Decide which formats to download
1392 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1394 raise ValueError('Could not find vevo ID')
1395 info = json.loads(mobj.group(1))
1397 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1398 # this signatures are encrypted
1399 if 'url_encoded_fmt_stream_map' not in args:
1400 raise ValueError(u'No stream_map present') # caught below
1401 re_signature = re.compile(r'[&,]s=')
1402 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1404 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1405 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1406 m_s = re_signature.search(args.get('adaptive_fmts', u''))
1408 if 'adaptive_fmts' in video_info:
1409 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1411 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1415 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1416 self.report_rtmp_download()
1417 video_url_list = [(None, video_info['conn'][0])]
1418 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1419 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1420 if 'rtmpe%3Dyes' in encoded_url_map:
1421 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1423 for url_data_str in encoded_url_map.split(','):
1424 url_data = compat_parse_qs(url_data_str)
1425 if 'itag' in url_data and 'url' in url_data:
1426 url = url_data['url'][0]
1427 if 'sig' in url_data:
1428 url += '&signature=' + url_data['sig'][0]
1429 elif 's' in url_data:
1430 encrypted_sig = url_data['s'][0]
1431 if self._downloader.params.get('verbose'):
1433 if player_url is None:
1434 player_version = 'unknown'
1436 player_version = self._search_regex(
1437 r'-(.+)\.swf$', player_url,
1438 u'flash player', fatal=False)
1439 player_desc = 'flash player %s' % player_version
1441 player_version = self._search_regex(
1442 r'html5player-(.+?)\.js', video_webpage,
1443 'html5 player', fatal=False)
1444 player_desc = u'html5 player %s' % player_version
1446 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1447 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1448 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1451 jsplayer_url_json = self._search_regex(
1452 r'"assets":.+?"js":\s*("[^"]+")',
1453 video_webpage, u'JS player URL')
1454 player_url = json.loads(jsplayer_url_json)
1456 signature = self._decrypt_signature(
1457 encrypted_sig, video_id, player_url, age_gate)
1458 url += '&signature=' + signature
1459 if 'ratebypass' not in url:
1460 url += '&ratebypass=yes'
1461 url_map[url_data['itag'][0]] = url
1462 video_url_list = self._get_video_url_list(url_map)
1463 if not video_url_list:
1465 elif video_info.get('hlsvp'):
1466 manifest_url = video_info['hlsvp'][0]
1467 url_map = self._extract_from_m3u8(manifest_url, video_id)
1468 video_url_list = self._get_video_url_list(url_map)
1469 if not video_url_list:
1473 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1476 for itag, video_real_url in video_url_list:
1478 video_extension = self._video_extensions.get(itag, 'flv')
1480 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1481 self._video_dimensions.get(itag, '???'),
1482 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
1486 'url': video_real_url,
1487 'uploader': video_uploader,
1488 'uploader_id': video_uploader_id,
1489 'upload_date': upload_date,
1490 'title': video_title,
1491 'ext': video_extension,
1492 'format': video_format,
1494 'thumbnail': video_thumbnail,
1495 'description': video_description,
1496 'player_url': player_url,
1497 'subtitles': video_subtitles,
1498 'duration': video_duration,
1499 'age_limit': 18 if age_gate else 0,
1500 'annotations': video_annotations
class YoutubePlaylistIE(InfoExtractor):
    """Extractor for YouTube playlists (PL/EC/UU/FL ids and playlist-style URLs).

    Pages through the gdata playlist API, collects every video URL in
    playlist order and returns a single playlist result.
    """
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace and comments, so VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id: group(1) for playlist-style URLs, group(2) for bare ids.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
            else:
                # FIX: the captured id already carries its PL/EC/UU/FL prefix, so
                # do not prepend a hard-coded 'PL' (it was wrong for EC/UU/FL lists).
                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Download playlist videos from API as (position, url) pairs.
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The gdata API rejects start-index values of 1000 and above.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Sort by playlist position and drop the index again.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extractor for all videos of a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker in the HTML/JSON that signals further pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, deduplicated, in first-seen order."""
        ids_in_page = []
        # FIX: track seen ids in a set; `not in list` made this O(n^2) on big channels.
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop when the widget no longer advertises further pages.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Avoid shadowing the builtin `id` while building the watch URLs.
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extractor for a user's uploaded videos (user URLs or the "ytuser:" keyword)."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; our regex is too permissive and it would match them too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # FIX: an absent 'feed' key used to raise a bare KeyError; report it
            # with the same explicit error YoutubePlaylistIE uses.
            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers (last path component of each entry id)
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the "ytsearch" keyword, backed by the gdata search API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep fetching until we have
        # enough ids or the result set is exhausted.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # FIX: idiomatic `not in` instead of `not 'items' in ...`.
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            # FIX: comprehension instead of list(genexp); don't shadow builtin `id`.
            video_ids += [video['id'] for video in api_response['items']]

            # Never ask for more results than the API reports to exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season YouTube shows; yields one playlist per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        season_results = []
        for season_match in season_matches:
            season_url = 'https://www.youtube.com' + season_match.group(1)
            season_results.append(self.url_result(season_url, 'YoutubePlaylist'))
        return season_results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are per-account, so credentials are mandatory.
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """Feed URL template with a single %s placeholder for the paging offset."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        # Derived from the subclass's feed name, e.g. u'youtube:subscriptions'.
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # orderedSet keeps first-seen order while dropping duplicates.
            ids = orderedSet(m.group(1) for m in m_ids)
            # FIX: loop variable renamed so it no longer shadows the builtin `id`.
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            # A null 'paging' token marks the last page of the feed.
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions feed."""
    # FIX: added the missing space before "(requires", consistent with the
    # sibling feed extractors' IE_DESC strings.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Feed name plugged into the base class's _FEED_TEMPLATE and IE_NAME.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    # Feed name plugged into the base class's _FEED_TEMPLATE and IE_NAME.
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # The watch-later list is per-account: use the personal-feed ajax action.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are only visible to their owner, so credentials are mandatory.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist; scrape it
        # and delegate the actual extraction to YoutubePlaylistIE.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1836 class YoutubeTruncatedURLIE(InfoExtractor):
1837 IE_NAME = 'youtube:truncated_url'
1838 IE_DESC = False # Do not list
1839 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1841 def _real_extract(self, url):
1842 raise ExtractorError(
1843 u'Did you forget to quote the URL? Remember that & is a meta '
1844 u'character in most shells, so you want to put the URL in quotes, '
1846 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1847 u' (or simply youtube-dl BaW_jenozKc ).',