15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
21 compat_urllib_request,
28 get_element_by_attribute,
# Base class shared by the YouTube extractors: handles language selection,
# Google-account login and age-gate confirmation before extraction starts.
# NOTE(review): partial listing — the embedded original line numbers are
# non-contiguous, so interior lines are missing.  Code kept byte-identical.
36 class YoutubeBaseInfoExtractor(InfoExtractor):
37 """Provide base functions for Youtube extractors"""
38 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
39 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
40 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
41 _NETRC_MACHINE = 'youtube'
42 # If True it will raise an error if no login info is provided
43 _LOGIN_REQUIRED = False
# Forces the site language to English so later regex-based scraping sees
# predictable page text; returns True on success.
45 def _set_language(self):
46 return bool(self._download_webpage(
48 note=u'Setting language', errnote='unable to set language',
# _login (def line missing from this listing): fetches the Google login page,
# extracts the GALX token, posts the login form, and warns (without raising,
# unless _LOGIN_REQUIRED) when authentication fails or is unavailable.
52 (username, password) = self._get_login_info()
53 # No authentication to be performed
55 if self._LOGIN_REQUIRED:
56 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
59 login_page = self._download_webpage(
60 self._LOGIN_URL, None,
61 note=u'Downloading login page',
62 errnote=u'unable to fetch login page', fatal=False)
63 if login_page is False:
# GALX is an anti-CSRF token that must be echoed back in the login POST.
66 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
67 login_page, u'Login GALX parameter')
# Fragment of the login form dictionary (several keys missing from this view).
71 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
75 u'PersistentCookie': u'yes',
77 u'bgresponse': u'js_disabled',
78 u'checkConnection': u'',
79 u'checkedDomains': u'youtube',
84 u'signIn': u'Sign in',
86 u'service': u'youtube',
90 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
92 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
93 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
95 req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
96 login_results = self._download_webpage(
98 note=u'Logging in', errnote=u'unable to log in', fatal=False)
99 if login_results is False:
# If the login form is still present in the response, the login failed.
101 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
102 self._downloader.report_warning(u'unable to log in: bad username or password')
# Posts the age-verification confirmation form so age-gated pages render.
106 def _confirm_age(self):
109 'action_confirm': 'Confirm',
111 req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
113 self._download_webpage(
115 note=u'Confirming age', errnote=u'Unable to confirm age')
# Runs once before extraction: set language, log in, confirm age (tail of the
# method is missing from this listing).
118 def _real_initialize(self):
119 if self._downloader is None:
121 if not self._set_language():
123 if not self._login():
# Main YouTube video extractor.  Class-level data: URL pattern, the ordered
# itag tables used for format selection, per-itag dimension metadata, and the
# self-test fixtures.  NOTE(review): partial listing — interior lines missing
# (embedded numbering skips); code kept byte-identical, comments only added.
128 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
129 IE_DESC = u'YouTube.com'
# Verbose regex matching every known YouTube URL shape; group 2 (the last
# group below) captures the 11-character video ID.
130 _VALID_URL = r"""(?x)^
132 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
133 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
134 tube\.majestyc\.net/|
135 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
136 (?:.*?\#/)? # handle anchor (#/) redirect urls
137 (?: # the various things that can precede the ID:
138 (?:(?:v|embed|e)/) # v/ or embed/ or e/
139 |(?: # or the v= param in all its forms
140 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
141 (?:\?|\#!?) # the params delimiter ? or # or #!
142 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
146 |youtu\.be/ # just youtu.be/xxxx
148 )? # all until now is optional -> you can pass the naked ID
149 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
150 (?(1).+)? # if we found the ID, everything can follow
152 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
153 # Listed in order of quality
# itags in preference order, best first (list truncated in this listing).
154 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
155 # Apple HTTP Live Streaming
156 '96', '95', '94', '93', '92', '132', '151',
158 '85', '84', '102', '83', '101', '82', '100',
160 '138', '137', '248', '136', '247', '135', '246',
161 '245', '244', '134', '243', '133', '242', '160',
163 '141', '172', '140', '171', '139',
# itag -> container extension map (entries missing from this listing).
165 _video_extensions = {
187 # Apple HTTP Live Streaming
# itag -> resolution / display label, used when describing formats.
221 _video_dimensions = {
222 '5': {'width': 400, 'height': 240},
225 '17': {'width': 176, 'height': 144},
226 '18': {'width': 640, 'height': 360},
227 '22': {'width': 1280, 'height': 720},
228 '34': {'width': 640, 'height': 360},
229 '35': {'width': 854, 'height': 480},
230 '36': {'width': 320, 'height': 240},
231 '37': {'width': 1920, 'height': 1080},
232 '38': {'width': 4096, 'height': 3072},
233 '43': {'width': 640, 'height': 360},
234 '44': {'width': 854, 'height': 480},
235 '45': {'width': 1280, 'height': 720},
236 '46': {'width': 1920, 'height': 1080},
237 '82': {'height': 360, 'display': '360p'},
238 '83': {'height': 480, 'display': '480p'},
239 '84': {'height': 720, 'display': '720p'},
240 '85': {'height': 1080, 'display': '1080p'},
241 '92': {'height': 240, 'display': '240p'},
242 '93': {'height': 360, 'display': '360p'},
243 '94': {'height': 480, 'display': '480p'},
244 '95': {'height': 720, 'display': '720p'},
245 '96': {'height': 1080, 'display': '1080p'},
246 '100': {'height': 360, 'display': '360p'},
247 '101': {'height': 480, 'display': '480p'},
248 '102': {'height': 720, 'display': '720p'},
249 '132': {'height': 240, 'display': '240p'},
250 '151': {'height': 72, 'display': '72p'},
251 '133': {'height': 240, 'display': '240p'},
252 '134': {'height': 360, 'display': '360p'},
253 '135': {'height': 480, 'display': '480p'},
254 '136': {'height': 720, 'display': '720p'},
255 '137': {'height': 1080, 'display': '1080p'},
256 '138': {'height': 1081, 'display': '>1080p'},
257 '139': {'display': '48k'},
258 '140': {'display': '128k'},
259 '141': {'display': '256k'},
260 '160': {'height': 192, 'display': '192p'},
261 '171': {'display': '128k'},
262 '172': {'display': '256k'},
263 '242': {'height': 240, 'display': '240p'},
264 '243': {'height': 360, 'display': '360p'},
265 '244': {'height': 480, 'display': '480p'},
266 '245': {'height': 480, 'display': '480p'},
267 '246': {'height': 480, 'display': '480p'},
268 '247': {'height': 720, 'display': '720p'},
269 '248': {'height': 1080, 'display': '1080p'},
# Test fixtures (the _TESTS list header and dict braces are missing from
# this listing).  Each entry pins expected metadata for one known video.
303 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
304 u"file": u"BaW_jenozKc.mp4",
306 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
307 u"uploader": u"Philipp Hagemeister",
308 u"uploader_id": u"phihag",
309 u"upload_date": u"20121002",
310 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
314 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
315 u"file": u"UxxajLWwzqY.mp4",
316 u"note": u"Test generic use_cipher_signature video (#897)",
318 u"upload_date": u"20120506",
319 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
320 u"description": u"md5:5b292926389560516e384ac437c0ec07",
321 u"uploader": u"Icona Pop",
322 u"uploader_id": u"IconaPop"
326 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
327 u"file": u"07FYdnEawAQ.mp4",
328 u"note": u"Test VEVO video with age protection (#956)",
330 u"upload_date": u"20130703",
331 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
332 u"description": u"md5:64249768eec3bc4276236606ea996373",
333 u"uploader": u"justintimberlakeVEVO",
334 u"uploader_id": u"justintimberlakeVEVO"
338 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
339 u"file": u"yZIXLfi8CZQ.mp4",
340 u"note": u"Embed-only video (#1746)",
342 u"upload_date": u"20120608",
343 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
344 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
345 u"uploader": u"SET India",
346 u"uploader_id": u"setindia"
def suitable(cls, url):
    """Return True if this IE should handle *url*.

    Playlist URLs are explicitly refused so that YoutubePlaylistIE can
    claim them instead.
    """
    is_playlist = YoutubePlaylistIE.suitable(url)
    return (not is_playlist) and re.match(cls._VALID_URL, url) is not None
def __init__(self, *args, **kwargs):
    """Initialize the extractor and its per-instance signature cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps (player_url, signature_length) -> deciphering callable, so a
    # player binary is only parsed once per run.
    self._player_cache = {}
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage is about to be fetched."""
    self.to_screen(u'%s: Downloading video info webpage' % video_id)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    self.to_screen(u'%s: Extracting video information' % video_id)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested *format* is not available."""
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Announce that the download will go over the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
# Builds (and caches on disk) a Python callable that undoes YouTube's
# signature scrambling for a given player binary and signature length.
# NOTE(review): partial listing — interior lines missing (embedded numbering
# skips); code kept byte-identical, comments only added.
378 def _extract_signature_function(self, video_id, player_url, slen):
# Player id and type come from the player URL's filename ("...-<id>.<ext>").
379 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
381 player_type = id_m.group('ext')
382 player_id = id_m.group('id')
384 # Read from filesystem cache
385 func_id = '%s_%s_%d' % (player_type, player_id, slen)
386 assert os.path.basename(func_id) == func_id
387 cache_dir = get_cachedir(self._downloader.params)
389 cache_enabled = cache_dir is not None
391 cache_fn = os.path.join(os.path.expanduser(cache_dir),
# A cached spec is just a list of source indices; replaying it rebuilds the
# deciphered signature without re-downloading/parsing the player.
395 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
396 cache_spec = json.load(cachef)
397 return lambda s: u''.join(s[i] for i in cache_spec)
399 pass # No cache available
# Cache miss: download the player and extract the algorithm from its code.
401 if player_type == 'js':
402 code = self._download_webpage(
403 player_url, video_id,
404 note=u'Downloading %s player %s' % (player_type, player_id),
405 errnote=u'Download of %s failed' % player_url)
406 res = self._parse_sig_js(code)
407 elif player_type == 'swf':
408 urlh = self._request_webpage(
409 player_url, video_id,
410 note=u'Downloading %s player %s' % (player_type, player_id),
411 errnote=u'Download of %s failed' % player_url)
413 res = self._parse_sig_swf(code)
415 assert False, 'Invalid player type %r' % player_type
# Probe the extracted function with a known test string so the resulting
# index permutation can be persisted as JSON.
419 test_string = u''.join(map(compat_chr, range(slen)))
420 cache_res = res(test_string)
421 cache_spec = [ord(c) for c in cache_res]
423 os.makedirs(os.path.dirname(cache_fn))
424 except OSError as ose:
# EEXIST is fine: the cache directory was already there.
425 if ose.errno != errno.EEXIST:
427 write_json_file(cache_spec, cache_fn)
# Cache-write failures are non-fatal: warn and keep the live function.
429 tb = traceback.format_exc()
430 self._downloader.report_warning(
431 u'Writing cache to %r failed: %s' % (cache_fn, tb))
# Renders the extracted signature function as compact Python source (slice
# expressions joined with '+') so it can be pasted into
# _static_decrypt_signature.  NOTE(review): partial listing — interior lines
# missing (embedded numbering skips); code kept byte-identical.
435 def _print_sig_code(self, func, slen):
436 def gen_sig_code(idxs):
# Emits the shortest Python slice syntax for a run of indices.
437 def _genslice(start, end, step):
438 starts = u'' if start == 0 else str(start)
439 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
440 steps = u'' if step == 1 else (u':%d' % step)
441 return u's[%s%s%s]' % (starts, ends, steps)
444 start = '(Never used)' # Quelch pyflakes warnings - start will be
445 # set as soon as step is set
# Walk consecutive index pairs, coalescing arithmetic runs into slices.
446 for i, prev in zip(idxs[1:], idxs[:-1]):
450 yield _genslice(start, prev, step)
453 if i - prev in [-1, 1]:
458 yield u's[%d]' % prev
462 yield _genslice(start, i, step)
# Probe the function with a known string to recover its index permutation.
464 test_string = u''.join(map(compat_chr, range(slen)))
465 cache_res = func(test_string)
466 cache_spec = [ord(c) for c in cache_res]
467 expr_code = u' + '.join(gen_sig_code(cache_spec))
468 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
469 self.to_screen(u'Extracted signature function:\n' + code)
# Miniature JavaScript interpreter: locates the signature function in the
# player's JS source and turns it into a Python callable.  NOTE(review):
# partial listing — interior lines missing (embedded numbering skips);
# code kept byte-identical, comments only added.
471 def _parse_sig_js(self, jscode):
472 funcname = self._search_regex(
473 r'signature=([a-zA-Z]+)', jscode,
474 u'Initial JS player signature function name')
479 return string.lowercase.index(varname)
# Executes one JS statement ("var x=...", "a[i]=...", "return ...") against
# the local_vars environment; recursion is bounded to avoid runaway input.
481 def interpret_statement(stmt, local_vars, allow_recursion=20):
482 if allow_recursion < 0:
483 raise ExtractorError(u'Recursion limit reached')
485 if stmt.startswith(u'var '):
486 stmt = stmt[len(u'var '):]
487 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
488 r'=(?P<expr>.*)$', stmt)
490 if ass_m.groupdict().get('index'):
492 lvar = local_vars[ass_m.group('out')]
493 idx = interpret_expression(ass_m.group('index'),
494 local_vars, allow_recursion)
495 assert isinstance(idx, int)
498 expr = ass_m.group('expr')
501 local_vars[ass_m.group('out')] = val
503 expr = ass_m.group('expr')
505 elif stmt.startswith(u'return '):
506 expr = stmt[len(u'return '):]
508 raise ExtractorError(
509 u'Cannot determine left side of statement in %r' % stmt)
511 v = interpret_expression(expr, local_vars, allow_recursion)
# Evaluates a JS expression: variable lookup, member access (split/join/
# length/reverse/slice), indexing, the % operator, and function calls.
514 def interpret_expression(expr, local_vars, allow_recursion):
519 return local_vars[expr]
521 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
523 member = m.group('member')
524 val = local_vars[m.group('in')]
525 if member == 'split("")':
527 if member == 'join("")':
529 if member == 'length':
531 if member == 'reverse()':
533 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
535 idx = interpret_expression(
536 slice_m.group('idx'), local_vars, allow_recursion-1)
540 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
542 val = local_vars[m.group('in')]
543 idx = interpret_expression(m.group('idx'), local_vars,
547 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
549 a = interpret_expression(m.group('a'),
550 local_vars, allow_recursion)
551 b = interpret_expression(m.group('b'),
552 local_vars, allow_recursion)
# Function-call expressions: sub-functions are extracted lazily and memoized.
556 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
558 fname = m.group('func')
559 if fname not in functions:
560 functions[fname] = extract_function(fname)
561 argvals = [int(v) if v.isdigit() else local_vars[v]
562 for v in m.group('args').split(',')]
563 return functions[fname](argvals)
564 raise ExtractorError(u'Unsupported JS expression %r' % expr)
# Finds a named JS function's source, and wraps it as a Python callable that
# interprets its body statement by statement.
566 def extract_function(funcname):
568 r'function ' + re.escape(funcname) +
569 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
571 argnames = func_m.group('args').split(',')
574 local_vars = dict(zip(argnames, args))
575 for stmt in func_m.group('code').split(';'):
576 res = interpret_statement(stmt, local_vars)
580 initial_function = extract_function(funcname)
# The returned callable takes the scrambled signature string directly.
581 return lambda s: initial_function([s])
# Parses the Flash (SWF) player: decompresses it, walks its tags to find the
# ABC (AVM2 bytecode) block, decodes the constant pools / method tables, and
# finally interprets the bytecode of the 'decipher' method to produce a
# Python callable.  NOTE(review): partial listing — interior lines missing
# (embedded numbering skips); code kept byte-identical, comments only added.
583 def _parse_sig_swf(self, file_contents):
# SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed body).
584 if file_contents[1:3] != b'WS':
585 raise ExtractorError(
586 u'Not an SWF file; header is %r' % file_contents[:3])
587 if file_contents[:1] == b'C':
588 content = zlib.decompress(file_contents[8:])
590 raise NotImplementedError(u'Unsupported compression format %r' %
# Generator over (tag_code, tag_body) pairs of the SWF tag stream.
593 def extract_tags(content):
595 while pos < len(content):
596 header16 = struct.unpack('<H', content[pos:pos+2])[0]
598 tag_code = header16 >> 6
# A 6-bit length of 0x3f signals a long tag with a 32-bit length field.
599 tag_len = header16 & 0x3f
601 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
603 assert pos+tag_len <= len(content)
604 yield (tag_code, content[pos:pos+tag_len])
608 for tag_code, tag in extract_tags(content)
# Skip the NUL-terminated name that precedes the ABC payload.
610 p = code_tag.index(b'\0', 4) + 1
611 code_reader = io.BytesIO(code_tag[p:])
613 # Parse ABC (AVM2 ByteCode)
# Variable-length integer (LEB128-style, 7 bits per byte).
614 def read_int(reader=None):
622 b = struct.unpack('<B', buf)[0]
623 res = res | ((b & 0x7f) << shift)
# u30: unsigned 30-bit int per the ABC format.
629 def u30(reader=None):
630 res = read_int(reader)
631 assert res & 0xf0000000 == 0
# s32: sign-extend the 32-bit two's-complement value.
635 def s32(reader=None):
637 if v & 0x80000000 != 0:
638 v = - ((v ^ 0xffffffff) + 1)
641 def read_string(reader=None):
645 resb = reader.read(slen)
646 assert len(resb) == slen
647 return resb.decode('utf-8')
649 def read_bytes(count, reader=None):
652 resb = reader.read(count)
653 assert len(resb) == count
656 def read_byte(reader=None):
657 resb = read_bytes(1, reader=reader)
658 res = struct.unpack('<B', resb)[0]
# --- constant pool: ints, uints, doubles, strings, namespaces, multinames ---
661 # minor_version + major_version
666 for _c in range(1, int_count):
669 for _c in range(1, uint_count):
672 read_bytes((double_count-1) * 8)
674 constant_strings = [u'']
675 for _c in range(1, string_count):
677 constant_strings.append(s)
678 namespace_count = u30()
679 for _c in range(1, namespace_count):
683 for _c in range(1, ns_set_count):
685 for _c2 in range(count):
687 multiname_count = u30()
# Number of extra u30 fields to skip per multiname kind.
696 0x0e: 2, # MultinameA
697 0x1b: 1, # MultinameL
698 0x1c: 1, # MultinameLA
701 for _c in range(1, multiname_count):
703 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
705 u30() # namespace_idx
707 multinames.append(constant_strings[name_idx])
709 multinames.append('[MULTINAME kind: %d]' % kind)
710 for _c2 in range(MULTINAME_SIZES[kind]):
# --- method signatures ---
715 MethodInfo = collections.namedtuple(
717 ['NEED_ARGUMENTS', 'NEED_REST'])
719 for method_id in range(method_count):
722 for _ in range(param_count):
724 u30() # name index (always 0 for youtube)
726 if flags & 0x08 != 0:
729 for c in range(option_count):
732 if flags & 0x80 != 0:
733 # Param names present
734 for _ in range(param_count):
736 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
737 method_infos.append(mi)
# --- metadata (skipped) ---
740 metadata_count = u30()
741 for _c in range(metadata_count):
744 for _c2 in range(item_count):
# Reads one trait entry; records method-name -> method-index mappings and
# skips everything else.
748 def parse_traits_info():
749 trait_name_idx = u30()
750 kind_full = read_byte()
751 kind = kind_full & 0x0f
752 attrs = kind_full >> 4
754 if kind in [0x00, 0x06]: # Slot or Const
756 u30() # type_name_idx
760 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
763 methods[multinames[trait_name_idx]] = method_idx
764 elif kind == 0x04: # Class
767 elif kind == 0x05: # Function
770 methods[function_idx] = multinames[trait_name_idx]
772 raise ExtractorError(u'Unsupported trait kind %d' % kind)
774 if attrs & 0x4 != 0: # Metadata present
775 metadata_count = u30()
776 for _c3 in range(metadata_count):
777 u30() # metadata index
# --- instance definitions: locate the SignatureDecipher class ---
782 TARGET_CLASSNAME = u'SignatureDecipher'
783 searched_idx = multinames.index(TARGET_CLASSNAME)
784 searched_class_id = None
786 for class_id in range(class_count):
788 if name_idx == searched_idx:
789 # We found the class we're looking for!
790 searched_class_id = class_id
791 u30() # super_name idx
793 if flags & 0x08 != 0: # Protected namespace is present
794 u30() # protected_ns_idx
796 for _c2 in range(intrf_count):
800 for _c2 in range(trait_count):
803 if searched_class_id is None:
804 raise ExtractorError(u'Target class %r not found' %
# --- class definitions: collect method names of the target class ---
809 for class_id in range(class_count):
812 for _c2 in range(trait_count):
813 trait_methods = parse_traits_info()
814 if class_id == searched_class_id:
815 method_names.update(trait_methods.items())
816 method_idxs.update(dict(
818 for name, idx in trait_methods.items()))
# --- scripts (skipped) ---
822 for _c in range(script_count):
825 for _c2 in range(trait_count):
# --- method bodies: keep the bytecode of the methods we care about ---
829 method_body_count = u30()
830 Method = collections.namedtuple('Method', ['code', 'local_count'])
832 for _c in range(method_body_count):
836 u30() # init_scope_depth
837 u30() # max_scope_depth
839 code = read_bytes(code_length)
840 if method_idx in method_idxs:
841 m = Method(code, local_count)
842 methods[method_idxs[method_idx]] = m
843 exception_count = u30()
844 for _c2 in range(exception_count):
851 for _c2 in range(trait_count):
854 assert p + code_reader.tell() == len(code_tag)
855 assert len(methods) == len(method_idxs)
857 method_pyfunctions = {}
# Interprets the AVM2 bytecode of one method as a Python closure; results
# are memoized in method_pyfunctions.
859 def extract_function(func_name):
860 if func_name in method_pyfunctions:
861 return method_pyfunctions[func_name]
862 if func_name not in methods:
863 raise ExtractorError(u'Cannot find function %r' % func_name)
864 m = methods[func_name]
# Register 0 is 'this'; then the arguments; then locals.
867 registers = ['(this)'] + list(args) + [None] * m.local_count
869 coder = io.BytesIO(m.code)
# Stack-machine dispatch over the subset of opcodes the player uses.
871 opcode = struct.unpack('!B', coder.read(1))[0]
872 if opcode == 36: # pushbyte
873 v = struct.unpack('!B', coder.read(1))[0]
875 elif opcode == 44: # pushstring
877 stack.append(constant_strings[idx])
878 elif opcode == 48: # pushscope
879 # We don't implement the scope register, so we'll just
880 # ignore the popped value
882 elif opcode == 70: # callproperty
884 mname = multinames[index]
885 arg_count = u30(coder)
886 args = list(reversed(
887 [stack.pop() for _ in range(arg_count)]))
889 if mname == u'split':
890 assert len(args) == 1
891 assert isinstance(args[0], compat_str)
892 assert isinstance(obj, compat_str)
896 res = obj.split(args[0])
898 elif mname == u'slice':
899 assert len(args) == 1
900 assert isinstance(args[0], int)
901 assert isinstance(obj, list)
904 elif mname == u'join':
905 assert len(args) == 1
906 assert isinstance(args[0], compat_str)
907 assert isinstance(obj, list)
908 res = args[0].join(obj)
910 elif mname in method_pyfunctions:
911 stack.append(method_pyfunctions[mname](args))
913 raise NotImplementedError(
914 u'Unsupported property %r on %r'
916 elif opcode == 72: # returnvalue
919 elif opcode == 79: # callpropvoid
921 mname = multinames[index]
922 arg_count = u30(coder)
923 args = list(reversed(
924 [stack.pop() for _ in range(arg_count)]))
926 if mname == u'reverse':
927 assert isinstance(obj, list)
930 raise NotImplementedError(
931 u'Unsupported (void) property %r on %r'
933 elif opcode == 93: # findpropstrict
935 mname = multinames[index]
936 res = extract_function(mname)
938 elif opcode == 97: # setproperty
943 assert isinstance(obj, list)
944 assert isinstance(idx, int)
946 elif opcode == 98: # getlocal
948 stack.append(registers[index])
949 elif opcode == 99: # setlocal
952 registers[index] = value
953 elif opcode == 102: # getproperty
955 pname = multinames[index]
956 if pname == u'length':
958 assert isinstance(obj, list)
959 stack.append(len(obj))
960 else: # Assume attribute access
962 assert isinstance(idx, int)
964 assert isinstance(obj, list)
965 stack.append(obj[idx])
966 elif opcode == 128: # coerce
968 elif opcode == 133: # coerce_s
969 assert isinstance(stack[-1], (type(None), compat_str))
970 elif opcode == 164: # modulo
973 res = value1 % value2
975 elif opcode == 208: # getlocal_0
976 stack.append(registers[0])
977 elif opcode == 209: # getlocal_1
978 stack.append(registers[1])
979 elif opcode == 210: # getlocal_2
980 stack.append(registers[2])
981 elif opcode == 211: # getlocal_3
982 stack.append(registers[3])
983 elif opcode == 214: # setlocal_2
984 registers[2] = stack.pop()
985 elif opcode == 215: # setlocal_3
986 registers[3] = stack.pop()
988 raise NotImplementedError(
989 u'Unsupported opcode %d' % opcode)
991 method_pyfunctions[func_name] = resfunc
# The SWF entry point is always the 'decipher' method.
994 initial_function = extract_function(u'decipher')
995 return lambda s: initial_function([s])
# NOTE(review): partial listing — the try/except frame around the automatic
# path is missing (embedded numbering skips); code kept byte-identical.
997 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
998 """Turn the encrypted s field into a working signature"""
# Preferred path: extract the algorithm from the player itself (cached per
# (player_url, signature length)); on failure, fall back to the static table.
1000 if player_url is not None:
1001 if player_url.startswith(u'//'):
1002 player_url = u'https:' + player_url
1004 player_id = (player_url, len(s))
1005 if player_id not in self._player_cache:
1006 func = self._extract_signature_function(
1007 video_id, player_url, len(s)
1009 self._player_cache[player_id] = func
1010 func = self._player_cache[player_id]
1011 if self._downloader.params.get('youtube_print_sig_code'):
1012 self._print_sig_code(func, len(s))
# Extraction failures are downgraded to warnings; the static fallback runs.
1015 tb = traceback.format_exc()
1016 self._downloader.report_warning(
1017 u'Automatic signature extraction failed: ' + tb)
1019 self._downloader.report_warning(
1020 u'Warning: Falling back to static signature algorithm')
1022 return self._static_decrypt_signature(
1023 s, video_id, player_url, age_gate)
# Hard-coded fallback permutations, one per signature length.
# NOTE(review): partial listing — the `if len(s) == N:` guard lines that
# select each return are missing (embedded numbering skips); code kept
# byte-identical, comments only added.
1025 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1027 # The videos with age protection use another player, so the
1028 # algorithms can be different.
1030 return s[2:63] + s[82] + s[64:82] + s[63]
1033 return s[86:29:-1] + s[88] + s[28:5:-1]
1035 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1037 return s[84:27:-1] + s[86] + s[26:5:-1]
1039 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1041 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1043 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1045 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1047 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1049 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1051 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1053 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1055 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
1057 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1059 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1061 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
# Unknown signature length: signal that a retry (fresh player) might help.
1064 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
# Returns {language_code: timedtext_url} for the video's manual subtitles;
# warns and returns an empty mapping when none exist.  NOTE(review): partial
# listing — interior lines missing (embedded numbering skips); code kept
# byte-identical, comments only added.
1066 def _get_available_subtitles(self, video_id, webpage):
1068 sub_list = self._download_webpage(
1069 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1070 video_id, note=False)
1071 except ExtractorError as err:
1072 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
# Each entry carries a track name and a language code.
1074 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1079 params = compat_urllib_parse.urlencode({
1082 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
1083 'name': l[0].encode('utf-8'),
1085 url = u'http://www.youtube.com/api/timedtext?' + params
1086 sub_lang_list[lang] = url
1087 if not sub_lang_list:
1088 self._downloader.report_warning(u'video doesn\'t have subtitles')
1090 return sub_lang_list
# Returns {language_code: caption_url} for auto-generated (ASR) captions,
# using the ttsurl/timestamp embedded in the player config on the webpage.
# NOTE(review): partial listing — interior lines missing (embedded numbering
# skips); code kept byte-identical, comments only added.
1092 def _get_available_automatic_caption(self, video_id, webpage):
1093 """We need the webpage for getting the captions url, pass it as an
1094 argument to speed up the process."""
1095 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
1096 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1097 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1098 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1100 self._downloader.report_warning(err_msg)
1102 player_config = json.loads(mobj.group(1))
1104 args = player_config[u'args']
1105 caption_url = args[u'ttsurl']
1106 timestamp = args[u'timestamp']
1107 # We get the available subtitles
1108 list_params = compat_urllib_parse.urlencode({
1113 list_url = caption_url + '&' + list_params
1114 caption_list = self._download_xml(list_url, video_id)
# Captions are only "automatic" if the original track is marked kind="asr".
1115 original_lang_node = caption_list.find('track')
1116 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
1117 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1119 original_lang = original_lang_node.attrib['lang_code']
# One translated caption URL per available target language.
1122 for lang_node in caption_list.findall('target'):
1123 sub_lang = lang_node.attrib['lang_code']
1124 params = compat_urllib_parse.urlencode({
1125 'lang': original_lang,
1131 sub_lang_list[sub_lang] = caption_url + '&' + params
1132 return sub_lang_list
1133 # An extractor error can be raised by the download process if there are
1134 # no automatic captions but there are subtitles
1135 except (KeyError, ExtractorError):
1136 self._downloader.report_warning(err_msg)
# Extracts the 11-character video ID (capture group 2 of _VALID_URL) from a
# URL.  NOTE(review): partial listing — the `if mobj is None:` guard and the
# trailing return are missing (embedded numbering skips); kept byte-identical.
1139 def _extract_id(self, url):
1140 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1142 raise ExtractorError(u'Invalid URL: %s' % url)
1143 video_id = mobj.group(2)
def _get_video_url_list(self, url_map):
    """
    Transform a dictionary in the format {itag: url} to a list of (itag, url)
    tuples with the requested formats, ordered worst to best quality.

    Raises ExtractorError when none of the known itags is present.
    """
    # _available_formats is listed best-first; keep only itags we know.
    existing_formats = [x for x in self._available_formats if x in url_map]
    if not existing_formats:
        raise ExtractorError(u'no known formats available for video')
    video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
    video_url_list.reverse()  # order worst to best
    return video_url_list
# Downloads an HLS (.m3u8) manifest and builds {itag: stream_url} from the
# non-comment lines, parsing the itag out of each URL.  NOTE(review): partial
# listing — interior lines missing (embedded numbering skips, e.g. the
# url_map initialisation and return); code kept byte-identical.
1158 def _extract_from_m3u8(self, manifest_url, video_id):
1160 def _get_urls(_manifest):
1161 lines = _manifest.split('\n')
# Manifest lines starting with '#' are directives/comments, not URLs.
1162 urls = filter(lambda l: l and not l.startswith('#'),
1165 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1166 formats_urls = _get_urls(manifest)
1167 for format_url in formats_urls:
1168 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1169 url_map[itag] = format_url
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document for *video_id*."""
    annotations_url = ('https://www.youtube.com/annotations_invideo'
                       '?features=1&legacy=1&video_id=%s' % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
1176 def _real_extract(self, url):
1177 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1178 mobj = re.search(self._NEXT_URL_RE, url)
1180 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1181 video_id = self._extract_id(url)
1184 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1185 video_webpage = self._download_webpage(url, video_id)
1187 # Attempt to extract SWF player URL
1188 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1189 if mobj is not None:
1190 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1195 self.report_video_info_webpage_download(video_id)
1196 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1197 self.report_age_confirmation()
1199 # We simulate the access to the video from www.youtube.com/v/{video_id}
1200 # this can be viewed without login into Youtube
1201 data = compat_urllib_parse.urlencode({'video_id': video_id,
1202 'el': 'player_embedded',
1205 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1209 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1210 video_info_webpage = self._download_webpage(video_info_url, video_id,
1212 errnote='unable to download video info webpage')
1213 video_info = compat_parse_qs(video_info_webpage)
1216 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1217 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1218 % (video_id, el_type))
1219 video_info_webpage = self._download_webpage(video_info_url, video_id,
1221 errnote='unable to download video info webpage')
1222 video_info = compat_parse_qs(video_info_webpage)
1223 if 'token' in video_info:
1225 if 'token' not in video_info:
1226 if 'reason' in video_info:
1227 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1229 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1231 if 'view_count' in video_info:
1232 view_count = int(video_info['view_count'][0])
1236 # Check for "rental" videos
1237 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1238 raise ExtractorError(u'"rental" videos not supported')
1240 # Start extracting information
1241 self.report_information_extraction(video_id)
1244 if 'author' not in video_info:
1245 raise ExtractorError(u'Unable to extract uploader name')
1246 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1249 video_uploader_id = None
1250 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1251 if mobj is not None:
1252 video_uploader_id = mobj.group(1)
1254 self._downloader.report_warning(u'unable to extract uploader nickname')
1257 if 'title' in video_info:
1258 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1260 self._downloader.report_warning(u'Unable to extract video title')
1264 # We try first to get a high quality image:
1265 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1266 video_webpage, re.DOTALL)
1267 if m_thumb is not None:
1268 video_thumbnail = m_thumb.group(1)
1269 elif 'thumbnail_url' not in video_info:
1270 self._downloader.report_warning(u'unable to extract video thumbnail')
1271 video_thumbnail = None
1272 else: # don't panic if we can't find it
1273 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1277 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1278 if mobj is not None:
1279 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1280 upload_date = unified_strdate(upload_date)
1283 video_description = get_element_by_id("eow-description", video_webpage)
1284 if video_description:
1285 video_description = re.sub(r'''(?x)
1287 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1289 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1290 class="yt-uix-redirect-link"\s*>
1293 ''', r'\1', video_description)
1294 video_description = clean_html(video_description)
1296 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1298 video_description = unescapeHTML(fd_mobj.group(1))
1300 video_description = u''
1302 def _extract_count(klass):
1303 count = self._search_regex(
1304 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1305 video_webpage, klass, default=None)
1306 if count is not None:
1307 return int(count.replace(',', ''))
1309 like_count = _extract_count(u'likes-count')
1310 dislike_count = _extract_count(u'dislikes-count')
1313 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1315 if self._downloader.params.get('listsubtitles', False):
1316 self._list_available_subtitles(video_id, video_webpage)
1319 if 'length_seconds' not in video_info:
1320 self._downloader.report_warning(u'unable to extract video duration')
1321 video_duration = None
1323 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
1326 video_annotations = None
1327 if self._downloader.params.get('writeannotations', False):
1328 video_annotations = self._extract_annotations(video_id)
1330 # Decide which formats to download
1333 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1335 raise ValueError('Could not find vevo ID')
1336 info = json.loads(mobj.group(1))
1338 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1339 # this signatures are encrypted
1340 if 'url_encoded_fmt_stream_map' not in args:
1341 raise ValueError(u'No stream_map present') # caught below
1342 re_signature = re.compile(r'[&,]s=')
1343 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1345 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1346 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1347 m_s = re_signature.search(args.get('adaptive_fmts', u''))
1349 if 'adaptive_fmts' in video_info:
1350 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1352 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1356 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1357 self.report_rtmp_download()
1358 video_url_list = [(None, video_info['conn'][0])]
1359 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1360 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1361 if 'rtmpe%3Dyes' in encoded_url_map:
1362 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1364 for url_data_str in encoded_url_map.split(','):
1365 url_data = compat_parse_qs(url_data_str)
1366 if 'itag' in url_data and 'url' in url_data:
1367 url = url_data['url'][0]
1368 if 'sig' in url_data:
1369 url += '&signature=' + url_data['sig'][0]
1370 elif 's' in url_data:
1371 encrypted_sig = url_data['s'][0]
1372 if self._downloader.params.get('verbose'):
1374 if player_url is None:
1375 player_version = 'unknown'
1377 player_version = self._search_regex(
1378 r'-(.+)\.swf$', player_url,
1379 u'flash player', fatal=False)
1380 player_desc = 'flash player %s' % player_version
1382 player_version = self._search_regex(
1383 r'html5player-(.+?)\.js', video_webpage,
1384 'html5 player', fatal=False)
1385 player_desc = u'html5 player %s' % player_version
1387 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1388 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1389 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1392 jsplayer_url_json = self._search_regex(
1393 r'"assets":.+?"js":\s*("[^"]+")',
1394 video_webpage, u'JS player URL')
1395 player_url = json.loads(jsplayer_url_json)
1397 signature = self._decrypt_signature(
1398 encrypted_sig, video_id, player_url, age_gate)
1399 url += '&signature=' + signature
1400 if 'ratebypass' not in url:
1401 url += '&ratebypass=yes'
1402 url_map[url_data['itag'][0]] = url
1403 video_url_list = self._get_video_url_list(url_map)
1404 elif video_info.get('hlsvp'):
1405 manifest_url = video_info['hlsvp'][0]
1406 url_map = self._extract_from_m3u8(manifest_url, video_id)
1407 video_url_list = self._get_video_url_list(url_map)
1409 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1412 for itag, video_real_url in video_url_list:
1414 video_extension = self._video_extensions.get(itag, 'flv')
1415 resolution = self._video_dimensions.get(itag, {}).get('display')
1416 width = self._video_dimensions.get(itag, {}).get('width')
1417 height = self._video_dimensions.get(itag, {}).get('height')
1418 note = self._special_itags.get(itag)
1420 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1421 '%dx%d' % (width, height) if width is not None and height is not None else (resolution if resolution is not None else '???'),
1422 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
1425 'url': video_real_url,
1426 'ext': video_extension,
1427 'format': video_format,
1429 'player_url': player_url,
1430 '_resolution': resolution,
1433 'format_note': note,
1436 self._sort_formats(formats)
1440 'uploader': video_uploader,
1441 'uploader_id': video_uploader_id,
1442 'upload_date': upload_date,
1443 'title': video_title,
1444 'thumbnail': video_thumbnail,
1445 'description': video_description,
1446 'subtitles': video_subtitles,
1447 'duration': video_duration,
1448 'age_limit': 18 if age_gate else 0,
1449 'annotations': video_annotations,
1450 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1451 'view_count': view_count,
1452 'like_count': like_count,
1453 'dislike_count': dislike_count,
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extract all entries of a YouTube playlist (also mixes and courses)."""
    IE_DESC = u'YouTube.com playlists'
    # Verbose regex: matched with re.VERBOSE in suitable()/_real_extract().
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def _real_initialize(self):
    def _ids_to_results(self, ids):
        # Wrap each bare video id into a 'url' result entry for the Youtube IE.
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video;
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        # The mix title appears under one of two class names depending on length.
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
            get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, title)
    def _real_extract(self, url):
        """Resolve a playlist URL: honour --no-playlist for video-specific
        URLs, dispatch 'RD' mixes and 'TL' top lists, otherwise page through
        the playlist collecting video ids."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
                self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
        # Extract the video ids from the playlist pages
        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        playlist_title = self._og_search_title(page)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeTopListIE(YoutubePlaylistIE):
    """Handle the 'yttoplist:<channel>:<list title>' pseudo-URL scheme."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # Find the playlist link on the channel page by looking for the
        # urlencoded list title inside an href attribute.
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)
        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for i in itertools.count(0):
            msg = u'Downloading Youtube mix'
                msg += ', retry #%d' % i
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
class YoutubeChannelIE(InfoExtractor):
    """Extract every video of a YouTube channel (/channel/<id> URLs)."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'
    def extract_videos_from_page(self, page):
        # Collect distinct video ids linked from a channel page, preserving
        # first-seen order.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Download channel page
        channel_id = mobj.group(1)
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Detect auto-generated channels from their CSS class markers.
        autogenerated = re.search(r'''(?x)
                channel-header-autogenerated-label|
                yt-channel-title-autogenerated
            )[^"]*"''', channel_page) is not None
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user (URL or 'ytuser:' keyword)."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractor, the regex would is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)
        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        for pagenum in itertools.count(0):
            # start-index of the gdata API is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The gdata entry id is a URL; the video id is its last segment.
                video_id = entry['id']['$t'].split('/')[-1]
                url_results.append({
                    'ie_key': 'Youtube',
            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            if len(entries) < self._GDATA_PAGE_SIZE:
        return self.playlist_result(url_results, playlist_title=username)
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the 'ytsearch' keyword, backed by the gdata API."""
    IE_DESC = u'YouTube.com searches'
    # Pages of 50 results each; %s is the quoted query, %i the 1-based start index.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        while (50 * pagenum) < limit:
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')
            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids
            # Never ask for more than the API reports to exist.
            limit = min(n, api_response['totalItems'])
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the search extractor that returns the newest videos first."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same gdata endpoint as the parent, with orderby=published appended.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
class YoutubeShowIE(InfoExtractor):
    """Extract all seasons of a YouTube show as playlist results."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        season_results = []
        for season in season_matches:
            season_results.append(self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return season_results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    # Feeds are per-account, so logging in is mandatory.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False
    def _FEED_TEMPLATE(self):
        # Build the feed_ajax URL; '%%s' leaves a literal %s placeholder that
        # _real_extract fills in with the paging token.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
        return u'youtube:%s' % self._FEED_NAME
    def _real_initialize(self):
    def _real_extract(self, url):
        # Page through the feed, accumulating entries until no paging token
        # is returned.
        for i in itertools.count(1):
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            if info['paging'] is None:
            paging = info['paging']
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions."""
    # Fixed missing space before the parenthesis so the help text matches the
    # sibling feed extractors ('... keyword (requires authentication)').
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommended videos."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's 'Watch Later' list."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # The watch-later list is account-specific, so use the personal feed action.
    _PERSONAL_FEED = True
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's watch history."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Use a raw string like every sibling extractor: the previous u'...' form
    # only worked because '\.' is an unrecognized escape sequence, which is a
    # DeprecationWarning on Python 3.6+ and an error in later versions.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # Watch history is account-specific, so use the personal feed action.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist; pull it
        # out of the markup and delegate to the playlist extractor.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1849 class YoutubeTruncatedURLIE(InfoExtractor):
1850 IE_NAME = 'youtube:truncated_url'
1851 IE_DESC = False # Do not list
1852 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1854 def _real_extract(self, url):
1855 raise ExtractorError(
1856 u'Did you forget to quote the URL? Remember that & is a meta '
1857 u'character in most shells, so you want to put the URL in quotes, '
1859 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1860 u' (or simply youtube-dl BaW_jenozKc ).',