15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
21 compat_urllib_request,
28 get_element_by_attribute,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account sign-in endpoint (YouTube uses Google accounts)
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Pins the site to English/US so later regex-based scraping sees stable markup
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Endpoint used to confirm the viewer's age for age-gated videos
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name under which credentials are looked up in ~/.netrc
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # A truthy download result means the language preference was applied
        return bool(self._download_webpage(
            note=u'Setting language', errnote='unable to set language',

        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:

        # GALX is a hidden anti-forgery token that must be echoed back in the form
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Fields of the sign-in form POSTed back to _LOGIN_URL
        u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')

    def _confirm_age(self):
        # POST the confirmation form so age-gated pages become reachable
            'action_confirm': 'Confirm',
        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))

        self._download_webpage(
            note=u'Confirming age', errnote=u'Unable to confirm age')

    def _real_initialize(self):
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (re.X) pattern covering watch/embed/short/googleapis URL forms;
    # the 11-character video id is captured by the ([0-9A-Za-z_-]{11}) group
    _VALID_URL = r"""(?x)^
                     (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
                     (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                     |youtu\.be/ # just youtu.be/xxxx
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
    # Matches the next_url redirect parameter (used by age-verification pages)
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Maps itag -> container/extension
    _video_extensions = {
        # Apple HTTP Live Streaming
    # Maps itag -> pixel dimensions or display label, used for format listings
    _video_dimensions = {
        '5': {'width': 400, 'height': 240},
        '17': {'width': 176, 'height': 144},
        '18': {'width': 640, 'height': 360},
        '22': {'width': 1280, 'height': 720},
        '34': {'width': 640, 'height': 360},
        '35': {'width': 854, 'height': 480},
        '36': {'width': 320, 'height': 240},
        '37': {'width': 1920, 'height': 1080},
        '38': {'width': 4096, 'height': 3072},
        '43': {'width': 640, 'height': 360},
        '44': {'width': 854, 'height': 480},
        '45': {'width': 1280, 'height': 720},
        '46': {'width': 1920, 'height': 1080},
        '82': {'height': 360, 'display': '360p'},
        '83': {'height': 480, 'display': '480p'},
        '84': {'height': 720, 'display': '720p'},
        '85': {'height': 1080, 'display': '1080p'},
        '92': {'height': 240, 'display': '240p'},
        '93': {'height': 360, 'display': '360p'},
        '94': {'height': 480, 'display': '480p'},
        '95': {'height': 720, 'display': '720p'},
        '96': {'height': 1080, 'display': '1080p'},
        '100': {'height': 360, 'display': '360p'},
        '101': {'height': 480, 'display': '480p'},
        '102': {'height': 720, 'display': '720p'},
        '132': {'height': 240, 'display': '240p'},
        '151': {'height': 72, 'display': '72p'},
        '133': {'height': 240, 'display': '240p'},
        '134': {'height': 360, 'display': '360p'},
        '135': {'height': 480, 'display': '480p'},
        '136': {'height': 720, 'display': '720p'},
        '137': {'height': 1080, 'display': '1080p'},
        '138': {'height': 1081, 'display': '>1080p'},
        '139': {'display': '48k'},
        '140': {'display': '128k'},
        '141': {'display': '256k'},
        '160': {'height': 192, 'display': '192p'},
        '171': {'display': '128k'},
        '172': {'display': '256k'},
        '242': {'height': 240, 'display': '240p'},
        '243': {'height': 360, 'display': '360p'},
        '244': {'height': 480, 'display': '480p'},
        '245': {'height': 480, 'display': '480p'},
        '246': {'height': 480, 'display': '480p'},
        '247': {'height': 720, 'display': '720p'},
        '248': {'height': 1080, 'display': '1080p'},
    # Test fixtures: each entry pairs a real URL with the expected metadata
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:5b292926389560516e384ac437c0ec07",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
        u"file": u"yZIXLfi8CZQ.mp4",
        u"note": u"Embed-only video (#1746)",
        u"upload_date": u"20120608",
        u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
        u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
        u"uploader": u"SET India",
        u"uploader_id": u"setindia"
353 def suitable(cls, url):
354 """Receives a URL and returns True if suitable for this IE."""
355 if YoutubePlaylistIE.suitable(url): return False
356 return re.match(cls._VALID_URL, url) is not None
358 def __init__(self, *args, **kwargs):
359 super(YoutubeIE, self).__init__(*args, **kwargs)
360 self._player_cache = {}
362 def report_video_info_webpage_download(self, video_id):
363 """Report attempt to download video info webpage."""
364 self.to_screen(u'%s: Downloading video info webpage' % video_id)
366 def report_information_extraction(self, video_id):
367 """Report attempt to extract video information."""
368 self.to_screen(u'%s: Extracting video information' % video_id)
370 def report_unavailable_format(self, video_id, format):
371 """Report extracted video URL."""
372 self.to_screen(u'%s: Format %s not available' % (video_id, format))
374 def report_rtmp_download(self):
375 """Indicate the download will use the RTMP protocol."""
376 self.to_screen(u'RTMP download detected')
    def _extract_signature_function(self, video_id, player_url, slen):
        # Derive a cache key from the player URL: "...-<id>.<ext>"
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None

        cache_fn = os.path.join(os.path.expanduser(cache_dir),
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            # The cached spec is a list of source indices: the signature
            # function is a fixed permutation/selection of input characters
            return lambda s: u''.join(s[i] for i in cache_spec)
                pass # No cache available

        # No cache hit: download the player and parse the signature routine
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_swf(code)
            assert False, 'Invalid player type %r' % player_type

            # Run the function once on a known string to record which input
            # index each output character came from, then persist that spec
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                write_json_file(cache_spec, cache_fn)
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))
    def _print_sig_code(self, func, slen):
        """Print Python source code equivalent to the extracted signature function."""
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting parts at their defaults
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            start = '(Never used)' # Quelch pyflakes warnings - start will be
                                   # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                    yield _genslice(start, prev, step)

                if i - prev in [-1, 1]:
                    yield u's[%d]' % prev
            yield _genslice(start, i, step)

        # Probe the function with a known string to recover its index spec
        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
    def _parse_sig_js(self, jscode):
        """Interpret the JS player's signature routine and return it as a Python callable."""
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            # Execute a single JS statement; recursion is bounded to avoid
            # runaway evaluation of malformed player code
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
                if ass_m.groupdict().get('index'):
                    # Assignment to an indexed element: out[index] = expr
                    lvar = local_vars[ass_m.group('out')]
                    idx = interpret_expression(ass_m.group('index'),
                                               local_vars, allow_recursion)
                    assert isinstance(idx, int)
                    expr = ass_m.group('expr')
                        local_vars[ass_m.group('out')] = val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                expr = stmt[len(u'return '):]
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)

        def interpret_expression(expr, local_vars, allow_recursion):
                return local_vars[expr]

            # Member access: <var>.<member>
            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                if member == 'join("")':
                if member == 'length':
                if member == 'reverse()':
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)

                # Indexing: <var>[<idx>]
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,

            # Binary modulo: <a>%<b>
            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)

                # Function call: <func>(<args>); functions are extracted lazily
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Locate the named function body in the player source and wrap it
            # as a Python callable that interprets it statement by statement
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            argnames = func_m.group('args').split(',')
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)

        initial_function = extract_function(funcname)
        # The signature function takes a single string argument
        return lambda s: initial_function([s])
    def _parse_sig_swf(self, file_contents):
        """Parse the SWF player's ActionScript bytecode and return the decipher routine as a Python callable."""
        # SWF header magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed)
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
            raise NotImplementedError(u'Unsupported compression format %r' %

        def extract_tags(content):
            # Walk the SWF tag stream, yielding (tag_code, tag_body) pairs;
            # a 0x3f short length signals an extended 32-bit length field
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])

                    for tag_code, tag in extract_tags(content)
        # Skip the NUL-terminated name that precedes the ABC payload
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length little-endian integer, 7 bits per byte
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)

        def u30(reader=None):
            # Unsigned 30-bit integer: top nibble of the varint must be clear
            res = read_int(reader)
            assert res & 0xf0000000 == 0

        def s32(reader=None):
            # Signed 32-bit integer: sign-extend from the unsigned varint
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)

        def read_string(reader=None):
            # Length-prefixed UTF-8 string
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            resb = reader.read(count)
            assert len(resb) == count

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]

        # minor_version + major_version
        # Constant pool: ints, uints, doubles, strings, namespaces, ns sets
        for _c in range(1, int_count):
        for _c in range(1, uint_count):
        read_bytes((double_count-1) * 8)
        constant_strings = [u'']
        for _c in range(1, string_count):
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
        for _c in range(1, ns_set_count):
            for _c2 in range(count):
        multiname_count = u30()
        # Number of extra u30 fields to skip per multiname kind
            0x0e: 2, # MultinameA
            0x1b: 1, # MultinameL
            0x1c: 1, # MultinameLA
        for _c in range(1, multiname_count):
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
                u30() # namespace_idx
                multinames.append(constant_strings[name_idx])
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):

        # Method signatures: only the two flag bits we care about are kept
        MethodInfo = collections.namedtuple(
            ['NEED_ARGUMENTS', 'NEED_REST'])
        for method_id in range(method_count):
            for _ in range(param_count):
            u30() # name index (always 0 for youtube)
            if flags & 0x08 != 0:
                for c in range(option_count):
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata (skipped)
        metadata_count = u30()
        for _c in range(metadata_count):
            for _c2 in range(item_count):

        def parse_traits_info():
            # Parse one trait entry; records method traits, skips the rest
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            if kind in [0x00, 0x06]: # Slot or Const
                u30() # type_name_idx
            elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04: # Class
            elif kind == 0x05: # Function
                methods[function_idx] = multinames[trait_name_idx]
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0: # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30() # metadata index

        # Classes: locate the class holding the signature decipher code
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        for class_id in range(class_count):
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30() # super_name idx
            if flags & 0x08 != 0: # Protected namespace is present
                u30() # protected_ns_idx
            for _c2 in range(intrf_count):
            for _c2 in range(trait_count):

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %

        for class_id in range(class_count):
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    # Keep only methods of the target class
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        for name, idx in trait_methods.items()))

        # Scripts (skipped)
        for _c in range(script_count):
            for _c2 in range(trait_count):

        # Method bodies: collect bytecode for the methods we identified above
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            u30() # init_scope_depth
            u30() # max_scope_depth
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
            for _c2 in range(trait_count):

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Compile (memoized) one ABC method into a Python callable by
            # interpreting its AVM2 opcodes with a simple stack machine
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

            # Register 0 is 'this'; then the arguments, then locals
            registers = ['(this)'] + list(args) + [None] * m.local_count
            coder = io.BytesIO(m.code)
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36: # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                elif opcode == 44: # pushstring
                    stack.append(constant_strings[idx])
                elif opcode == 48: # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                elif opcode == 70: # callproperty
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        res = obj.split(args[0])
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                    elif mname in method_pyfunctions:
                        stack.append(method_pyfunctions[mname](args))
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                elif opcode == 72: # returnvalue
                elif opcode == 79: # callpropvoid
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                elif opcode == 93: # findpropstrict
                    mname = multinames[index]
                    res = extract_function(mname)
                elif opcode == 97: # setproperty
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                elif opcode == 98: # getlocal
                    stack.append(registers[index])
                elif opcode == 99: # setlocal
                    registers[index] = value
                elif opcode == 102: # getproperty
                    pname = multinames[index]
                    if pname == u'length':
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else: # Assume attribute access
                        assert isinstance(idx, int)
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128: # coerce
                elif opcode == 133: # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164: # modulo
                    res = value1 % value2
                elif opcode == 208: # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209: # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210: # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211: # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214: # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215: # setlocal_3
                    registers[3] = stack.pop()
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc

        initial_function = extract_function(u'decipher')
        # The decipher routine takes a single string argument
        return lambda s: initial_function([s])
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
            # Normalize protocol-relative player URLs
            if player_url.startswith(u'//'):
                player_url = u'https:' + player_url
                # Cache the extracted function per (player, signature length)
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

            self._downloader.report_warning(
                u'Warning: Falling back to static signature algorithm')
        # Last resort: hard-coded per-length permutations
        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        """Decrypt s with a hard-coded permutation chosen by signature length."""
            # The videos with age protection use another player, so the
            # algorithms can be different.
                return s[2:63] + s[82] + s[64:82] + s[63]

            # Each branch below is the known permutation for one len(s)
            return s[86:29:-1] + s[88] + s[28:5:-1]
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[84:27:-1] + s[86] + s[26:5:-1]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _get_available_subtitles(self, video_id, webpage):
        """Return a dict mapping subtitle language -> timedtext URL."""
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            # Best effort: warn instead of aborting the whole extraction
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': l[0].encode('utf-8'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
        return sub_lang_list
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL lives inside the inline ytplayer.config JSON
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
            player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            original_lang_node = caption_list.find('track')
            # kind="asr" marks the automatic-speech-recognition source track
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']

            # Build one translated-caption URL per available target language
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _extract_id(self, url):
        """Extract the 11-character video id from any supported URL form."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        # _available_formats is ordered best-first; keep only itags present
        existing_formats = [x for x in self._available_formats if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        video_url_list.reverse() # order worst to best
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Build an itag -> stream URL map from an HLS (m3u8) manifest."""
        def _get_urls(_manifest):
            # Keep only payload lines; '#'-prefixed lines are m3u8 directives
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # The itag is embedded in the variant URL path
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
1172 def _extract_annotations(self, video_id):
1173 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1174 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Normalize to a canonical watch URL (English, age check pre-verified)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-escaped URL (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'el': 'player_embedded',
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            # Try several 'el' variants until one response contains a token
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                  % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                            errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (scraped from the page markup)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
            self._downloader.report_warning(u'Unable to extract video title')

        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = None
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then parse
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description: strip YouTube's redirect-link wrappers before cleaning
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = re.sub(r'''(?x)
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
            # Fall back to the meta description tag
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        def _extract_count(klass):
            # Read a comma-grouped integer out of a span with the given class
            count = self._search_regex(
                r'class="%s">([\d,]+)</span>' % re.escape(klass),
                video_webpage, klass, default=None)
            if count is not None:
                return int(count.replace(',', ''))
        like_count = _extract_count(u'likes-count')
        dislike_count = _extract_count(u'dislikes-count')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = None
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        # Decide which formats to download
            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
                raise ValueError('Could not find vevo ID')
            info = json.loads(mobj.group(1))
            # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
            # this signatures are encrypted
            if 'url_encoded_fmt_stream_map' not in args:
                raise ValueError(u'No stream_map present') # caught below
            re_signature = re.compile(r'[&,]s=')
            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
                self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1347 m_s = re_signature.search(args.get('adaptive_fmts', u''))
1349 if 'adaptive_fmts' in video_info:
1350 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1352 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1356 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1357 self.report_rtmp_download()
1358 video_url_list = [(None, video_info['conn'][0])]
1359 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1360 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1361 if 'rtmpe%3Dyes' in encoded_url_map:
1362 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1364 for url_data_str in encoded_url_map.split(','):
1365 url_data = compat_parse_qs(url_data_str)
1366 if 'itag' in url_data and 'url' in url_data:
1367 url = url_data['url'][0]
1368 if 'sig' in url_data:
1369 url += '&signature=' + url_data['sig'][0]
1370 elif 's' in url_data:
1371 encrypted_sig = url_data['s'][0]
1372 if self._downloader.params.get('verbose'):
1374 if player_url is None:
1375 player_version = 'unknown'
1377 player_version = self._search_regex(
1378 r'-(.+)\.swf$', player_url,
1379 u'flash player', fatal=False)
1380 player_desc = 'flash player %s' % player_version
1382 player_version = self._search_regex(
1383 r'html5player-(.+?)\.js', video_webpage,
1384 'html5 player', fatal=False)
1385 player_desc = u'html5 player %s' % player_version
1387 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1388 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1389 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1392 jsplayer_url_json = self._search_regex(
1393 r'"assets":.+?"js":\s*("[^"]+")',
1394 video_webpage, u'JS player URL')
1395 player_url = json.loads(jsplayer_url_json)
1397 signature = self._decrypt_signature(
1398 encrypted_sig, video_id, player_url, age_gate)
1399 url += '&signature=' + signature
1400 if 'ratebypass' not in url:
1401 url += '&ratebypass=yes'
1402 url_map[url_data['itag'][0]] = url
1403 video_url_list = self._get_video_url_list(url_map)
1404 elif video_info.get('hlsvp'):
1405 manifest_url = video_info['hlsvp'][0]
1406 url_map = self._extract_from_m3u8(manifest_url, video_id)
1407 video_url_list = self._get_video_url_list(url_map)
1409 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1412 for itag, video_real_url in video_url_list:
1414 video_extension = self._video_extensions.get(itag, 'flv')
1415 resolution = self._video_dimensions.get(itag, {}).get('display')
1416 width = self._video_dimensions.get(itag, {}).get('width')
1417 height = self._video_dimensions.get(itag, {}).get('height')
1418 note = self._special_itags.get(itag)
1420 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1421 '%dx%d' % (width, height) if width is not None and height is not None else (resolution if resolution is not None else '???'),
1422 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
1425 'url': video_real_url,
1426 'ext': video_extension,
1427 'format': video_format,
1429 'player_url': player_url,
1430 '_resolution': resolution,
1433 'format_note': note,
1438 'uploader': video_uploader,
1439 'uploader_id': video_uploader_id,
1440 'upload_date': upload_date,
1441 'title': video_title,
1442 'thumbnail': video_thumbnail,
1443 'description': video_description,
1444 'subtitles': video_subtitles,
1445 'duration': video_duration,
1446 'age_limit': 18 if age_gate else 0,
1447 'annotations': video_annotations,
1448 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1449 'view_count': view_count,
1450 'like_count': like_count,
1451 'dislike_count': dislike_count,
# Extractor for YouTube.com playlists: resolves the playlist id from the URL,
# walks the paginated playlist HTML pages, and returns one url_result per
# contained video. Also handles "RD" mixes via a dedicated code path.
# NOTE(review): this excerpt is missing several original source lines (the
# embedded numbering is non-contiguous), so some guards/initializers are not
# visible here; comments below describe only what the visible code shows.
1455 class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
1456 IE_DESC = u'YouTube.com playlists'
# Verbose (re.VERBOSE) pattern: matches course/view_play_list/my_playlists/
# artist/playlist/watch URLs carrying a p=/a=/list= query argument, or a bare
# playlist id with a PL/EC/UU/FL/RD prefix.
1457 _VALID_URL = r"""(?:
1462 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1463 \? (?:.*?&)*? (?:p|a|list)=
1466 ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
1469 ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
# Page template: %s = playlist id, %s = 1-based page number.
1471 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
# Presence of this marker in a page means there is a "next" page to fetch.
1472 _MORE_PAGES_INDICATOR = r'data-link-type="next"'
1473 _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
1474 IE_NAME = u'youtube:playlist'
1477 def suitable(cls, url):
1478 """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required here because _VALID_URL is written in verbose style.
1479 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1481 def _real_initialize(self):
1484 def _ids_to_results(self, ids):
# Wrap each raw 11-character video id in a url_result handled by the
# Youtube video extractor.
1485 return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
1488 def _extract_mix(self, playlist_id):
1489 # The mixes are generated from a single video
1490 # the id of the playlist is just 'RD' + video_id
1491 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
1492 webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
# The title markup differs between page layouts; try the long-title span
# first and fall back to the plain title span.
1493 title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
1494 get_element_by_attribute('class', 'title ', webpage))
1495 title = clean_html(title_span)
1496 video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
# orderedSet deduplicates while preserving first-occurrence order.
1497 ids = orderedSet(re.findall(video_re, webpage))
1498 url_results = self._ids_to_results(ids)
1500 return self.playlist_result(url_results, playlist_id, title)
1502 def _real_extract(self, url):
1503 # Extract playlist id
1504 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1506 raise ExtractorError(u'Invalid URL: %s' % url)
# Group 1 or 2 depending on which _VALID_URL alternative matched.
1507 playlist_id = mobj.group(1) or mobj.group(2)
1509 # Check if it's a video-specific URL
1510 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1511 if 'v' in query_dict:
1512 video_id = query_dict['v'][0]
# With --no-playlist, download only the referenced video, not the list.
1513 if self._downloader.params.get('noplaylist'):
1514 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
1515 return self.url_result(video_id, 'Youtube', video_id=video_id)
1517 self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1519 if playlist_id.startswith('RD'):
1520 # Mixes require a custom extraction process
1521 return self._extract_mix(playlist_id)
# Top lists ("TL...") are handled by the separate yttoplist extractor.
1522 if playlist_id.startswith('TL'):
1523 raise ExtractorError(u'For downloading YouTube.com top lists, use '
1524 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
1526 # Extract the video ids from the playlist pages
1529 for page_num in itertools.count(1):
1530 url = self._TEMPLATE_URL % (playlist_id, page_num)
1531 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1532 matches = re.finditer(self._VIDEO_RE, page)
1533 # We remove the duplicates and the link with index 0
1534 # (it's not the first video of the playlist)
1535 new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
# Stop paging once the "next"-link marker disappears from the page.
1538 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1541 playlist_title = self._og_search_title(page)
1543 url_results = self._ids_to_results(ids)
1544 return self.playlist_result(url_results, playlist_id, playlist_title)
# Extractor for the "yttoplist:{channel}:{list title}" pseudo-URL scheme:
# finds the matching top-list playlist link on the channel page, then scrapes
# the video ids from the resolved playlist page.
# NOTE(review): some original source lines are absent from this excerpt
# (non-contiguous embedded numbering), e.g. the retry guard/exit conditions of
# the loop below are not fully visible.
1547 class YoutubeTopListIE(YoutubePlaylistIE):
1548 IE_NAME = u'youtube:toplist'
1549 IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
1550 u' (Example: "yttoplist:music:Top Tracks")')
1551 _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
1553 def _real_extract(self, url):
1554 mobj = re.match(self._VALID_URL, url)
1555 channel = mobj.group('chann')
1556 title = mobj.group('title')
# URL-encode the title so it can be located inside an href attribute.
1557 query = compat_urllib_parse.urlencode({'title': title})
1558 playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
1559 channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
1560 link = self._html_search_regex(playlist_re, channel_page, u'list')
# The scraped link is relative; resolve it against the site root.
1561 url = compat_urlparse.urljoin('https://www.youtube.com/', link)
1563 video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
1565 # sometimes the webpage doesn't contain the videos
1566 # retry until we get them
1567 for i in itertools.count(0):
1568 msg = u'Downloading Youtube mix'
# On retries, make the progress note reflect the attempt number.
1570 msg += ', retry #%d' % i
1571 webpage = self._download_webpage(url, title, msg)
1572 ids = orderedSet(re.findall(video_re, webpage))
1575 url_results = self._ids_to_results(ids)
1576 return self.playlist_result(url_results, playlist_title=title)
# Extractor for YouTube channel pages (/channel/<id>): collects all video ids
# from the channel, either from the single videos page (autogenerated
# channels) or by paging through the c4_browse_ajax JSON endpoint.
# NOTE(review): this excerpt is missing several original source lines
# (non-contiguous embedded numbering); loop-exit and initialization details
# are only partially visible.
1579 class YoutubeChannelIE(InfoExtractor):
1580 IE_DESC = u'YouTube.com channels'
1581 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
# Marker whose presence in the ajax widget HTML means more pages exist.
1582 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
# Pagination endpoint: %s = page number, %s = channel id.
1583 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1584 IE_NAME = u'youtube:channel'
1586 def extract_videos_from_page(self, page):
# Collect video ids from watch links, skipping duplicates while keeping
# first-occurrence order.
1588 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1589 if mobj.group(1) not in ids_in_page:
1590 ids_in_page.append(mobj.group(1))
1593 def _real_extract(self, url):
1594 # Extract channel id
1595 mobj = re.match(self._VALID_URL, url)
1597 raise ExtractorError(u'Invalid URL: %s' % url)
1599 # Download channel page
1600 channel_id = mobj.group(1)
1602 url = 'https://www.youtube.com/channel/%s/videos' % channel_id
1603 channel_page = self._download_webpage(url, channel_id)
# Autogenerated channels are detected via dedicated CSS-class markers.
1604 autogenerated = re.search(r'''(?x)
1606 channel-header-autogenerated-label|
1607 yt-channel-title-autogenerated
1608 )[^"]*"''', channel_page) is not None
1611 # The videos are contained in a single page
1612 # the ajax pages can't be used, they are empty
1613 video_ids = self.extract_videos_from_page(channel_page)
1615 # Download all channel pages using the json-based channel_ajax query
1616 for pagenum in itertools.count(1):
1617 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1618 page = self._download_webpage(url, channel_id,
1619 u'Downloading page #%s' % pagenum)
# The ajax endpoint returns JSON with 'content_html' (video list markup)
# and 'load_more_widget_html' (pagination widget markup).
1621 page = json.loads(page)
1623 ids_in_page = self.extract_videos_from_page(page['content_html'])
1624 video_ids.extend(ids_in_page)
# Stop when the load-more marker disappears from the pagination widget.
1626 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1629 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1631 url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
1632 for video_id in video_ids]
1633 return self.playlist_result(url_entries, channel_id)
# Extractor for a YouTube user's uploads (user URL or "ytuser:" keyword):
# pages through the GData API feed 50 entries at a time and returns a
# playlist of url_results.
# NOTE(review): several original source lines are absent from this excerpt
# (non-contiguous embedded numbering), e.g. list initializers and break
# statements in the paging loop are not visible.
1636 class YoutubeUserIE(InfoExtractor):
1637 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
# Negative lookaheads keep this from swallowing attribution_link/watch/feed
# URLs that belong to other, more specific extractors.
1638 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
1639 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1640 _GDATA_PAGE_SIZE = 50
# GData feed: %s = username, %d = page size, %d = 1-based start index.
1641 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
1642 IE_NAME = u'youtube:user'
1645 def suitable(cls, url):
1646 # Don't return True if the url can be extracted with another youtube
1647 # extractor; this regex is too permissive and it would match.
# globals() scan: defer to any other *IE class that claims the URL.
1648 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1649 if any(ie.suitable(url) for ie in other_ies): return False
1650 else: return super(YoutubeUserIE, cls).suitable(url)
1652 def _real_extract(self, url):
1654 mobj = re.match(self._VALID_URL, url)
1656 raise ExtractorError(u'Invalid URL: %s' % url)
1658 username = mobj.group(1)
1660 # Download video ids using YouTube Data API. Result size per
1661 # query is limited (currently to 50 videos) so we need to query
1662 # page by page until there are no video ids - it means we got
1667 for pagenum in itertools.count(0):
# GData start-index is 1-based.
1668 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1670 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1671 page = self._download_webpage(gdata_url, username,
1672 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1675 response = json.loads(page)
1676 except ValueError as err:
1677 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
# No 'entry' key means an empty feed page: past the last video.
1678 if 'entry' not in response['feed']:
1679 # Number of videos is a multiple of self._MAX_RESULTS
1682 # Extract video identifiers
# Each entry id is a URL; the video id is its last path component.
1684 for entry in response['feed']['entry']:
1685 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1686 video_ids.extend(ids_in_page)
1688 # A little optimization - if current page is not
1689 # "full", ie. does not contain PAGE_SIZE video ids then
1690 # we can assume that this page is the last one - there
1691 # are no more ids on further pages - no need to query
1694 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1698 self.url_result(video_id, 'Youtube', video_id=video_id)
1699 for video_id in video_ids]
1700 return self.playlist_result(url_results, playlist_title=username)
# Search extractor for the "ytsearch" keyword: queries the GData search API
# in pages of 50 and returns up to n results as a playlist.
# NOTE(review): some original source lines are absent from this excerpt
# (non-contiguous embedded numbering), e.g. the initialization of the
# accumulator variables used by the while-loop below is not visible.
1703 class YoutubeSearchIE(SearchInfoExtractor):
1704 IE_DESC = u'YouTube.com searches'
# Search API: %s = quoted query, %i = 1-based start index; 50 per page.
1705 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1707 IE_NAME = u'youtube:search'
1708 _SEARCH_KEY = 'ytsearch'
1710 def _get_n_results(self, query, n):
1711 """Get a specified number of results for a query"""
1717 while (50 * pagenum) < limit:
1718 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1719 data_json = self._download_webpage(
1720 result_url, video_id=u'query "%s"' % query,
1721 note=u'Downloading page %s' % (pagenum + 1),
1722 errnote=u'Unable to download API page')
1723 data = json.loads(data_json)
1724 api_response = data['data']
1726 if 'items' not in api_response:
1727 raise ExtractorError(u'[youtube] No video results')
1729 new_ids = list(video['id'] for video in api_response['items'])
1730 video_ids += new_ids
# Never ask for more than the API says is available.
1732 limit = min(n, api_response['totalItems'])
# Trim any overshoot from the last (full) page.
1735 if len(video_ids) > n:
1736 video_ids = video_ids[:n]
1737 videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
1738 for video_id in video_ids]
1739 return self.playlist_result(videos, query)
# Variant of YoutubeSearchIE ("ytsearchdate") that orders results by upload
# date (newest first) via the API's orderby=published parameter; all
# extraction logic is inherited.
1741 class YoutubeSearchDateIE(YoutubeSearchIE):
1742 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
1743 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1744 _SEARCH_KEY = 'ytsearchdate'
1745 IE_DESC = u'YouTube.com searches, newest videos first'
# Extractor for YouTube show pages: each season of a show is a separate
# playlist link on the show page; returns one YoutubePlaylist url_result per
# season (as a plain list rather than a playlist_result).
1747 class YoutubeShowIE(InfoExtractor):
1748 IE_DESC = u'YouTube.com (multi-season) shows'
1749 _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
1750 IE_NAME = u'youtube:show'
1752 def _real_extract(self, url):
1753 mobj = re.match(self._VALID_URL, url)
1754 show_name = mobj.group(1)
1755 webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
1756 # There's one playlist for each season of the show
1757 m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
1758 self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
# Season hrefs are site-relative; prefix the host to form absolute URLs.
1759 return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
# NOTE(review): some original source lines are absent from this excerpt
# (non-contiguous embedded numbering), e.g. the initialization of 'paging'
# and 'feed_entries' before the loop is not visible.
1762 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1764 Base class for extractors that fetch info from
1765 http://www.youtube.com/feed_ajax
1766 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
# Feeds are account-specific, so logging in is mandatory (see base class).
1768 _LOGIN_REQUIRED = True
1769 # use action_load_personal_feed instead of action_load_system_feed
1770 _PERSONAL_FEED = False
1773 def _FEED_TEMPLATE(self):
# Build the ajax URL; '%%s' survives the formatting as a '%s' paging slot
# filled in later by _real_extract.
1774 action = 'action_load_system_feed'
1775 if self._PERSONAL_FEED:
1776 action = 'action_load_personal_feed'
1777 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
1781 return u'youtube:%s' % self._FEED_NAME
1783 def _real_initialize(self):
1786 def _real_extract(self, url):
1789 for i in itertools.count(1):
1790 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1791 u'%s feed' % self._FEED_NAME,
1792 u'Downloading page %s' % i)
# The endpoint returns JSON with 'feed_html' (markup containing the watch
# links) and 'paging' (token for the next page, or None at the end).
1793 info = json.loads(info)
1794 feed_html = info['feed_html']
1795 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
# orderedSet deduplicates ids while preserving first-occurrence order.
1796 ids = orderedSet(m.group(1) for m in m_ids)
1797 feed_entries.extend(
1798 self.url_result(video_id, 'Youtube', video_id=video_id)
1799 for video_id in ids)
1800 if info['paging'] is None:
1802 paging = info['paging']
1803 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
# Subscriptions feed ("ytsubs"); uses the system feed endpoint, so
# _PERSONAL_FEED stays at the inherited default of False.
1805 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1806 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1807 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1808 _FEED_NAME = 'subscriptions'
1809 _PLAYLIST_TITLE = u'Youtube Subscriptions'
# Recommended-videos feed ("ytrec"); system feed, so _PERSONAL_FEED stays at
# the inherited default of False.
1811 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1812 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1813 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1814 _FEED_NAME = 'recommended'
1815 _PLAYLIST_TITLE = u'Youtube Recommended videos'
# Watch-later list ("ytwatchlater"); _PERSONAL_FEED = True selects the
# action_load_personal_feed endpoint in the base class.
1817 class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1818 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1819 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1820 _FEED_NAME = 'watch_later'
1821 _PLAYLIST_TITLE = u'Youtube Watch Later'
1822 _PERSONAL_FEED = True
# Watch-history feed ("ythistory"); _PERSONAL_FEED = True selects the
# action_load_personal_feed endpoint in the base class.
# NOTE(review): _VALID_URL is a u'' literal here while sibling classes use
# r''; it happens to work because '\.' has no escape meaning in a plain
# string, but consider normalizing to a raw string for consistency.
1824 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
1825 IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
1826 _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory'
1827 _FEED_NAME = 'history'
1828 _PERSONAL_FEED = True
1829 _PLAYLIST_TITLE = u'Youtube Watch History'
# Favourites ("ytfav"): scrapes the favourites playlist id from the logged-in
# my_favorites page and delegates extraction to YoutubePlaylistIE.
1831 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
1832 IE_NAME = u'youtube:favorites'
1833 IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
1834 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
# Favourites are account-specific, so logging in is mandatory (see base class).
1835 _LOGIN_REQUIRED = True
1837 def _real_extract(self, url):
1838 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
# The playlist id appears as a list= parameter somewhere in the page.
1839 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
1840 return self.url_result(playlist_id, 'YoutubePlaylist')
1843 class YoutubeTruncatedURLIE(InfoExtractor):
1844 IE_NAME = 'youtube:truncated_url'
1845 IE_DESC = False # Do not list
1846 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1848 def _real_extract(self, url):
1849 raise ExtractorError(
1850 u'Did you forget to quote the URL? Remember that & is a meta '
1851 u'character in most shells, so you want to put the URL in quotes, '
1853 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1854 u' (or simply youtube-dl BaW_jenozKc ).',