15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
21 compat_urllib_request,
28 get_element_by_attribute,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # NOTE(review): this excerpt is missing several source lines (the
    # `def _login(self):` header, `try:` openers and some dict literals);
    # comments below describe only what is visible.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name used to look up credentials in the user's ~/.netrc.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Force the site language to English so later regex-based scraping
        # sees stable markup; returns truthiness of the downloaded page.
        return bool(self._download_webpage(
            note=u'Setting language', errnote='unable to set language',

        # --- body of _login (its `def` line is not in this excerpt) ---
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:

        # GALX is a CSRF-style token embedded in the login form; it must be
        # echoed back in the POST below.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Visible fragment of the login form fields (dict opener elided).
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
        # If the login form is still present in the response, authentication
        # was rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')

    def _confirm_age(self):
        # POST the age-verification confirmation form (form dict opener elided).
            'action_confirm': 'Confirm',
        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self._download_webpage(
            note=u'Confirming age', errnote=u'Unable to confirm age')

    def _real_initialize(self):
        # Runs once before extraction: set language, log in, confirm age.
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # NOTE(review): several lines of this class-attribute section (parts of
    # the URL regex, the itag->extension map, and some _TESTS fields) are
    # missing from this excerpt.
    _VALID_URL = r"""(?x)^
                     (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                     (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                     |youtu\.be/                                          # just youtu.be/xxxx
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                  # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    # Regex used to unwrap ?next_url= redirects (e.g. age verification).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Maps itag -> container/extension (entries elided in this excerpt).
    _video_extensions = {
        # Apple HTTP Live Streaming
    # Maps itag -> known dimensions / display label for format listings.
    _video_dimensions = {
        '5': {'width': 400, 'height': 240},
        '17': {'width': 176, 'height': 144},
        '18': {'width': 640, 'height': 360},
        '22': {'width': 1280, 'height': 720},
        '34': {'width': 640, 'height': 360},
        '35': {'width': 854, 'height': 480},
        '36': {'width': 320, 'height': 240},
        '37': {'width': 1920, 'height': 1080},
        '38': {'width': 4096, 'height': 3072},
        '43': {'width': 640, 'height': 360},
        '44': {'width': 854, 'height': 480},
        '45': {'width': 1280, 'height': 720},
        '46': {'width': 1920, 'height': 1080},
        '82': {'height': 360, 'display': '360p'},
        '83': {'height': 480, 'display': '480p'},
        '84': {'height': 720, 'display': '720p'},
        '85': {'height': 1080, 'display': '1080p'},
        '92': {'height': 240, 'display': '240p'},
        '93': {'height': 360, 'display': '360p'},
        '94': {'height': 480, 'display': '480p'},
        '95': {'height': 720, 'display': '720p'},
        '96': {'height': 1080, 'display': '1080p'},
        '100': {'height': 360, 'display': '360p'},
        '101': {'height': 480, 'display': '480p'},
        '102': {'height': 720, 'display': '720p'},
        '132': {'height': 240, 'display': '240p'},
        '151': {'height': 72, 'display': '72p'},
        '133': {'height': 240, 'display': '240p'},
        '134': {'height': 360, 'display': '360p'},
        '135': {'height': 480, 'display': '480p'},
        '136': {'height': 720, 'display': '720p'},
        '137': {'height': 1080, 'display': '1080p'},
        '138': {'height': 1081, 'display': '>1080p'},
        '139': {'display': '48k'},
        '140': {'display': '128k'},
        '141': {'display': '256k'},
        '160': {'height': 192, 'display': '192p'},
        '171': {'display': '128k'},
        '172': {'display': '256k'},
        '242': {'height': 240, 'display': '240p'},
        '243': {'height': 360, 'display': '360p'},
        '244': {'height': 480, 'display': '480p'},
        '245': {'height': 480, 'display': '480p'},
        '246': {'height': 480, 'display': '480p'},
        '247': {'height': 720, 'display': '720p'},
        '248': {'height': 1080, 'display': '1080p'},
    # Self-test fixtures consumed by the test harness (some fields elided).
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:5b292926389560516e384ac437c0ec07",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
        u"file": u"yZIXLfi8CZQ.mp4",
        u"note": u"Embed-only video (#1746)",
        u"upload_date": u"20120608",
        u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
        u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
        u"uploader": u"SET India",
        u"uploader_id": u"setindia"
353 def suitable(cls, url):
354 """Receives a URL and returns True if suitable for this IE."""
355 if YoutubePlaylistIE.suitable(url): return False
356 return re.match(cls._VALID_URL, url) is not None
358 def __init__(self, *args, **kwargs):
359 super(YoutubeIE, self).__init__(*args, **kwargs)
360 self._player_cache = {}
362 def report_video_info_webpage_download(self, video_id):
363 """Report attempt to download video info webpage."""
364 self.to_screen(u'%s: Downloading video info webpage' % video_id)
366 def report_information_extraction(self, video_id):
367 """Report attempt to extract video information."""
368 self.to_screen(u'%s: Extracting video information' % video_id)
370 def report_unavailable_format(self, video_id, format):
371 """Report extracted video URL."""
372 self.to_screen(u'%s: Format %s not available' % (video_id, format))
    def report_rtmp_download(self):
        """Indicate that the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
    def _extract_signature_function(self, video_id, player_url, slen):
        # Build (or load from disk cache) a function that deciphers an
        # encrypted signature of length *slen* for the given player.
        # NOTE(review): this excerpt is missing `try:`/`else:` lines around
        # the cache read/write sections; indentation is reconstructed.
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        # Cache key encodes player type, player id and signature length.
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # Guard against path traversal via a malformed player id.
        assert os.path.basename(func_id) == func_id
        cache_dir = get_cachedir(self._downloader.params)

        cache_enabled = cache_dir is not None

        cache_fn = os.path.join(os.path.expanduser(cache_dir),
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            # Cached spec is a permutation: output char i comes from s[i].
            return lambda s: u''.join(s[i] for i in cache_spec)
            pass  # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_swf(code)
            assert False, 'Invalid player type %r' % player_type

            # Derive the permutation by running the function on a probe
            # string of distinct code points, then persist it.
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
            write_json_file(cache_spec, cache_fn)
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))
    def _print_sig_code(self, func, slen):
        # Render the extracted signature permutation as equivalent Python
        # slice expressions, for inclusion in _static_decrypt_signature.
        # NOTE(review): parts of the run-detection loop are missing from
        # this excerpt.
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Emit the shortest slice literal equivalent to
                # s[start:end+step:step].
                starts = u'' if start == 0 else str(start)
                ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
                steps = u'' if step == 1 else (u':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                    # set as soon as step is set
            for i, prev in zip(idxs[1:], idxs[:-1]):
                yield _genslice(start, prev, step)
                if i - prev in [-1, 1]:
                yield u's[%d]' % prev
            yield _genslice(start, i, step)

        # Probe with distinct code points so each output position maps back
        # to exactly one input index.
        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
    def _parse_sig_js(self, jscode):
        # Build a Python callable equivalent to the JS signature-decipher
        # function by interpreting a tiny subset of JavaScript.
        # NOTE(review): several lines (match guards, `try:`/`if` headers,
        # some returns) are missing from this excerpt.
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            # Execute a single JS statement; recursion is bounded to avoid
            # runaway evaluation of malformed player code.
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
                if ass_m.groupdict().get('index'):
                    # Indexed assignment: out[index] = expr
                    lvar = local_vars[ass_m.group('out')]
                    idx = interpret_expression(ass_m.group('index'),
                                               local_vars, allow_recursion)
                    assert isinstance(idx, int)
                    expr = ass_m.group('expr')
                        local_vars[ass_m.group('out')] = val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                expr = stmt[len(u'return '):]
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)

        def interpret_expression(expr, local_vars, allow_recursion):
            # Evaluate a JS expression: variable lookup, a few string/array
            # methods, indexing, the % operator, and function calls.
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                if member == 'join("")':
                if member == 'length':
                if member == 'reverse()':
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)

                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,

            # Binary modulo is the only arithmetic operator supported.
            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)

                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
                fname = m.group('func')
                # Functions are extracted lazily and memoized.
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Locate the named single-expression function body in the player
            # source and wrap it as a Python callable over a list of args.
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            argnames = func_m.group('args').split(',')
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)

        initial_function = extract_function(funcname)
        # The decipher entry point takes the signature as its single argument.
        return lambda s: initial_function([s])
    def _parse_sig_swf(self, file_contents):
        # Parse the Flash player's SWF binary, locate the ABC (AVM2 bytecode)
        # block, find the `SignatureDecipher` class and interpret its
        # `decipher` method opcode-by-opcode, returning a Python callable.
        # NOTE(review): many source lines (loop headers, `try:`/`else:`
        # clauses, counter reads) are missing from this excerpt; indentation
        # is reconstructed.
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS' header: body after the 8-byte header is zlib-compressed.
            content = zlib.decompress(file_contents[8:])
            raise NotImplementedError(u'Unsupported compression format %r' %

        def extract_tags(content):
            # Iterate over (tag_code, tag_body) pairs of the SWF tag stream.
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                    # Long-form tag: 0x3f sentinel means a 32-bit length follows.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])

                    for tag_code, tag in extract_tags(content)
        # Skip the DoABC tag's flags and NUL-terminated name.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length u32 (LEB128-style, 7 bits per byte).
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)

        def u30(reader=None):
            # u30: variable-length int whose top 2 bits must be zero.
            res = read_int(reader)
            assert res & 0xf0000000 == 0

        def s32(reader=None):
            # Signed 32-bit variant: sign-extend via two's complement.
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            resb = reader.read(count)
            assert len(resb) == count

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]

        # --- constant pool ---
        # minor_version + major_version
        for _c in range(1, int_count):
        for _c in range(1, uint_count):
        read_bytes((double_count-1) * 8)
        # Index 0 of every constant pool is the implicit empty entry.
        constant_strings = [u'']
        for _c in range(1, string_count):
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
        for _c in range(1, ns_set_count):
            for _c2 in range(count):
        multiname_count = u30()
        # Number of trailing u30 fields to skip per multiname kind.
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        for _c in range(1, multiname_count):
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            u30()  # namespace_idx
            multinames.append(constant_strings[name_idx])
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):

        # --- method signatures ---
        MethodInfo = collections.namedtuple(
            ['NEED_ARGUMENTS', 'NEED_REST'])
        for method_id in range(method_count):
            for _ in range(param_count):
            u30()  # name index (always 0 for youtube)
            if flags & 0x08 != 0:
                # HAS_OPTIONAL: skip the default-value entries.
                for c in range(option_count):
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # --- metadata (skipped) ---
        metadata_count = u30()
        for _c in range(metadata_count):
            for _c2 in range(item_count):

        def parse_traits_info():
            # Parse one trait entry, recording method-name mappings and
            # skipping everything else.
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # type_name_idx
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
            elif kind == 0x05:  # Function
                methods[function_idx] = multinames[trait_name_idx]
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

        # --- classes: locate SignatureDecipher ---
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        for class_id in range(class_count):
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30()  # super_name idx
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            for _c2 in range(intrf_count):
            for _c2 in range(trait_count):

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %

        # Second pass over class traits: collect the target class's methods.
        for class_id in range(class_count):
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        for name, idx in trait_methods.items()))

        # --- scripts (skipped) ---
        for _c in range(script_count):
            for _c2 in range(trait_count):

        # --- method bodies: keep bytecode for the methods we care about ---
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
            for _c2 in range(trait_count):

        # Sanity: all of the tag was consumed and every wanted method found.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Interpret the AVM2 bytecode of *func_name*, producing and
            # memoizing an equivalent Python callable.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

                # Register 0 is `this`; parameters follow; locals are None.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                coder = io.BytesIO(m.code)
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36:  # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                    elif opcode == 44:  # pushstring
                        stack.append(constant_strings[idx])
                    elif opcode == 48:  # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                    elif opcode == 70:  # callproperty
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            res = obj.split(args[0])
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                    elif opcode == 72:  # returnvalue
                    elif opcode == 79:  # callpropvoid
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                    elif opcode == 93:  # findpropstrict
                        mname = multinames[index]
                        res = extract_function(mname)
                    elif opcode == 97:  # setproperty
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                    elif opcode == 98:  # getlocal
                        stack.append(registers[index])
                    elif opcode == 99:  # setlocal
                        registers[index] = value
                    elif opcode == 102:  # getproperty
                        pname = multinames[index]
                        if pname == u'length':
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else:  # Assume attribute access
                            assert isinstance(idx, int)
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128:  # coerce
                    elif opcode == 133:  # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164:  # modulo
                        res = value1 % value2
                    elif opcode == 208:  # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209:  # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210:  # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211:  # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214:  # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215:  # setlocal_3
                        registers[3] = stack.pop()
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc

        initial_function = extract_function(u'decipher')
        # Adapt to the extractor's convention: a one-argument callable.
        return lambda s: initial_function([s])
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""
        # NOTE(review): the `try:`/`except` lines wrapping the dynamic
        # extraction are missing from this excerpt; on failure the code
        # warns and falls back to the static table below.

        if player_url is not None:
            if player_url.startswith(u'//'):
                # Protocol-relative player URL: force https.
                player_url = u'https:' + player_url
                # Cache key: one decipher function per (player, sig length).
                player_id = (player_url, len(s))
                if player_id not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    self._player_cache[player_id] = func
                func = self._player_cache[player_id]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

                self._downloader.report_warning(
                    u'Warning: Falling back to static signature algorithm')

        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        # Hand-maintained fallback permutations, selected by signature
        # length.
        # NOTE(review): the `if age_gate:` / `elif len(s) == N:` dispatch
        # lines are missing from this excerpt; only the permutation bodies
        # are visible below.
        # The videos with age protection use another player, so the
        # algorithms can be different.
            return s[2:63] + s[82] + s[64:82] + s[63]

            return s[86:29:-1] + s[88] + s[28:5:-1]
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[84:27:-1] + s[86] + s[26:5:-1]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
            return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
            return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        # Unknown length: give up with a retry hint (sig lengths rotate).
        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _get_available_subtitles(self, video_id, webpage):
        # Fetch the list of manually-created subtitle tracks and return a
        # {lang_code: timedtext_url} dict (empty when none exist).
        # NOTE(review): the `try:` opener and the loop header over
        # *lang_list* are missing from this excerpt.
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                'name': l[0].encode('utf-8'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
        return sub_lang_list
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        # NOTE(review): the `try:` opener and parts of the urlencode dicts
        # are missing from this excerpt.
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URL lives in the embedded player configuration JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # ASR ("automatic speech recognition") marks the auto-caption track.
            original_lang_node = caption_list.find('track')
            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']

            # One translated caption URL per available target language.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _extract_id(self, url):
        # Pull the 11-character video ID out of *url* via _VALID_URL.
        # NOTE(review): the `if mobj is None:` guard body and the final
        # `return` line are missing from this excerpt.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 of _VALID_URL captures the video ID itself.
        video_id = mobj.group(2)
1146 def _get_video_url_list(self, url_map):
1148 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1149 with the requested formats.
1151 existing_formats = [x for x in self._available_formats if x in url_map]
1152 if len(existing_formats) == 0:
1153 raise ExtractorError(u'no known formats available for video')
1154 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1155 video_url_list.reverse() # order worst to best
1156 return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        # Download an HLS master manifest and build {itag: stream_url}.
        # NOTE(review): the url_map initialization, part of _get_urls and
        # the final return are missing from this excerpt.
        def _get_urls(_manifest):
            # Non-comment, non-empty lines of an m3u8 are the stream URLs.
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # The itag is embedded in the stream URL path.
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
1172 def _extract_annotations(self, video_id):
1173 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1174 return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
1176 def _real_extract(self, url):
1177 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1178 mobj = re.search(self._NEXT_URL_RE, url)
1180 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1181 video_id = self._extract_id(url)
1184 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1185 video_webpage = self._download_webpage(url, video_id)
1187 # Attempt to extract SWF player URL
1188 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1189 if mobj is not None:
1190 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1195 self.report_video_info_webpage_download(video_id)
1196 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1197 self.report_age_confirmation()
1199 # We simulate the access to the video from www.youtube.com/v/{video_id}
1200 # this can be viewed without login into Youtube
1201 data = compat_urllib_parse.urlencode({'video_id': video_id,
1202 'el': 'player_embedded',
1205 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1209 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1210 video_info_webpage = self._download_webpage(video_info_url, video_id,
1212 errnote='unable to download video info webpage')
1213 video_info = compat_parse_qs(video_info_webpage)
1216 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1217 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1218 % (video_id, el_type))
1219 video_info_webpage = self._download_webpage(video_info_url, video_id,
1221 errnote='unable to download video info webpage')
1222 video_info = compat_parse_qs(video_info_webpage)
1223 if 'token' in video_info:
1225 if 'token' not in video_info:
1226 if 'reason' in video_info:
1227 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1229 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1231 if 'view_count' in video_info:
1232 view_count = int(video_info['view_count'][0])
1236 # Check for "rental" videos
1237 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1238 raise ExtractorError(u'"rental" videos not supported')
1240 # Start extracting information
1241 self.report_information_extraction(video_id)
1244 if 'author' not in video_info:
1245 raise ExtractorError(u'Unable to extract uploader name')
1246 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1249 video_uploader_id = None
1250 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1251 if mobj is not None:
1252 video_uploader_id = mobj.group(1)
1254 self._downloader.report_warning(u'unable to extract uploader nickname')
1257 if 'title' in video_info:
1258 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1260 self._downloader.report_warning(u'Unable to extract video title')
1264 # We try first to get a high quality image:
1265 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1266 video_webpage, re.DOTALL)
1267 if m_thumb is not None:
1268 video_thumbnail = m_thumb.group(1)
1269 elif 'thumbnail_url' not in video_info:
1270 self._downloader.report_warning(u'unable to extract video thumbnail')
1271 video_thumbnail = None
1272 else: # don't panic if we can't find it
1273 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1277 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1278 if mobj is not None:
1279 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1280 upload_date = unified_strdate(upload_date)
1283 video_description = get_element_by_id("eow-description", video_webpage)
1284 if video_description:
1285 video_description = re.sub(r'''(?x)
1287 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1289 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1290 class="yt-uix-redirect-link"\s*>
1293 ''', r'\1', video_description)
1294 video_description = clean_html(video_description)
1296 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1298 video_description = unescapeHTML(fd_mobj.group(1))
1300 video_description = u''
1302 def _extract_count(klass):
1303 count = self._search_regex(
1304 r'class="%s">([\d,]+)</span>' % re.escape(klass),
1305 video_webpage, klass, default=None)
1306 if count is not None:
1307 return int(count.replace(',', ''))
1309 like_count = _extract_count(u'likes-count')
1310 dislike_count = _extract_count(u'dislikes-count')
1313 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1315 if self._downloader.params.get('listsubtitles', False):
1316 self._list_available_subtitles(video_id, video_webpage)
1319 if 'length_seconds' not in video_info:
1320 self._downloader.report_warning(u'unable to extract video duration')
1321 video_duration = None
1323 video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
1326 video_annotations = None
1327 if self._downloader.params.get('writeannotations', False):
1328 video_annotations = self._extract_annotations(video_id)
1330 # Decide which formats to download
1333 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1335 raise ValueError('Could not find vevo ID')
1336 info = json.loads(mobj.group(1))
1338 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1339 # this signatures are encrypted
1340 if 'url_encoded_fmt_stream_map' not in args:
1341 raise ValueError(u'No stream_map present') # caught below
1342 re_signature = re.compile(r'[&,]s=')
1343 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1345 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1346 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1347 m_s = re_signature.search(args.get('adaptive_fmts', u''))
1349 if 'adaptive_fmts' in video_info:
1350 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1352 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1356 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1357 self.report_rtmp_download()
1358 video_url_list = [(None, video_info['conn'][0])]
1359 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1360 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1361 if 'rtmpe%3Dyes' in encoded_url_map:
1362 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1364 for url_data_str in encoded_url_map.split(','):
1365 url_data = compat_parse_qs(url_data_str)
1366 if 'itag' in url_data and 'url' in url_data:
1367 url = url_data['url'][0]
1368 if 'sig' in url_data:
1369 url += '&signature=' + url_data['sig'][0]
1370 elif 's' in url_data:
1371 encrypted_sig = url_data['s'][0]
1372 if self._downloader.params.get('verbose'):
1374 if player_url is None:
1375 player_version = 'unknown'
1377 player_version = self._search_regex(
1378 r'-(.+)\.swf$', player_url,
1379 u'flash player', fatal=False)
1380 player_desc = 'flash player %s' % player_version
1382 player_version = self._search_regex(
1383 r'html5player-(.+?)\.js', video_webpage,
1384 'html5 player', fatal=False)
1385 player_desc = u'html5 player %s' % player_version
1387 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1388 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1389 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1392 jsplayer_url_json = self._search_regex(
1393 r'"assets":.+?"js":\s*("[^"]+")',
1394 video_webpage, u'JS player URL')
1395 player_url = json.loads(jsplayer_url_json)
1397 signature = self._decrypt_signature(
1398 encrypted_sig, video_id, player_url, age_gate)
1399 url += '&signature=' + signature
1400 if 'ratebypass' not in url:
1401 url += '&ratebypass=yes'
1402 url_map[url_data['itag'][0]] = url
1403 video_url_list = self._get_video_url_list(url_map)
1404 elif video_info.get('hlsvp'):
1405 manifest_url = video_info['hlsvp'][0]
1406 url_map = self._extract_from_m3u8(manifest_url, video_id)
1407 video_url_list = self._get_video_url_list(url_map)
1409 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1412 for itag, video_real_url in video_url_list:
1414 video_extension = self._video_extensions.get(itag, 'flv')
1415 resolution = self._video_dimensions.get(itag, {}).get('display')
1416 width = self._video_dimensions.get(itag, {}).get('width')
1417 height = self._video_dimensions.get(itag, {}).get('height')
1418 note = self._special_itags.get(itag)
1420 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1421 '%dx%d' % (width, height) if width is not None and height is not None else (resolution if resolution is not None else '???'),
1422 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
1425 'url': video_real_url,
1426 'ext': video_extension,
1427 'format': video_format,
1429 'player_url': player_url,
1430 '_resolution': resolution,
1433 'format_note': note,
1435 def _formats_key(f):
1436 return (f.get('height') if f.get('height') is not None else -1,
1437 f.get('width') if f.get('width') is not None else -1)
1438 formats = sorted(formats, key=_formats_key)
1442 'uploader': video_uploader,
1443 'uploader_id': video_uploader_id,
1444 'upload_date': upload_date,
1445 'title': video_title,
1446 'thumbnail': video_thumbnail,
1447 'description': video_description,
1448 'subtitles': video_subtitles,
1449 'duration': video_duration,
1450 'age_limit': 18 if age_gate else 0,
1451 'annotations': video_annotations,
1452 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1453 'view_count': view_count,
1454 'like_count': like_count,
1455 'dislike_count': dislike_count,
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
    # One playlist page: first %s is the playlist id, second the page number.
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    # Present in the page markup while a further page exists.
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    # Captures each entry's 11-char video id and its index within the playlist.
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose (whitespace-tolerant) regex style.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):

    def _ids_to_results(self, ids):
        # Wrap each video id as a url_result delegated to the Youtube IE.
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video;
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        # The title markup varies; try the long-title span first.
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
                      get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        # Entry ids in playlist order, restricted to this mix's list id.
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 1 matches playlist URLs, group 2 bare playlist ids.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
                self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        # Extract the video ids from the playlist pages
        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:

        # Title comes from the last page fetched; all pages carry the same og:title.
        playlist_title = self._og_search_title(page)

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeTopListIE(YoutubePlaylistIE):
    """Resolve a "yttoplist:channel:title" keyword to the matching playlist."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        # Locate the channel-page link whose query string names this list.
        query = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'

        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        for i in itertools.count(0):
            msg = u'Downloading Youtube mix'
                msg += ', retry #%d' % i
            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Present in the ajax response while more pages remain to be fetched.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # Paged ajax listing: first %s is the page number, second the channel id.
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        # Collect linked video ids in first-seen order, skipping duplicates.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels carry one of these class names in the markup.
        autogenerated = re.search(r'''(?x)
                channel-header-autogenerated-label|
                yt-channel-title-autogenerated
            )[^"]*"''', channel_page) is not None

            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the widget no longer advertises a "load more" control.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The GData API caps results per request, so uploads are fetched in pages.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'
1649 def suitable(cls, url):
1650 # Don't return True if the url can be extracted with other youtube
1651 # extractor, the regex would is too permissive and it would match.
1652 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1653 if any(ie.suitable(url) for ie in other_ies): return False
1654 else: return super(YoutubeUserIE, cls).suitable(url)
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got

        for pagenum in itertools.count(0):
            # start-index is 1-based in the GData API.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # No entries left: the total was an exact multiple of the page size.

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # Each entry id is a URL whose last path component is the video id.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids]
        return self.playlist_result(url_results, playlist_title=username)
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    # GData search endpoint: %s is the quoted query, %i the 1-based start index.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        # Page through the API 50 results at a time until the cap is reached.
        while (50 * pagenum) < limit:
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (pagenum + 1),
                errnote=u'Unable to download API page')
            data = json.loads(data_json)
            api_response = data['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more results than the service reports to exist.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same as YoutubeSearchIE but returns the newest videos first."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    # Parent endpoint with orderby=published appended.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
class YoutubeShowIE(InfoExtractor):
    """Extract every season playlist of a multi-season YouTube show."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show is linked as its own playlist.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            results.append(self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds always require a logged-in session (see YoutubeBaseInfoExtractor).
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Personal feeds (watch later, history) need the personal-feed action.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        # %%s leaves a literal %s placeholder for the paging token.
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):

    def _real_extract(self, url):
        for i in itertools.count(1):
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # De-duplicate while keeping feed order.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # A null paging token signals the final page.
            if info['paging'] is None:
            paging = info['paging']
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's "watch later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is a per-user feed, so use the personal-feed ajax action.
    _PERSONAL_FEED = True
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's watch history."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, consistent with every sibling feed IE.  The previous u'...'
    # literal only worked because Python leaves the unrecognized '\.' escape
    # untouched; that is fragile and triggers warnings on newer interpreters.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # History is a per-user feed, so use the personal-feed ajax action.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds its playlist id in a "list=" parameter;
        # delegate the actual extraction to the playlist extractor.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_list, 'YoutubePlaylist')
1847 class YoutubeTruncatedURLIE(InfoExtractor):
1848 IE_NAME = 'youtube:truncated_url'
1849 IE_DESC = False # Do not list
1850 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1852 def _real_extract(self, url):
1853 raise ExtractorError(
1854 u'Did you forget to quote the URL? Remember that & is a meta '
1855 u'character in most shells, so you want to put the URL in quotes, '
1857 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1858 u' (or simply youtube-dl BaW_jenozKc ).',