17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google-account login form, POSTed to by _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Pins the site to English/US so scraped markup is predictable.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Age-gate confirmation endpoint, POSTed to by _confirm_age().
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Key looked up in the user's .netrc file for stored credentials.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False
46 def report_lang(self):
47 """Report attempt to set language."""
48 self.to_screen(u'Setting language')
    def _set_language(self):
        """Request _LANG_URL so the session's cookies pin the site to en/US."""
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the 'try:' opening this handler (and the report_lang
        # call) are elided in this listing; the read() below runs inside it.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Best effort only: failure is reported but not fatal.
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
        # --- body of _login(); the 'def' line is elided in this listing ---
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        # Fetch the login page (inside an elided 'try:').
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Anti-forgery tokens embedded in the login form markup.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Fields of the login form POST (dict literal partially elided).
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
    def _confirm_age(self):
        """POST the age-verification form (age_form literal partially elided)."""
                'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Unlike language/login, failing the age check is fatal.
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_initialize(self):
        """Session setup: set language, then log in.

        NOTE(review): the early-return bodies under each guard (and the
        trailing age-confirmation step) are elided in this listing.
        """
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # _VALID_URL verbose-regex body (assignment line and closing elided);
    # group 1 = host/prefix part, group 2 = the 11-character video id.
                     (?:https?://)? # http(s):// (optional)
                     (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)? # handle anchor (#/) redirect urls
                     (?: # the various things that can precede the ID:
                         (?:(?:v|embed|e)/) # v/ or embed/ or e/
                         |(?: # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?) # the params delimiter ? or # or #!
                             (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
                     |youtu\.be/ # just youtu.be/xxxx
                     )? # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
                     (?(1).+)? # if we found the ID, everything can follow
    # Regex pulling the next_url redirect parameter out of gate URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, but free (WebM) containers ranked ahead of their
    # non-free counterparts.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '102', '84', '101', '83', '100', '82',
                          '138', '248', '137', '247', '136', '246', '245',
                          '244', '135', '243', '134', '242', '133', '160',
                          '172', '141', '171', '140', '139',
    # Container name -> itags carrying that container, best quality first.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    # itag -> file extension (entries elided in this listing).
    _video_extensions = {
        # Apple HTTP Live Streaming
    # itag -> human-readable dimensions (entries elided in this listing).
    _video_dimensions = {
    # _TESTS entries (list/dict delimiters partially elided in this listing).
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
        u"file": u"1ltcDfZMA3U.flv",
        u"note": u"Test VEVO video (#897)",
        u"upload_date": u"20070518",
        u"title": u"Maps - It Will Find You",
        u"description": u"Music video by Maps performing It Will Find You.",
        u"uploader": u"MuteUSA",
        u"uploader_id": u"MuteUSA"
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
        u'file': u'TGi3HqYrWHE.mp4',
        u'note': u'm3u8 video',
        u'title': u'Triathlon - Men - London 2012 Olympic Games',
        u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
        u'uploader': u'olympic',
        u'upload_date': u'20120807',
        u'uploader_id': u'olympic',
        u'skip_download': True,
400 def suitable(cls, url):
401 """Receives a URL and returns True if suitable for this IE."""
402 if YoutubePlaylistIE.suitable(url): return False
403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def __init__(self, *args, **kwargs):
        """Initialize like InfoExtractor; _player_cache memoizes the
        per-player signature functions built by _decrypt_signature."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}
409 def report_video_webpage_download(self, video_id):
410 """Report attempt to download video webpage."""
411 self.to_screen(u'%s: Downloading video webpage' % video_id)
413 def report_video_info_webpage_download(self, video_id):
414 """Report attempt to download video info webpage."""
415 self.to_screen(u'%s: Downloading video info webpage' % video_id)
417 def report_information_extraction(self, video_id):
418 """Report attempt to extract video information."""
419 self.to_screen(u'%s: Extracting video information' % video_id)
421 def report_unavailable_format(self, video_id, format):
422 """Report extracted video URL."""
423 self.to_screen(u'%s: Format %s not available' % (video_id, format))
425 def report_rtmp_download(self):
426 """Indicate the download will use the RTMP protocol."""
427 self.to_screen(u'RTMP download detected')
    def _extract_signature_function(self, video_id, player_url, slen):
        """Build (and disk-cache) a Python function that replicates the
        player's signature scrambler for signatures of length slen.
        Several lines (try/raise/else) are elided in this listing."""
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        assert os.path.basename(func_id) == func_id
        cache_dir = self._downloader.params.get('cachedir',
                                                u'~/.youtube-dl/cache')
        if cache_dir != u'NONE':
            cache_fn = os.path.join(os.path.expanduser(cache_dir),
                # A cached spec is just the permutation: output position i
                # takes input character cache_spec[i].
                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                    cache_spec = json.load(cachef)
                    return lambda s: u''.join(s[i] for i in cache_spec)
                pass # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_swf(code)
            assert False, 'Invalid player type %r' % player_type

        if cache_dir is not False:
            # Derive the permutation by running the function on a probe
            # string of distinct characters, then persist it as JSON.
            cache_res = res(map(compat_chr, range(slen)))
            cache_spec = [ord(c) for c in cache_res]
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                write_json_file(cache_spec, cache_fn)
            except Exception as e:
                # Cache write failures are non-fatal.
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Writing cache to %r failed: %s' % (cache_fn, tb))
    def _print_sig_code(self, func, slen):
        """Print Python source that reproduces func for length-slen
        signatures (used with the youtube_print_sig_code option)."""
        def gen_sig_code(idxs):
            # Compress the index permutation into slice/index expressions.
487 def _genslice(start, end, step):
488 starts = u'' if start == 0 else str(start)
489 ends = u':%d' % (end+step)
490 steps = u'' if step == 1 else (':%d' % step)
491 return u's[%s%s%s]' % (starts, ends, steps)
            # Walk consecutive indices, emitting a slice for each arithmetic
            # run and a single lookup otherwise (several branch/assignment
            # lines are elided in this listing).
            for i, prev in zip(idxs[1:], idxs[:-1]):
                        yield _genslice(start, prev, step)
                if i - prev in [-1, 1]:
                    yield u's[%d]' % prev
                yield _genslice(start, i, step)

        # Recover the permutation from a probe run, then print code for it.
        cache_res = func(map(compat_chr, range(slen)))
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature:\n' + code)
    def _parse_sig_js(self, jscode):
        """Interpret the player JS just enough to rebuild its signature
        function as a Python callable (several lines elided here)."""
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')
            # Single-letter JS variable names map onto indices 0..25.
            # NOTE(review): string.lowercase is Python 2 only
            # (string.ascii_lowercase on Python 3) — confirm target version.
            return string.lowercase.index(varname)
528 def interpret_statement(stmt, local_vars, allow_recursion=20):
529 if allow_recursion < 0:
530 raise ExctractorError(u'Recursion limit reached')
532 if stmt.startswith(u'var '):
533 stmt = stmt[len(u'var '):]
534 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
535 r'=(?P<expr>.*)$', stmt)
537 if ass_m.groupdict().get('index'):
539 lvar = local_vars[ass_m.group('out')]
540 idx = interpret_expression(ass_m.group('index'),
541 local_vars, allow_recursion)
542 assert isinstance(idx, int)
545 expr = ass_m.group('expr')
548 local_vars[ass_m.group('out')] = val
550 expr = ass_m.group('expr')
551 elif stmt.startswith(u'return '):
553 expr = stmt[len(u'return '):]
555 raise ExtractorError(
556 u'Cannot determine left side of statement in %r' % stmt)
558 v = interpret_expression(expr, local_vars, allow_recursion)
        def interpret_expression(expr, local_vars, allow_recursion):
            # Evaluate a small subset of JS expressions: variables, member
            # access, indexing, the '%' operator and function calls.
            # (Literal and match-guard lines are elided in this listing.)
                return local_vars[expr]

            # Member access: var.member
            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                if member == 'join("")':
                if member == 'length':
                if member == 'reverse()':
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)

            # Indexing: var[idx]
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,

            # Binary modulo: a % b
            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)

            # Function call: f(arg, ...) — callees compiled lazily.
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)
        def extract_function(funcname):
            # Locate the function body in the JS source and wrap it in a
            # Python callable that interprets one statement at a time
            # (regex call, closure def and return lines elided here).
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            argnames = func_m.group('args').split(',')
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)

        initial_function = extract_function(funcname)
        # The signature function takes the scrambled signature as its
        # single argument.
        return lambda s: initial_function([s])
    def _parse_sig_swf(self, file_contents):
        """Decompile the Flash player's SWF far enough to rebuild its
        'decipher' routine as a Python function.

        NOTE(review): many lines (loop headers, try blocks, else branches,
        count reads) are elided in this listing; comments below describe
        only what the visible code shows.
        """
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS' header: body is zlib-compressed after the 8-byte header.
            content = zlib.decompress(file_contents[8:])
            raise NotImplementedError(u'Unsupported compression format %r' %

        def extract_tags(content):
            # Yield (tag_code, tag_body) pairs of the SWF tag stream.
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                # A short length of 0x3f means the real length follows
                # as a little-endian uint32.
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])

                    for tag_code, tag in extract_tags(content)
        # Skip the DoABC tag's flags + NUL-terminated name to reach the
        # ABC payload.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length integer: 7 data bits per byte.
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)

        def u30(reader=None):
            # 30-bit unsigned value (top 2 bits must be clear).
            res = read_int(reader)
            assert res & 0xf0000000 == 0

        def s32(reader=None):
            # Sign-extend a 32-bit two's-complement value.
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)

        def string(reader=None):
            # Length-prefixed UTF-8 string.
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            resb = reader.read(count)
            assert len(resb) == count

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]

        # minor_version + major_version
        _ = read_bytes(2 + 2)

        # Constant pool: ints/uints/doubles skipped, strings retained.
        for _c in range(1, int_count):
        for _c in range(1, uint_count):
        _ = read_bytes((double_count-1) * 8)
        constant_strings = [u'']
        for _c in range(1, string_count):
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            _ = read_bytes(1) # kind
        for _c in range(1, ns_set_count):
            for _c2 in range(count):
        multiname_count = u30()
        # Extra u30 fields per multiname kind (dict literal partially elided).
            0x0e: 2, # MultinameA
            0x1b: 1, # MultinameL
            0x1c: 1, # MultinameLA
        for _c in range(1, multiname_count):
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
                namespace_idx = u30()
                multinames.append(constant_strings[name_idx])
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):

        # Method signatures (first namedtuple argument elided in listing).
        MethodInfo = collections.namedtuple(
            ['NEED_ARGUMENTS', 'NEED_REST'])
        for method_id in range(method_count):
            _ = u30() # return type
            for _ in range(param_count):
                _ = u30() # param type
            _ = u30() # name index (always 0 for youtube)
            if flags & 0x08 != 0:
                for c in range(option_count):
                    _ = read_bytes(1) # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    _ = u30() # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata: skipped.
        metadata_count = u30()
        for _c in range(metadata_count):
            for _c2 in range(item_count):

        def parse_traits_info():
            # Parse one trait; records name -> method index mappings.
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            if kind in [0x00, 0x06]: # Slot or Const
                type_name_idx = u30()
                _ = read_byte() # vkind
            elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04: # Class
            elif kind == 0x05: # Function
                methods[function_idx] = multinames[trait_name_idx]
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0: # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):

        # Classes: find the one holding the decipher routine.
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        for class_id in range(class_count):
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            _ = u30() # super_name idx
            if flags & 0x08 != 0: # Protected namespace is present
                protected_ns_idx = u30()
            for _c2 in range(intrf_count):
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %

        # Second pass: collect the target class's method names/indices.
        for class_id in range(class_count):
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        for name, idx in trait_methods.items()))

        # Scripts: traits parsed only to advance the reader.
        for _c in range(script_count):
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        # Method bodies: keep the bytecode of the methods we care about.
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            init_scope_depth = u30()
            max_scope_depth = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Translate one ABC method into a Python callable (memoized
            # via method_pyfunctions; the inner resfunc def is elided).
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

                # Register 0 is 'this'; then arguments, then locals.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                coder = io.BytesIO(m.code)
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36: # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                    elif opcode == 44: # pushstring
                        stack.append(constant_strings[idx])
                    elif opcode == 48: # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                    elif opcode == 70: # callproperty
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            res = obj.split(args[0])
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                    elif opcode == 72: # returnvalue
                    elif opcode == 79: # callpropvoid
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                    elif opcode == 93: # findpropstrict
                        mname = multinames[index]
                        res = extract_function(mname)
                    elif opcode == 97: # setproperty
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                    elif opcode == 98: # getlocal
                        stack.append(registers[index])
                    elif opcode == 99: # setlocal
                        registers[index] = value
                    elif opcode == 102: # getproperty
                        pname = multinames[index]
                        if pname == u'length':
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else: # Assume attribute access
                            assert isinstance(idx, int)
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128: # coerce
                    elif opcode == 133: # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164: # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                    elif opcode == 208: # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209: # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210: # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211: # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214: # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215: # setlocal_3
                        registers[3] = stack.pop()
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc

        initial_function = extract_function(u'decipher')
        # The decipher method takes the scrambled signature as sole argument.
        return lambda s: initial_function([s])
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""
        if player_url is not None:
            # Dynamic path: derive the scrambler from the player itself,
            # memoized per player URL (runs inside an elided 'try:').
                if player_url not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    self._player_cache[player_url] = func
                func = self._player_cache[player_url]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
            except Exception as e:
                # Extraction failures fall through to the static tables.
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

                self._downloader.report_warning(
                    u'Warning: Falling back to static signature algorithm')
        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        """Hard-coded unscrambling rules dispatched on len(s).

        NOTE(review): the 'if/elif len(s) == N:' guard lines between the
        returns are elided in this listing — each return below handles one
        specific signature length.
        """
            # The videos with age protection use another player, so the
            # algorithms can be different.
            return s[2:63] + s[82] + s[64:82] + s[63]
            return s[86:29:-1] + s[88] + s[28:5:-1]
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[84:27:-1] + s[86] + s[26:5:-1]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[81:36:-1] + s[0] + s[35:2:-1]
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        # No table entry for this length.
        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1109 def _decrypt_signature_age_gate(self, s):
1110 # The videos with age protection use another player, so the algorithms
1113 return s[2:63] + s[82] + s[64:82] + s[63]
1115 # Fallback to the other algortihms
1116 return self._decrypt_signature(s)
    def _get_available_subtitles(self, video_id):
        """Return {language: timedtext URL} for the video's subtitle tracks
        (try/loop/dict lines partially elided in this listing)."""
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            # Listing failure is non-fatal; warn and continue.
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))

        # (name, lang_code) pairs from the track list XML.
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
        return sub_lang_list
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption URLs live in the inline ytplayer.config JSON blob.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
            player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            # kind="asr" marks the auto-generated (speech-recognized) track.
            original_lang_node = caption_list.find('track')
            if original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']

            # One translated caption URL per available target language.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _print_formats(self, formats):
        """Print itag, container and dimensions for each available format.
        NOTE(review): the 'for x in formats:' loop line is elided here."""
        print('Available formats:')
            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                        self._video_dimensions.get(x, '???'),
                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
    def _extract_id(self, url):
        """Return the 11-character video id (group 2 of _VALID_URL).
        NOTE(review): the 'if mobj is None:' guard before the raise and the
        final 'return video_id' are elided in this listing."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        # format_limit caps the quality at that itag's position in the list.
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if rf in self._video_formats_map:
                    # Container name given: try its itags best-first.
                    for srf in self._video_formats_map[rf]:
                            video_url_list = [(srf, url_map[srf])]
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Build {itag: media URL} from an m3u8 variant manifest
        (url_map init and return lines elided in this listing)."""
        def _get_urls(_manifest):
            # Non-comment lines of the manifest are the variant URLs.
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # The itag is encoded in the variant URL path.
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
    def _real_extract(self, url):
        """Main extraction entry point.

        NOTE(review): this function continues past the end of this listing
        and several interior lines (try:, else:, dict entries) are elided.
        """
        # Naked '?feature=...' URLs usually mean an unquoted shell URL.
        if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
            self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Fetch the watch page (inside an elided 'try:').
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        self.report_video_info_webpage_download(video_id)
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            self.report_age_confirmation()
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            data = compat_urllib_parse.urlencode({'video_id': video_id,
                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            video_info_url = 'https://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            # Try several 'el' variants until one yields a token.
            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                video_info_webpage = self._download_webpage(video_info_url, video_id,
                                        errnote='unable to download video info webpage')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader id (from the channel/user link in the page markup).
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
            # NOTE(review): the 'else:' before this warning is elided.
            self._downloader.report_warning(u'unable to extract uploader nickname')

        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1372 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1373 if mobj is not None:
1374 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1375 upload_date = unified_strdate(upload_date)
1378 video_description = get_element_by_id("eow-description", video_webpage)
1379 if video_description:
1380 video_description = clean_html(video_description)
1382 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1384 video_description = unescapeHTML(fd_mobj.group(1))
1386 video_description = u''
1389 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1391 if self._downloader.params.get('listsubtitles', False):
1392 self._list_available_subtitles(video_id, video_webpage)
1395 if 'length_seconds' not in video_info:
1396 self._downloader.report_warning(u'unable to extract video duration')
1399 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1401 # Decide which formats to download
1404 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1406 raise ValueError('Could not find vevo ID')
1407 info = json.loads(mobj.group(1))
1409 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1410 # this signatures are encrypted
1411 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1413 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1414 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1415 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1417 if 'url_encoded_fmt_stream_map' in video_info:
1418 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1420 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1421 elif 'adaptive_fmts' in video_info:
1422 if 'url_encoded_fmt_stream_map' in video_info:
1423 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1425 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1429 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1430 self.report_rtmp_download()
1431 video_url_list = [(None, video_info['conn'][0])]
1432 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1433 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1434 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1436 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1437 url_data = compat_parse_qs(url_data_str)
1438 if 'itag' in url_data and 'url' in url_data:
1439 url = url_data['url'][0]
1440 if 'sig' in url_data:
1441 url += '&signature=' + url_data['sig'][0]
1442 elif 's' in url_data:
1443 encrypted_sig = url_data['s'][0]
1444 if self._downloader.params.get('verbose'):
1446 player_version = self._search_regex(
1448 player_url if player_url else None,
1449 'flash player', fatal=False)
1450 player_desc = 'flash player %s' % player_version
1452 player_version = self._search_regex(
1453 r'html5player-(.+?)\.js', video_webpage,
1454 'html5 player', fatal=False)
1455 player_desc = u'html5 player %s' % player_version
1457 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1458 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1459 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1462 jsplayer_url_json = self._search_regex(
1463 r'"assets":.+?"js":\s*("[^"]+")',
1464 video_webpage, u'JS player URL')
1465 player_url = json.loads(jsplayer_url_json)
1467 signature = self._decrypt_signature(
1468 encrypted_sig, video_id, player_url, age_gate)
1469 url += '&signature=' + signature
1470 if 'ratebypass' not in url:
1471 url += '&ratebypass=yes'
1472 url_map[url_data['itag'][0]] = url
1473 video_url_list = self._get_video_url_list(url_map)
1474 if not video_url_list:
1476 elif video_info.get('hlsvp'):
1477 manifest_url = video_info['hlsvp'][0]
1478 url_map = self._extract_from_m3u8(manifest_url, video_id)
1479 video_url_list = self._get_video_url_list(url_map)
1480 if not video_url_list:
1484 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1487 for format_param, video_real_url in video_url_list:
1489 video_extension = self._video_extensions.get(format_param, 'flv')
1491 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1492 self._video_dimensions.get(format_param, '???'),
1493 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1497 'url': video_real_url,
1498 'uploader': video_uploader,
1499 'uploader_id': video_uploader_id,
1500 'upload_date': upload_date,
1501 'title': video_title,
1502 'ext': video_extension,
1503 'format': video_format,
1504 'thumbnail': video_thumbnail,
1505 'description': video_description,
1506 'player_url': player_url,
1507 'subtitles': video_subtitles,
1508 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract all videos of a YouTube playlist via the gdata v2 API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace and grouping for readability,
        # so it must be matched with re.VERBOSE here and in _real_extract.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        # The id may be captured by either alternative of _VALID_URL,
        # depending on which URL form matched.
        playlist_id = mobj.group(1) or mobj.group(2)
        for page_num in itertools.count(1):
            # gdata start-index is 1-based.
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The gdata API refuses to page past entry 1000.
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
            for entry in response['feed']['entry']:
                # yt$position preserves the playlist's own ordering.
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
        # Sort by the collected position index, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]
        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos of a YouTube channel, first from the HTML
    listing and then from the json-based c4_browse_ajax endpoint."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present in a page whenever more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        # Collect video ids referenced by watch links, keeping first-seen
        # order and skipping duplicates.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            # Extract video identifiers
            ids_in_page = self.extract_videos_from_page(page)
            video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                # The ajax endpoint answers with JSON that wraps the HTML.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The load-more widget disappears on the last page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the gdata API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The gdata API caps each query (currently at 50 results), so the
    # uploads feed is fetched page by page.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor; this regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        for pagenum in itertools.count(0):
            # gdata start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The gdata entry id has the form .../videos/<video_id>.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle "ytsearch" queries through the gdata search API (jsonc)."""
    IE_DESC = u'YouTube.com searches'
    # Each API page holds at most 50 results; start-index is 1-based.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more results than the API reports as available.
            limit = min(n, api_response['totalItems'])

        # The last page may overshoot the requested count; trim the excess.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extract a YouTube show page as a list of its season playlists."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        show_name = match.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is exposed as its own playlist link.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            results.append(self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are per-account, so a logged-in session is mandatory.
    _LOGIN_REQUIRED = True

    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Build the feed_ajax URL; '%%s' leaves a '%s' placeholder behind
        # for the paging offset that _real_extract fills in per page.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):

    def _real_extract(self, url):
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            # Pull every watch URL out of the rendered feed HTML, keeping
            # first-seen order and dropping duplicates.
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' field marks the final page of the feed.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's recommended-videos feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's "Watch Later" feed."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch Later is per-user, so the personal feed action must be used.
    _PERSONAL_FEED = True
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are only visible when logged in.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the playlist that backs it;
        # hand the actual extraction off to the playlist extractor.
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')