17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google-accounts login endpoint used by _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # URL hit once to pin language/region to English/US for stable scraping.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Age-confirmation form endpoint used by _confirm_age().
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        # Best-effort: failure to set the language only emits a warning.
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        # NOTE(review): the enclosing `try:` line is not visible in this chunk.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # NOTE(review): the `def _login(self):` header is not visible in this
        # chunk; the statements below are that method's body.
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Hidden GALX/dsh form tokens must be scraped from the login page and
        # echoed back in the POST below.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Fragments of the login_form_strs dict (opener not visible in chunk).
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present in the response, login failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _confirm_age(self):
        # POST the confirmation form; unlike language/login, failure here is fatal.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_initialize(self):
        # Skip initialization entirely when there is no downloader attached.
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # NOTE(review): the `_VALID_URL = r"""(?x)^(` opener is not visible in
    # this chunk; the lines below are the body of that verbose regex.
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         |youtu\.be/                                          # just youtu.be/xxxx
                         )?                                                   # all until now is optional -> you can pass the naked ID
                         ([0-9A-Za-z_-]{11})                                  # here is it! the YouTube video ID
                         (?(1).+)?                                            # if we found the ID, everything can follow
    # Regex extracting the next_url= redirect parameter (age gates etc.).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, but reordered so free (webm) formats rank above non-free ones.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      '85', '102', '84', '101', '83', '100', '82',
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      '172', '141', '171', '140', '139',
    # Container name -> itags of that container, best quality first.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    # itag -> file extension (entries not visible in this chunk).
    _video_extensions = {
        # Apple HTTP Live Streaming
    # itag -> human-readable resolution string (entries not visible in chunk).
    _video_dimensions = {
    # NOTE(review): the `_TESTS = [` opener is not visible in this chunk;
    # the fragments below are entries of that test-case list.
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
            u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
            u"uploader": u"Philipp Hagemeister",
            u"uploader_id": u"phihag",
            u"upload_date": u"20121002",
            u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file": u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
            u"upload_date": u"20070518",
            u"title": u"Maps - It Will Find You",
            u"description": u"Music video by Maps performing It Will Find You.",
            u"uploader": u"MuteUSA",
            u"uploader_id": u"MuteUSA"
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
            u"upload_date": u"20120506",
            u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
            u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
            u"uploader": u"Icona Pop",
            u"uploader_id": u"IconaPop"
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
            u"upload_date": u"20130703",
            u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
            u"description": u"md5:64249768eec3bc4276236606ea996373",
            u"uploader": u"justintimberlakeVEVO",
            u"uploader_id": u"justintimberlakeVEVO"
            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
            u'file': u'TGi3HqYrWHE.mp4',
            u'note': u'm3u8 video',
            u'title': u'Triathlon - Men - London 2012 Olympic Games',
            u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
            u'uploader': u'olympic',
            u'upload_date': u'20120807',
            u'uploader_id': u'olympic',
            u'skip_download': True,
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs must be left to YoutubePlaylistIE, never claimed here.
    if YoutubePlaylistIE.suitable(url):
        return False
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
def __init__(self, *args, **kwargs):
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Cache of signature-deciphering functions, keyed by player URL, so a
    # player is downloaded and parsed at most once per session.
    self._player_cache = {}
def report_video_webpage_download(self, video_id):
    """Announce that the video webpage is being fetched."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage is being fetched."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested format is not available."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
def _extract_signature_function(self, video_id, player_url, slen):
    """Download the player (JS or SWF) and build a signature function.

    The resulting permutation is cached on disk keyed by player type,
    player id and signature length, so the player is parsed only once.
    NOTE(review): several lines of this method (try/except headers,
    continuation lines) are not visible in this chunk.
    """
    # Player URLs end in -<id>.<ext>; ext selects the parser below.
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # Guard against path traversal via a crafted player URL.
    assert os.path.basename(func_id) == func_id
    cache_dir = self._downloader.params.get('cachedir',
                                            u'~/.youtube-dl/cache')

    if cache_dir != u'NONE':
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
        with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
            cache_spec = json.load(cachef)
        # Cached spec is a list of source indices: apply it directly.
        return lambda s: u''.join(s[i] for i in cache_spec)
        pass  # No cache available

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_swf(code)
        assert False, 'Invalid player type %r' % player_type

    if cache_dir is not False:
        # Learn the permutation by running the function over the identity
        # string chr(0)..chr(slen-1), then persist it as index list.
        cache_res = res(map(compat_chr, range(slen)))
        cache_spec = [ord(c) for c in cache_res]
        os.makedirs(os.path.dirname(cache_fn))
        except OSError as ose:
            # An already-existing cache directory is fine.
            if ose.errno != errno.EEXIST:
        write_json_file(cache_spec, cache_fn)
        except Exception as e:
            # Caching is best-effort; never fail extraction over it.
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))
def _print_sig_code(self, func, slen):
    """Print the extracted signature function as compact Python code
    (used by the --youtube-print-sig-code debugging option)."""
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            # Render a run of indices with constant stride as one slice.
            starts = u'' if start == 0 else str(start)
            ends = u':%d' % (end+step)
            steps = u'' if step == 1 else (':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        # Walk consecutive index pairs, merging stride-1/-1 runs into
        # slices and emitting single lookups otherwise.
        for i, prev in zip(idxs[1:], idxs[:-1]):
            yield _genslice(start, prev, step)
            if i - prev in [-1, 1]:
            yield u's[%d]' % prev
        yield _genslice(start, i, step)

    # Learn the permutation by applying func to the identity string.
    cache_res = func(map(compat_chr, range(slen)))
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature:\n' + code)
518 def _parse_sig_js(self, jscode):
519 funcname = self._search_regex(
520 r'signature=([a-zA-Z]+)', jscode,
521 u'Initial JS player signature function name')
526 return string.lowercase.index(varname)
528 def interpret_statement(stmt, local_vars, allow_recursion=20):
529 if allow_recursion < 0:
530 raise ExctractorError(u'Recursion limit reached')
532 if stmt.startswith(u'var '):
533 stmt = stmt[len(u'var '):]
534 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
535 r'=(?P<expr>.*)$', stmt)
537 if ass_m.groupdict().get('index'):
539 lvar = local_vars[ass_m.group('out')]
540 idx = interpret_expression(ass_m.group('index'),
541 local_vars, allow_recursion)
542 assert isinstance(idx, int)
545 expr = ass_m.group('expr')
548 local_vars[ass_m.group('out')] = val
550 expr = ass_m.group('expr')
551 elif stmt.startswith(u'return '):
553 expr = stmt[len(u'return '):]
555 raise ExtractorError(
556 u'Cannot determine left side of statement in %r' % stmt)
558 v = interpret_expression(expr, local_vars, allow_recursion)
561 def interpret_expression(expr, local_vars, allow_recursion):
566 return local_vars[expr]
568 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
570 member = m.group('member')
571 val = local_vars[m.group('in')]
572 if member == 'split("")':
574 if member == 'join("")':
576 if member == 'length':
578 if member == 'reverse()':
580 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
582 idx = interpret_expression(
583 slice_m.group('idx'), local_vars, allow_recursion-1)
587 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
589 val = local_vars[m.group('in')]
590 idx = interpret_expression(m.group('idx'), local_vars,
594 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
596 a = interpret_expression(m.group('a'),
597 local_vars, allow_recursion)
598 b = interpret_expression(m.group('b'),
599 local_vars, allow_recursion)
603 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
605 fname = m.group('func')
606 if fname not in functions:
607 functions[fname] = extract_function(fname)
608 argvals = [int(v) if v.isdigit() else local_vars[v]
609 for v in m.group('args').split(',')]
610 return functions[fname](argvals)
611 raise ExtractorError(u'Unsupported JS expression %r' % expr)
613 def extract_function(funcname):
615 r'function ' + re.escape(funcname) +
616 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
618 argnames = func_m.group('args').split(',')
621 local_vars = dict(zip(argnames, args))
622 for stmt in func_m.group('code').split(';'):
623 res = interpret_statement(stmt, local_vars)
627 initial_function = extract_function(funcname)
628 return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
    """Extract the signature 'decipher' function from an SWF player.

    Parses the SWF container, reads the embedded ABC (AVM2 bytecode)
    block, locates the ``SignatureDecipher`` class and builds a Python
    callable that interprets its bytecode.
    NOTE(review): many statements of this method are not visible in this
    chunk; all code lines below are kept verbatim.
    """
    # --- SWF container ---------------------------------------------------
    # Bytes 1..2 of any SWF file are b'WS' ('FWS'/'CWS' signatures).
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        # 'CWS': everything after the 8-byte header is zlib-compressed.
        content = zlib.decompress(file_contents[8:])
    raise NotImplementedError(u'Unsupported compression format %r' %

    def extract_tags(content):
        # Iterate SWF tags: 16-bit header packs code (high 10 bits) and
        # length (low 6 bits); length 0x3f means a 32-bit length follows.
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            tag_len = struct.unpack('<I', content[pos:pos+4])[0]
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])

    for tag_code, tag in extract_tags(content)
    # Skip flags + NUL-terminated name at the start of the DoABC tag body.
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length unsigned int, 7 payload bits per byte.
        b = struct.unpack('<B', buf)[0]
        res = res | ((b & 0x7f) << shift)

    def u30(reader=None):
        # u30: variable-length int whose top bits must be clear.
        res = read_int(reader)
        assert res & 0xf0000000 == 0

    def s32(reader=None):
        # Signed 32-bit value: sign-extend from the unsigned encoding.
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)

    def string(reader=None):
        # Length-prefixed UTF-8 string from the constant pool.
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        resb = reader.read(count)
        assert len(resb) == count

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]

    # minor_version + major_version
    _ = read_bytes(2 + 2)

    # --- Constant pool: ints/uints/doubles skipped, strings kept ---------
    for _c in range(1, int_count):
    for _c in range(1, uint_count):
    _ = read_bytes((double_count-1) * 8)
    constant_strings = [u'']
    for _c in range(1, string_count):
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        _ = read_bytes(1)  # kind
    for _c in range(1, ns_set_count):
        for _c2 in range(count):
    multiname_count = u30()
    # Entries of MULTINAME_SIZES: extra u30 fields per multiname kind.
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    for _c in range(1, multiname_count):
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        namespace_idx = u30()
        multinames.append(constant_strings[name_idx])
        multinames.append('[MULTINAME kind: %d]' % kind)
        for _c2 in range(MULTINAME_SIZES[kind]):

    # --- Method signatures ------------------------------------------------
    MethodInfo = collections.namedtuple(
        ['NEED_ARGUMENTS', 'NEED_REST'])
    for method_id in range(method_count):
        _ = u30()  # return type
        for _ in range(param_count):
            _ = u30()  # param type
        _ = u30()  # name index (always 0 for youtube)
        if flags & 0x08 != 0:
            # HAS_OPTIONAL: skip the default-value table.
            for c in range(option_count):
                _ = read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                _ = u30()  # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # --- Metadata (skipped) ----------------------------------------------
    metadata_count = u30()
    for _c in range(metadata_count):
        for _c2 in range(item_count):

    def parse_traits_info():
        # Read one trait entry; method-like traits are recorded in `methods`.
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        if kind in [0x00, 0x06]:  # Slot or Const
            type_name_idx = u30()
            _ = read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
        elif kind == 0x05:  # Function
            methods[function_idx] = multinames[trait_name_idx]
            raise ExtractorError(u'Unsupported trait kind %d' % kind)
        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):

    # --- Classes: locate SignatureDecipher -------------------------------
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    for class_id in range(class_count):
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        _ = u30()  # super_name idx
        if flags & 0x08 != 0:  # Protected namespace is present
            protected_ns_idx = u30()
        for _c2 in range(intrf_count):
        for _c2 in range(trait_count):
            _ = parse_traits_info()
    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %

    for class_id in range(class_count):
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
        # Only keep the methods that belong to the target class.
        if class_id == searched_class_id:
            method_names.update(trait_methods.items())
            method_idxs.update(dict(
                for name, idx in trait_methods.items()))

    # --- Scripts (skipped) ------------------------------------------------
    for _c in range(script_count):
        for _c2 in range(trait_count):
            _ = parse_traits_info()

    # --- Method bodies: keep bytecode of the interesting methods ---------
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    for _c in range(method_body_count):
        init_scope_depth = u30()
        max_scope_depth = u30()
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
        for _c2 in range(trait_count):
            _ = parse_traits_info()

    # The DoABC tag must have been consumed exactly, and every referenced
    # method must have a body.
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    method_pyfunctions = {}

    def extract_function(func_name):
        # Build (and memoize) a Python callable that interprets the AVM2
        # bytecode of the named method.
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        # NOTE(review): the inner `def resfunc(args):` wrapper is not
        # visible in this chunk; the lines below are its body (a small
        # stack-machine interpreter over m.code).
        registers = ['(this)'] + list(args) + [None] * m.local_count
        coder = io.BytesIO(m.code)
        opcode = struct.unpack('!B', coder.read(1))[0]
        if opcode == 36:  # pushbyte
            v = struct.unpack('!B', coder.read(1))[0]
        elif opcode == 44:  # pushstring
            stack.append(constant_strings[idx])
        elif opcode == 48:  # pushscope
            # We don't implement the scope register, so we'll just
            # ignore the popped value
        elif opcode == 70:  # callproperty
            mname = multinames[index]
            arg_count = u30(coder)
            args = list(reversed(
                [stack.pop() for _ in range(arg_count)]))
            if mname == u'split':
                assert len(args) == 1
                assert isinstance(args[0], compat_str)
                assert isinstance(obj, compat_str)
                res = obj.split(args[0])
            elif mname == u'slice':
                assert len(args) == 1
                assert isinstance(args[0], int)
                assert isinstance(obj, list)
            elif mname == u'join':
                assert len(args) == 1
                assert isinstance(args[0], compat_str)
                assert isinstance(obj, list)
                res = args[0].join(obj)
            elif mname in method_pyfunctions:
                # Call into another already-translated method.
                stack.append(method_pyfunctions[mname](args))
                raise NotImplementedError(
                    u'Unsupported property %r on %r'
        elif opcode == 72:  # returnvalue
        elif opcode == 79:  # callpropvoid
            mname = multinames[index]
            arg_count = u30(coder)
            args = list(reversed(
                [stack.pop() for _ in range(arg_count)]))
            if mname == u'reverse':
                assert isinstance(obj, list)
                raise NotImplementedError(
                    u'Unsupported (void) property %r on %r'
        elif opcode == 93:  # findpropstrict
            mname = multinames[index]
            res = extract_function(mname)
        elif opcode == 97:  # setproperty
            assert isinstance(obj, list)
            assert isinstance(idx, int)
        elif opcode == 98:  # getlocal
            stack.append(registers[index])
        elif opcode == 99:  # setlocal
            registers[index] = value
        elif opcode == 102:  # getproperty
            pname = multinames[index]
            if pname == u'length':
                assert isinstance(obj, list)
                stack.append(len(obj))
            else:  # Assume attribute access
                assert isinstance(idx, int)
                assert isinstance(obj, list)
                stack.append(obj[idx])
        elif opcode == 128:  # coerce
        elif opcode == 133:  # coerce_s
            assert isinstance(stack[-1], (type(None), compat_str))
        elif opcode == 164:  # modulo
            value2 = stack.pop()
            value1 = stack.pop()
            res = value1 % value2
        elif opcode == 208:  # getlocal_0
            stack.append(registers[0])
        elif opcode == 209:  # getlocal_1
            stack.append(registers[1])
        elif opcode == 210:  # getlocal_2
            stack.append(registers[2])
        elif opcode == 211:  # getlocal_3
            stack.append(registers[3])
        elif opcode == 214:  # setlocal_2
            registers[2] = stack.pop()
        elif opcode == 215:  # setlocal_3
            registers[3] = stack.pop()
            raise NotImplementedError(
                u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc

    initial_function = extract_function(u'decipher')
    # The decipher function takes the scrambled signature as its argument.
    return lambda s: initial_function([s])
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature"""

    # Preferred path: derive the function from the player itself (cached
    # per player URL). NOTE(review): the enclosing `try:` line is not
    # visible in this chunk.
    if player_url is not None:
        if player_url not in self._player_cache:
            func = self._extract_signature_function(
                video_id, player_url, len(s)
            self._player_cache[player_url] = func
        func = self._player_cache[player_url]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(func, len(s))
    except Exception as e:
        tb = traceback.format_exc()
        self._downloader.report_warning(
            u'Automatic signature extraction failed: ' + tb)

        # Fallback path: hard-coded permutations by signature length.
        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')
        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
    """Hand-written fallback permutations, selected by len(s).

    NOTE(review): the `if age_gate:` / `elif len(s) == NN:` dispatch
    lines are not visible in this chunk; only the permutation returns
    appear below.
    """
    # The videos with age protection use another player, so the
    # algorithms can be different.
    return s[2:63] + s[82] + s[64:82] + s[63]

    return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
    return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
    return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
    return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
    return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
    return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
    return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
    return s[81:36:-1] + s[0] + s[35:2:-1]
    return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
    return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
    return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
    return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
    return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

    # Unknown length: nothing we can do; a retry may hit a known player.
    raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
def _decrypt_signature_age_gate(self, s):
    # The videos with age protection use another player, so the algorithms
    # can be different.
    return s[2:63] + s[82] + s[64:82] + s[63]
    # Fallback to the other algorithms
    # NOTE(review): _decrypt_signature is declared with
    # (s, video_id, player_url, age_gate=False); this one-argument call
    # would raise TypeError if reached — confirm the intended target
    # (possibly _static_decrypt_signature).
    return self._decrypt_signature(s)
def _get_available_subtitles(self, video_id):
    """Return a dict mapping subtitle language code -> timedtext URL.

    NOTE(review): the enclosing `try:` and a few statements are not
    visible in this chunk.
    """
    # Language list comes from the legacy timedtext listing endpoint.
    sub_list = self._download_webpage(
        'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
        video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
    # Each <track> carries a display name and a lang_code attribute.
    lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    params = compat_urllib_parse.urlencode({
        'fmt': self._downloader.params.get('subtitlesformat'),
    url = u'http://www.youtube.com/api/timedtext?' + params
    sub_lang_list[lang] = url
    if not sub_lang_list:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
    return sub_lang_list
def _get_available_automatic_caption(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process."""
    sub_format = self._downloader.params.get('subtitlesformat')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    # The caption URL lives inside the inlined ytplayer.config JSON blob.
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    self._downloader.report_warning(err_msg)
    player_config = json.loads(mobj.group(1))
    args = player_config[u'args']
    caption_url = args[u'ttsurl']
    timestamp = args[u'timestamp']
    # We get the available subtitles
    list_params = compat_urllib_parse.urlencode({
    list_url = caption_url + '&' + list_params
    list_page = self._download_webpage(list_url, video_id)
    caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
    # Only 'asr' (automatic speech recognition) tracks count as automatic.
    original_lang_node = caption_list.find('track')
    if original_lang_node.attrib.get('kind') != 'asr' :
        self._downloader.report_warning(u'Video doesn\'t have automatic captions')
    original_lang = original_lang_node.attrib['lang_code']

    # Build one translated-caption URL per available target language.
    for lang_node in caption_list.findall('target'):
        sub_lang = lang_node.attrib['lang_code']
        params = compat_urllib_parse.urlencode({
            'lang': original_lang,
        sub_lang_list[sub_lang] = caption_url + '&' + params
    return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
def _print_formats(self, formats):
    """Print a table of itag / extension / resolution for --list-formats.

    NOTE(review): the `for x in formats:` loop header is not visible in
    this chunk.
    """
    print('Available formats:')
    print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                self._video_dimensions.get(x, '???'),
                                ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
def _extract_id(self, url):
    """Extract the 11-character video id from a YouTube URL.

    NOTE(review): the `if mobj is None:` guard and the `return` line are
    not visible in this chunk.
    """
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    raise ExtractorError(u'Invalid URL: %s' % url)
    # Group 2 of _VALID_URL captures the video id itself.
    video_id = mobj.group(2)
def _get_video_url_list(self, url_map):
    """
    Transform a dictionary in the format {itag:url} to a list of (itag, url)
    with the requested formats.
    """
    req_format = self._downloader.params.get('format', None)
    format_limit = self._downloader.params.get('format_limit', None)
    # Quality order depends on whether free (webm) formats are preferred.
    available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
    if format_limit is not None and format_limit in available_formats:
        # Cap quality at the requested limit.
        format_list = available_formats[available_formats.index(format_limit):]
        format_list = available_formats
    # Keep only formats the server actually offered, in quality order.
    existing_formats = [x for x in format_list if x in url_map]
    if len(existing_formats) == 0:
        raise ExtractorError(u'no known formats available for video')
    if self._downloader.params.get('listformats', None):
        self._print_formats(existing_formats)
    if req_format is None or req_format == 'best':
        video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
    elif req_format == 'worst':
        video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
    elif req_format in ('-1', 'all'):
        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        # Specific formats. We pick the first in a slash-delimeted sequence.
        # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
        # available in the specified format. For example,
        # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
        # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
        # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
        req_formats = req_format.split('/')
        video_url_list = None
        for rf in req_formats:
            video_url_list = [(rf, url_map[rf])]
            # Container-name entries expand to their itags, best first.
            if rf in self._video_formats_map:
                for srf in self._video_formats_map[rf]:
                    video_url_list = [(srf, url_map[srf])]
    if video_url_list is None:
        raise ExtractorError(u'requested format not available')
    return video_url_list
def _extract_from_m3u8(self, manifest_url, video_id):
    """Build an {itag: url} map from an HLS (m3u8) master manifest."""
    def _get_urls(_manifest):
        # Variant URLs are the non-comment, non-empty lines of the manifest.
        lines = _manifest.split('\n')
        urls = filter(lambda l: l and not l.startswith('#'),
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    formats_urls = _get_urls(manifest)
    for format_url in formats_urls:
        # The itag is embedded in the variant URL path (…/itag/NN/…).
        itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
        url_map[itag] = format_url
1264 def _real_extract(self, url):
1265 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1266 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1268 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1269 mobj = re.search(self._NEXT_URL_RE, url)
1271 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1272 video_id = self._extract_id(url)
1275 self.report_video_webpage_download(video_id)
1276 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1277 request = compat_urllib_request.Request(url)
1279 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1280 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1281 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1283 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1285 # Attempt to extract SWF player URL
1286 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1287 if mobj is not None:
1288 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1293 self.report_video_info_webpage_download(video_id)
1294 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1295 self.report_age_confirmation()
1297 # We simulate the access to the video from www.youtube.com/v/{video_id}
1298 # this can be viewed without login into Youtube
1299 data = compat_urllib_parse.urlencode({'video_id': video_id,
1303 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1307 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1308 video_info_webpage = self._download_webpage(video_info_url, video_id,
1310 errnote='unable to download video info webpage')
1311 video_info = compat_parse_qs(video_info_webpage)
1314 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1315 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1316 % (video_id, el_type))
1317 video_info_webpage = self._download_webpage(video_info_url, video_id,
1319 errnote='unable to download video info webpage')
1320 video_info = compat_parse_qs(video_info_webpage)
1321 if 'token' in video_info:
1323 if 'token' not in video_info:
1324 if 'reason' in video_info:
1325 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1327 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1329 # Check for "rental" videos
1330 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1331 raise ExtractorError(u'"rental" videos not supported')
1333 # Start extracting information
1334 self.report_information_extraction(video_id)
1337 if 'author' not in video_info:
1338 raise ExtractorError(u'Unable to extract uploader name')
1339 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1342 video_uploader_id = None
1343 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1344 if mobj is not None:
1345 video_uploader_id = mobj.group(1)
1347 self._downloader.report_warning(u'unable to extract uploader nickname')
1350 if 'title' not in video_info:
1351 raise ExtractorError(u'Unable to extract video title')
1352 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1355 # We try first to get a high quality image:
1356 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1357 video_webpage, re.DOTALL)
1358 if m_thumb is not None:
1359 video_thumbnail = m_thumb.group(1)
1360 elif 'thumbnail_url' not in video_info:
1361 self._downloader.report_warning(u'unable to extract video thumbnail')
1362 video_thumbnail = ''
1363 else: # don't panic if we can't find it
1364 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1368 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1369 if mobj is not None:
1370 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1371 upload_date = unified_strdate(upload_date)
1374 video_description = get_element_by_id("eow-description", video_webpage)
1375 if video_description:
1376 video_description = clean_html(video_description)
1378 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1380 video_description = unescapeHTML(fd_mobj.group(1))
1382 video_description = u''
1385 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1387 if self._downloader.params.get('listsubtitles', False):
1388 self._list_available_subtitles(video_id, video_webpage)
1391 if 'length_seconds' not in video_info:
1392 self._downloader.report_warning(u'unable to extract video duration')
1395 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1397 # Decide which formats to download
1400 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1402 raise ValueError('Could not find vevo ID')
1403 info = json.loads(mobj.group(1))
1405 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1406 # this signatures are encrypted
1407 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1409 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1410 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1411 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1413 if 'url_encoded_fmt_stream_map' in video_info:
1414 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1416 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1417 elif 'adaptive_fmts' in video_info:
1418 if 'url_encoded_fmt_stream_map' in video_info:
1419 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1421 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1425 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1426 self.report_rtmp_download()
1427 video_url_list = [(None, video_info['conn'][0])]
1428 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1429 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1430 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1432 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1433 url_data = compat_parse_qs(url_data_str)
1434 if 'itag' in url_data and 'url' in url_data:
1435 url = url_data['url'][0]
1436 if 'sig' in url_data:
1437 url += '&signature=' + url_data['sig'][0]
1438 elif 's' in url_data:
1439 encrypted_sig = url_data['s'][0]
1440 if self._downloader.params.get('verbose'):
1442 player_version = self._search_regex(
1444 player_url if player_url else None,
1445 'flash player', fatal=False)
1446 player_desc = 'flash player %s' % player_version
1448 player_version = self._search_regex(
1449 r'html5player-(.+?)\.js', video_webpage,
1450 'html5 player', fatal=False)
1451 player_desc = u'html5 player %s' % player_version
1453 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1454 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1455 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1458 jsplayer_url_json = self._search_regex(
1459 r'"assets":.+?"js":\s*("[^"]+")',
1460 video_webpage, u'JS player URL')
1461 player_url = json.loads(jsplayer_url_json)
1463 signature = self._decrypt_signature(
1464 encrypted_sig, video_id, player_url, age_gate)
1465 url += '&signature=' + signature
1466 if 'ratebypass' not in url:
1467 url += '&ratebypass=yes'
1468 url_map[url_data['itag'][0]] = url
1469 video_url_list = self._get_video_url_list(url_map)
1470 if not video_url_list:
1472 elif video_info.get('hlsvp'):
1473 manifest_url = video_info['hlsvp'][0]
1474 url_map = self._extract_from_m3u8(manifest_url, video_id)
1475 video_url_list = self._get_video_url_list(url_map)
1476 if not video_url_list:
1480 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1483 for format_param, video_real_url in video_url_list:
1485 video_extension = self._video_extensions.get(format_param, 'flv')
1487 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1488 self._video_dimensions.get(format_param, '???'),
1489 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1493 'url': video_real_url,
1494 'uploader': video_uploader,
1495 'uploader_id': video_uploader_id,
1496 'upload_date': upload_date,
1497 'title': video_title,
1498 'ext': video_extension,
1499 'format': video_format,
1500 'thumbnail': video_thumbnail,
1501 'description': video_description,
1502 'player_url': player_url,
1503 'subtitles': video_subtitles,
1504 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract all videos of a YouTube playlist through the gdata API."""
    IE_DESC = u'YouTube.com playlists'
    # Matches playlist/course/artist/watch URLs carrying a p/a/list
    # parameter, /p/ short URLs, and bare playlist ids (PL/EC/UU/FL).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    # Page size of the gdata playlist feed.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default
        # suitable() (which matches without that flag) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The gdata API rejects start-index values beyond 1000.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Collect (position, url) pairs so the playlist order can be
            # restored after paging.
            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos uploaded to a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present in pages that still have a "load more" button.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from a channel page, in order of
        first appearance and without duplicates."""
        ids_in_page = []
        seen = set()  # O(1) membership checks instead of scanning the list
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop as soon as the "load more" widget disappears.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Avoid shadowing the builtin `id` while building the watch URLs.
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user through the gdata API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Result size per gdata query (currently capped at 50 by the API).
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another
        # youtube extractor; this regex is too permissive and would
        # match their URLs as well.
        other_ies = iter(klass for (name, klass) in globals().items()
                         if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers (last path component of the entry id)
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the gdata API ("ytsearchN:query")."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have
        # enough ids or the service reports fewer total items than n.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # Avoid shadowing the builtin `id` while building the results.
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Resolve a multi-season show page into its per-season playlists."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        show_name = match.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            playlist_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(playlist_url, 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Number of feed entries requested per paging step.
    # NOTE(review): restored from context — confirm against subclasses.
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Template must be a property: _real_extract applies `%` to it.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account, so authentication is mandatory
        # (_LOGIN_REQUIRED is True).
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            # `video_id` instead of shadowing the builtin `id`.
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed of new videos from the channels the logged-in user follows."""
    # Fixed missing space before "(requires authentication)" for
    # consistency with the sibling feed extractors.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed of videos YouTube recommends to the logged-in user."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """The logged-in user's personal "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch Later is account-specific, so the personal-feed action is used.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the backing playlist;
        # delegate the actual extraction to YoutubePlaylistIE.
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')