17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
24 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Endpoints used during initialization (login, language, age check).
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name used to look up credentials in the user's ~/.netrc.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        # Request English pages so later scraping sees stable markup.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Best effort only: failure to set the language just warns.
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # Body of the login step (its def line is elided here); called as
        # self._login() from _real_initialize below.
        (username, password) = self._get_login_info()
        # No authentication to be performed
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Anti-forgery tokens embedded in the served login form; they are
        # echoed back in the POST body below.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Fields of the ServiceLogin POST form:
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Getting the login form served back means the credentials failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _confirm_age(self):
        # POST the verify_age form so age-restricted pages become visible.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Unlike language/login above, a failed age check is fatal.
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_initialize(self):
        # One-time setup before extraction: language, login, age check.
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Fragments of the verbose-mode _VALID_URL regex (the assignment
    # line opening the r'''...''' literal is elided here).
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                     |youtu\.be/                                          # just youtu.be/xxxx
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                  # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    # Regex for the next_url redirect parameter (age gate and similar).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, reordered so that free (WebM) formats rank first.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      '85', '102', '84', '101', '83', '100', '82',
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      '172', '141', '171', '140', '139',
    # Container name -> itags for it, best quality first (used when the
    # user requests e.g. 'mp4' instead of a numeric itag).
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    # itag -> file extension (entries elided here).
    _video_extensions = {
        # Apple HTTP Live Streaming
    # itag -> human-readable dimensions (entries elided here).
    _video_dimensions = {
    # _TESTS entries; the surrounding list/dict delimiters are elided.
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
        u"file": u"1ltcDfZMA3U.flv",
        u"note": u"Test VEVO video (#897)",
        u"upload_date": u"20070518",
        u"title": u"Maps - It Will Find You",
        u"description": u"Music video by Maps performing It Will Find You.",
        u"uploader": u"MuteUSA",
        u"uploader_id": u"MuteUSA"
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
        u'file': u'TGi3HqYrWHE.mp4',
        u'note': u'm3u8 video',
        u'title': u'Triathlon - Men - London 2012 Olympic Games',
        u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
        u'uploader': u'olympic',
        u'upload_date': u'20120807',
        u'uploader_id': u'olympic',
        # Streaming (m3u8) test: only check metadata, skip the download.
        u'skip_download': True,
398 def suitable(cls, url):
399 """Receives a URL and returns True if suitable for this IE."""
400 if YoutubePlaylistIE.suitable(url): return False
401 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def __init__(self, *args, **kwargs):
        """Initialize the extractor and its per-player cache."""
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps player URL -> signature-deciphering callable, filled by
        # _decrypt_signature so each player is parsed at most once.
        self._player_cache = {}
    def report_video_webpage_download(self, video_id):
        """Report attempt to download the watch page for *video_id*."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download the video info page for *video_id*."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_information_extraction(self, video_id):
        """Report attempt to extract metadata for *video_id*."""
        self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested *format* is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
427 def _extract_signature_function(self, video_id, player_url, slen):
428 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
430 player_type = id_m.group('ext')
431 player_id = id_m.group('id')
433 # Read from filesystem cache
434 func_id = '%s_%s_%d' % (player_type, player_id, slen)
435 assert os.path.basename(func_id) == func_id
436 cache_dir = self.downloader.params.get('cachedir',
437 u'~/.youtube-dl/cache')
439 if cache_dir is not False:
440 cache_fn = os.path.join(os.path.expanduser(cache_dir),
444 with io.open(cache_fn, '', encoding='utf-8') as cachef:
445 cache_spec = json.load(cachef)
446 return lambda s: u''.join(s[i] for i in cache_spec)
448 pass # No cache available
450 if player_type == 'js':
451 code = self._download_webpage(
452 player_url, video_id,
453 note=u'Downloading %s player %s' % (player_type, player_id),
454 errnote=u'Download of %s failed' % player_url)
455 res = self._parse_sig_js(code)
456 elif player_type == 'swf':
457 urlh = self._request_webpage(
458 player_url, video_id,
459 note=u'Downloading %s player %s' % (player_type, player_id),
460 errnote=u'Download of %s failed' % player_url)
462 res = self._parse_sig_swf(code)
464 assert False, 'Invalid player type %r' % player_type
466 if cache_dir is not False:
467 cache_res = res(map(compat_chr, range(slen)))
468 cache_spec = [ord(c) for c in cache_res]
469 shutil.makedirs(os.path.dirname(cache_fn))
470 write_json_file(cache_spec, cache_fn)
    def _parse_sig_js(self, jscode):
        """Derive the signature-deciphering function from the player's
        JavaScript by interpreting the small JS subset it uses.
        """
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

            # NOTE(review): string.lowercase exists only on Python 2
            # (Python 3 renamed it to string.ascii_lowercase).
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            if allow_recursion < 0:
                # NOTE(review): 'ExctractorError' is misspelled -- hitting
                # the recursion limit raises NameError instead of the
                # intended ExtractorError.
                raise ExctractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            # Assignment statement: out[index]=expr or out=expr.
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
                if ass_m.groupdict().get('index'):
                    # Indexed assignment into a local variable.
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                    expr = ass_m.group('expr')
                        local_vars[ass_m.group('out')] = val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                expr = stmt[len(u'return '):]
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)

        def interpret_expression(expr, local_vars, allow_recursion):
            # Bare variable name -> its current value.
                return local_vars[expr]

            # Member access / method call on a variable.
            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                if member == 'join("")':
                if member == 'length':
                if member == 'reverse()':
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)

            # Indexing: var[idx].
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,

            # Binary modulo expression.
            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)

            # Function call name(arg, ...); callees are extracted lazily
            # and memoized in `functions`.
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Locate the named function's source in the player code...
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            argnames = func_m.group('args').split(',')

            # ...and interpret its statements one by one.
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)

        initial_function = extract_function(funcname)
        # The JS entry point takes the signature as its single argument.
        return lambda s: initial_function([s])
    def _parse_sig_swf(self, file_contents):
        """Extract the 'decipher' routine from the player SWF and return
        a Python equivalent, using a minimal AVM2 (ABC) interpreter.
        """
        # SWF magic: 'FWS' (uncompressed) or 'CWS' (zlib-compressed).
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # Everything after the 8-byte header is zlib-compressed.
            content = zlib.decompress(file_contents[8:])
            raise NotImplementedError(u'Unsupported compression format %r' %

        def extract_tags(content):
            # Walk the SWF tag stream, yielding (tag_code, tag_body).
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                # Upper 10 bits: tag code; lower 6 bits: short length.
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                    # Long tag: the real length follows as a 32-bit int.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])

        # Tag code 82 is DoABC (the ActionScript 3 bytecode block).
                        for tag_code, tag in extract_tags(content)
        # Skip flags (4 bytes) and the NUL-terminated tag name.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length unsigned integer (7 bits per byte,
            # high bit = continuation).
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)

        def u30(reader=None):
            # u30: like u32 but the top four bits must be clear.
            res = read_int(reader)
            assert res & 0xf0000000 == 0

        def s32(reader=None):
            # Signed variant: sign-extend a 32-bit two's complement value.
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)

        def string(reader=None):
            # Length-prefixed UTF-8 string.
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            resb = reader.read(count)
            assert len(resb) == count

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]

        # minor_version + major_version
        _ = read_bytes(2 + 2)

        # --- Constant pool; ABC pools are 1-based, hence range(1, n). ---
        for _c in range(1, int_count):
        for _c in range(1, uint_count):
        _ = read_bytes((double_count-1) * 8)
        constant_strings = [u'']
        for _c in range(1, string_count):
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            _ = read_bytes(1) # kind
        for _c in range(1, ns_set_count):
            for _c2 in range(count):
        multiname_count = u30()
        # Number of trailing u30 fields for each multiname kind.
            0x0e: 2, # MultinameA
            0x1b: 1, # MultinameL
            0x1c: 1, # MultinameLA
        for _c in range(1, multiname_count):
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
                namespace_idx = u30()
                multinames.append(constant_strings[name_idx])
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):

        # --- method_info entries (signatures only; bodies come later) ---
        MethodInfo = collections.namedtuple(
            ['NEED_ARGUMENTS', 'NEED_REST'])
        for method_id in range(method_count):
            _ = u30() # return type
            for _ in range(param_count):
                _ = u30() # param type
            _ = u30() # name index (always 0 for youtube)
            if flags & 0x08 != 0:
                # Optional parameter values present; skip them.
                for c in range(option_count):
                    _ = read_bytes(1) # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    _ = u30() # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # --- metadata (not needed; consumed and discarded) ---
        metadata_count = u30()
        for _c in range(metadata_count):
            for _c2 in range(item_count):

        def parse_traits_info():
            # Parse a single trait; for method-like kinds, record a
            # {trait name: method index} mapping for the caller.
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            if kind in [0x00, 0x06]: # Slot or Const
                type_name_idx = u30()
                    _ = read_byte() # vkind
            elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04: # Class
            elif kind == 0x05: # Function
                methods[function_idx] = multinames[trait_name_idx]
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0: # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):

        # --- instance_info: locate the deciphering class by name ---
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        for class_id in range(class_count):
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            _ = u30() # super_name idx
            if flags & 0x08 != 0: # Protected namespace is present
                protected_ns_idx = u30()
            for _c2 in range(intrf_count):
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %

        # --- class_info: collect the target class's method traits ---
        for class_id in range(class_count):
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        for name, idx in trait_methods.items()))

        # --- scripts (consumed and discarded) ---
        for _c in range(script_count):
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        # --- method bodies: keep bytecode for the methods we care about ---
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            init_scope_depth = u30()
            max_scope_depth = u30()
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
            for _c2 in range(trait_count):
                _ = parse_traits_info()

        # By now the whole DoABC tag must be consumed, and every wanted
        # method must have had a body.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Lazily (and memoized) turn the named ABC method into a
            # Python callable by interpreting its bytecode.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

                # Register 0 is the receiver; then arguments and locals.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                coder = io.BytesIO(m.code)
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36: # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                    elif opcode == 44: # pushstring
                        stack.append(constant_strings[idx])
                    elif opcode == 48: # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                    elif opcode == 70: # callproperty
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                                res = obj.split(args[0])
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                        elif mname in method_pyfunctions:
                            # Call to another method of the same class.
                            stack.append(method_pyfunctions[mname](args))
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                    elif opcode == 72: # returnvalue
                    elif opcode == 79: # callpropvoid
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'reverse':
                            # In-place list reversal; nothing is pushed.
                            assert isinstance(obj, list)
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                    elif opcode == 93: # findpropstrict
                        mname = multinames[index]
                        res = extract_function(mname)
                    elif opcode == 97: # setproperty
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                    elif opcode == 98: # getlocal
                        stack.append(registers[index])
                    elif opcode == 99: # setlocal
                        registers[index] = value
                    elif opcode == 102: # getproperty
                        pname = multinames[index]
                        if pname == u'length':
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else: # Assume attribute access
                            assert isinstance(idx, int)
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128: # coerce
                    elif opcode == 133: # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164: # modulo
                        res = value1 % value2
                    elif opcode == 208: # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209: # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210: # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211: # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214: # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215: # setlocal_3
                        registers[3] = stack.pop()
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc

        # The player's entry point is the class's 'decipher' method.
        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
            # Dynamic approach: derive the cipher from the actual player
            # code, memoizing the result per player URL.
                if player_url not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    self._player_cache[player_url] = func
                return self._player_cache[player_url](s)
            except Exception as e:
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

        # Dynamic extraction failed (or no player URL was found): fall
        # back to the hard-coded per-length algorithms.
        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')
        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        # Hard-coded signature reorderings, one per signature length;
        # the if/elif len(s) == N dispatch lines are elided here, so each
        # return below belongs to one specific length branch.
            # The videos with age protection use another player, so the
            # algorithms can be different.
            return s[2:63] + s[82] + s[64:82] + s[63]

            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[81:36:-1] + s[0] + s[35:2:-1]
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

            # No known algorithm for this signature length.
            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _decrypt_signature_age_gate(self, s):
        # The videos with age protection use another player, so the algorithms
        # can be different.
            return s[2:63] + s[82] + s[64:82] + s[63]
            # Fallback to the other algorithms.
            # NOTE(review): _decrypt_signature (above) also requires
            # video_id and player_url -- this call is missing arguments
            # and would raise TypeError if this branch is reached.
            return self._decrypt_signature(s)
    def _get_available_subtitles(self, video_id):
        """List the manually-created subtitle tracks of *video_id*,
        returning a dict that maps language code -> timedtext URL.
        """
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            # Best effort: a failed listing is reported, not fatal.
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        # Scrape (name, lang_code) pairs out of the track list markup.
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            # Build one timedtext URL per listed language.
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
        return sub_lang_list
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption (tts) URL lives in the embedded player config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            original_lang_node = caption_list.find('track')
            # Only 'asr' (automatic speech recognition) tracks qualify.
            if original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']

            # One translated-caption URL per available target language.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _print_formats(self, formats):
        """Print an itag / extension / dimensions line per format."""
        print('Available formats:')
        # (loop header over `formats` elided here)
            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                        self._video_dimensions.get(x, '???'),
                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
    def _extract_id(self, url):
        """Extract the 11-character video id from *url*."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 is the ([0-9A-Za-z_-]{11}) id group of _VALID_URL.
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Free (WebM) formats are ranked first when the user asked for it.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Drop everything above the user-imposed quality cap.
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimited sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if rf in self._video_formats_map:
                    # A container name expands to its itags, best first.
                    for srf in self._video_formats_map[rf]:
                            video_url_list = [(srf, url_map[srf])]
        if video_url_list is None:
            raise ExtractorError(u'requested format not available')
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Build an {itag: stream URL} map from an HLS (m3u8) manifest."""
        def _get_urls(_manifest):
            # Non-comment, non-blank manifest lines are the stream URLs.
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # The itag is embedded in the URL path ('.../itag/NN/...').
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
1217 def _real_extract(self, url):
1218 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1219 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1221 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1222 mobj = re.search(self._NEXT_URL_RE, url)
1224 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1225 video_id = self._extract_id(url)
1228 self.report_video_webpage_download(video_id)
1229 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1230 request = compat_urllib_request.Request(url)
1232 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1233 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1234 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1236 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1238 # Attempt to extract SWF player URL
1239 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1240 if mobj is not None:
1241 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1246 self.report_video_info_webpage_download(video_id)
1247 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1248 self.report_age_confirmation()
1250 # We simulate the access to the video from www.youtube.com/v/{video_id}
1251 # this can be viewed without login into Youtube
1252 data = compat_urllib_parse.urlencode({'video_id': video_id,
1256 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1260 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1261 video_info_webpage = self._download_webpage(video_info_url, video_id,
1263 errnote='unable to download video info webpage')
1264 video_info = compat_parse_qs(video_info_webpage)
1267 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1268 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1269 % (video_id, el_type))
1270 video_info_webpage = self._download_webpage(video_info_url, video_id,
1272 errnote='unable to download video info webpage')
1273 video_info = compat_parse_qs(video_info_webpage)
1274 if 'token' in video_info:
1276 if 'token' not in video_info:
1277 if 'reason' in video_info:
1278 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1280 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1282 # Check for "rental" videos
1283 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1284 raise ExtractorError(u'"rental" videos not supported')
1286 # Start extracting information
1287 self.report_information_extraction(video_id)
1290 if 'author' not in video_info:
1291 raise ExtractorError(u'Unable to extract uploader name')
1292 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1295 video_uploader_id = None
1296 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1297 if mobj is not None:
1298 video_uploader_id = mobj.group(1)
1300 self._downloader.report_warning(u'unable to extract uploader nickname')
1303 if 'title' not in video_info:
1304 raise ExtractorError(u'Unable to extract video title')
1305 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1308 # We try first to get a high quality image:
1309 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1310 video_webpage, re.DOTALL)
1311 if m_thumb is not None:
1312 video_thumbnail = m_thumb.group(1)
1313 elif 'thumbnail_url' not in video_info:
1314 self._downloader.report_warning(u'unable to extract video thumbnail')
1315 video_thumbnail = ''
1316 else: # don't panic if we can't find it
1317 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1321 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1322 if mobj is not None:
1323 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1324 upload_date = unified_strdate(upload_date)
1327 video_description = get_element_by_id("eow-description", video_webpage)
1328 if video_description:
1329 video_description = clean_html(video_description)
1331 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1333 video_description = unescapeHTML(fd_mobj.group(1))
1335 video_description = u''
1338 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1340 if self._downloader.params.get('listsubtitles', False):
1341 self._list_available_subtitles(video_id, video_webpage)
1344 if 'length_seconds' not in video_info:
1345 self._downloader.report_warning(u'unable to extract video duration')
1348 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1350 # Decide which formats to download
1353 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1355 raise ValueError('Could not find vevo ID')
1356 info = json.loads(mobj.group(1))
1358 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1359 # this signatures are encrypted
1360 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1362 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1363 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1364 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1366 if 'url_encoded_fmt_stream_map' in video_info:
1367 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1369 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1370 elif 'adaptive_fmts' in video_info:
1371 if 'url_encoded_fmt_stream_map' in video_info:
1372 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1374 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1378 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1379 self.report_rtmp_download()
1380 video_url_list = [(None, video_info['conn'][0])]
1381 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1382 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1383 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1385 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1386 url_data = compat_parse_qs(url_data_str)
1387 if 'itag' in url_data and 'url' in url_data:
1388 url = url_data['url'][0]
1389 if 'sig' in url_data:
1390 url += '&signature=' + url_data['sig'][0]
1391 elif 's' in url_data:
1392 encrypted_sig = url_data['s'][0]
1393 if self._downloader.params.get('verbose'):
1395 player_version = self._search_regex(
1397 player_url if player_url else None,
1398 'flash player', fatal=False)
1399 player_desc = 'flash player %s' % player_version
1401 player_version = self._search_regex(
1402 r'html5player-(.+?)\.js', video_webpage,
1403 'html5 player', fatal=False)
1404 player_desc = u'html5 player %s' % player_version
1406 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1407 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1408 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1411 jsplayer_url_json = self._search_regex(
1412 r'"assets":.+?"js":\s*("[^"]+")',
1413 video_webpage, u'JS player URL')
1414 player_url = json.loads(jsplayer_url_json)
1416 signature = self._decrypt_signature(
1417 encrypted_sig, video_id, player_url, age_gate)
1418 url += '&signature=' + signature
1419 if 'ratebypass' not in url:
1420 url += '&ratebypass=yes'
1421 url_map[url_data['itag'][0]] = url
1422 video_url_list = self._get_video_url_list(url_map)
1423 if not video_url_list:
1425 elif video_info.get('hlsvp'):
1426 manifest_url = video_info['hlsvp'][0]
1427 url_map = self._extract_from_m3u8(manifest_url, video_id)
1428 video_url_list = self._get_video_url_list(url_map)
1429 if not video_url_list:
1433 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1436 for format_param, video_real_url in video_url_list:
1438 video_extension = self._video_extensions.get(format_param, 'flv')
1440 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1441 self._video_dimensions.get(format_param, '???'),
1442 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1446 'url': video_real_url,
1447 'uploader': video_uploader,
1448 'uploader_id': video_uploader_id,
1449 'upload_date': upload_date,
1450 'title': video_title,
1451 'ext': video_extension,
1452 'format': video_format,
1453 'thumbnail': video_thumbnail,
1454 'description': video_description,
1455 'player_url': player_url,
1456 'subtitles': video_subtitles,
1457 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract the videos of a YouTube playlist through the GData API."""
    IE_DESC = u'YouTube.com playlists'
    # Verbose pattern: matched with re.VERBOSE in suitable()/_real_extract(),
    # so the whitespace inside the string is ignored by the regex engine.
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # GData playlist feed: playlist id, page size, 1-based start index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose pattern and must be
        # matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        # group(1)/group(2) are the two id-capturing groups of _VALID_URL;
        # whichever alternative matched supplies the id.
        playlist_id = mobj.group(1) or mobj.group(2)
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # NOTE(review): presumably the GData API refuses start
                # indices >= 1000 — confirm against the API docs.
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
            # Collect (position, watch URL) pairs so the playlist order can
            # be restored after paging.
            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']

        # Sort by the yt$position index, then drop it.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos listed on a YouTube channel page."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is fetched as plain HTML from the channel's /videos listing.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker whose presence in a page means that more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # Subsequent pages come from the JSON-based c4_browse_ajax endpoint.
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect the distinct video ids linked from a channel page."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            # Preserve first-seen order while dropping duplicate links.
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                # The ajax endpoint answers JSON; the video links live in
                # its 'content_html' field.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the load-more widget no longer offers a next page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum number of results the GData uploads feed returns per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; this regex is too permissive and it would match them.
        # Scans module globals for every other *IE class and defers to them.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got

        for pagenum in itertools.count(0):
            # GData start indices are 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Extract video identifiers
            # The entry id looks like a URL; its last path segment is the
            # video id.
            for entry in response['feed']['entry']:
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the GData API ("ytsearchN:query")."""
    IE_DESC = u'YouTube.com searches'
    # GData search endpoint: quoted query and 1-based start index; the page
    # size is hard-coded to 50 results per request.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            # The jsonc response wraps the actual payload in a 'data' field.
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Don't page past what the API reports as available.
            limit = min(n, api_response['totalItems'])

        # Requests are made 50 at a time, so trim any overshoot past n.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extract the season playlists of a YouTube (multi-season) show page.

    Returns one url_result per season playlist found on the show page;
    the actual video extraction is delegated to YoutubePlaylistIE.
    """
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with the same explicit error the other extractors in this
            # file raise, instead of an AttributeError on mobj.group below.
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are tied to an account, so a login is mandatory.
    _LOGIN_REQUIRED = True

    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Build the feed_ajax URL; '%%s' leaves a literal '%s' placeholder
        # for the paging offset that _real_extract fills in per page.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):

    def _real_extract(self, url):
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            # The endpoint answers JSON whose 'feed_html' field holds the
            # markup containing this page's /watch?v= links.
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # orderedSet keeps only first occurrences, preserving feed order.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the last page of the feed.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's recommended-videos feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's "Watch Later" list."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # The watch-later list is per-user, so use the personal-feed action
    # of the feed_ajax endpoint instead of the system feed.
    _PERSONAL_FEED = True
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = u'youtube:favorites'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # The favourites page is only visible to a logged-in account.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of an ordinary playlist; pull
        # it out and delegate the real extraction to YoutubePlaylistIE.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_list, 'YoutubePlaylist')