15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
22 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        # Request the language URL so YouTube stores an English/US preference
        # in cookies; failure is reported as a warning only, not fatal.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the opening 'try:' of this try/except appears to be
        # missing from this copy of the file — confirm against upstream.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

    # NOTE(review): the 'def _login(self):' header appears to be missing from
    # this copy; the lines below are the body of the login routine.
        (username, password) = self._get_login_info()
        # No authentication to be performed
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX and dsh anti-forgery tokens from the page.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Hidden form fields posted along with the credentials.
        # NOTE(review): several dict entries (Email, Passwd, GALX, dsh, ...)
        # appear to be missing from this copy.
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the response still contains the login form, auth failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _confirm_age(self):
        # POST the age-verification form; unlike login, failure here is fatal
        # (the video can't be fetched without it).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_initialize(self):
        # InfoExtractor hook run before extraction: set language, then log in.
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose (?x) pattern covering the many URL shapes YouTube uses;
    # group 2 captures the 11-character video ID (see _extract_id below).
    # NOTE(review): the opening of the raw-string assignment is missing from
    # this copy.
        (?:https?://)?                                       # http(s):// (optional)
        (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/|
           youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
         (?:                                                  # the various things that can precede the ID:
             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
             |(?:                                             # or the v= param in all its forms
                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
         |youtu\.be/                                          # just youtu.be/xxxx
         )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]{11})                                   # here is it! the YouTube video ID
        (?(1).+)?                                             # if we found the ID, everything can follow
    # Extracts the real target from redirect-style URLs (age gate etc.).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, but free (WebM) containers preferred at equal quality.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      '85', '102', '84', '101', '83', '100', '82',
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      '172', '141', '171', '140', '139',
    # Container extension -> itags (best first); used when a format is
    # requested by extension (e.g. 'mp4') instead of a numeric itag.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    # itag -> file extension table (entries missing from this copy).
    _video_extensions = {
        # Apple HTTP Live Streaming
    # itag -> "WxH" display-size table (entries missing from this copy).
    _video_dimensions = {
    # Test fixtures; many surrounding lines are missing from this copy.
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
        u"file": u"1ltcDfZMA3U.flv",
        u"note": u"Test VEVO video (#897)",
        u"upload_date": u"20070518",
        u"title": u"Maps - It Will Find You",
        u"description": u"Music video by Maps performing It Will Find You.",
        u"uploader": u"MuteUSA",
        u"uploader_id": u"MuteUSA"
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
        u'file': u'TGi3HqYrWHE.mp4',
        u'note': u'm3u8 video',
        u'title': u'Triathlon - Men - London 2012 Olympic Games',
        u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
        u'uploader': u'olympic',
        u'upload_date': u'20120807',
        u'uploader_id': u'olympic',
        u'skip_download': True,
396 def suitable(cls, url):
397 """Receives a URL and returns True if suitable for this IE."""
398 if YoutubePlaylistIE.suitable(url): return False
399 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Cache of jsplayer_url -> signature-decryption callable, filled
        # lazily by _decrypt_signature.
        self._jsplayer_cache = {}
    # Progress-reporting helper; output goes through InfoExtractor.to_screen.
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
    # Progress-reporting helper; output goes through InfoExtractor.to_screen.
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
    # Progress-reporting helper; output goes through InfoExtractor.to_screen.
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
425 def _extract_signature_function(self, video_id, player_url):
426 id_m = re.match(r'.*-(?P<id>[^.]+)\.(?P<ext>[^.]+)$', player_url)
427 player_type = id_m.group('ext')
428 player_id = id_m.group('id')
430 if player_type == 'js':
431 code = self._download_webpage(
432 player_url, video_id,
433 note=u'Downloading %s player %s' % (player_type, jsplayer_id),
434 errnote=u'Download of %s failed' % player_url)
435 return self._parse_sig_js(code)
436 elif player_tpye == 'swf':
437 urlh = self._request_webpage(
438 player_url, video_id,
439 note=u'Downloading %s player %s' % (player_type, jsplayer_id),
440 errnote=u'Download of %s failed' % player_url)
442 return self._parse_sig_swf(code)
444 assert False, 'Invalid player type %r' % player_type
    def _parse_sig_js(self, jscode):
        """Build the signature-decryption function by mini-interpreting the
        JavaScript player source *jscode*."""
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

        # NOTE(review): helper definitions preceding this return are missing
        # from this copy.
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            # Execute one JS statement; allow_recursion bounds nested calls.
            if allow_recursion < 0:
                # FIXME(review): 'ExctractorError' is misspelled — this raises
                # NameError at runtime instead of ExtractorError.
                raise ExctractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
                if ass_m.groupdict().get('index'):
                    # Assignment to an indexed element, e.g. a[i]=...
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                    expr = ass_m.group('expr')
                    # Plain variable assignment
                        local_vars[ass_m.group('out')] = val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                expr = stmt[len(u'return '):]
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)

        def interpret_expression(expr, local_vars, allow_recursion):
            # Bare variable name
                return local_vars[expr]

            # Member access / method call, e.g. a.split("")
            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                if member == 'join("")':
                if member == 'length':
                if member == 'reverse()':
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)

            # Indexing, e.g. a[i]
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,

            # Binary modulo expression
            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)

            # Function call, e.g. f(a,1) — functions are extracted on demand
            # and memoized in `functions`.
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Find the named function in the JS source and compile it into a
            # Python callable by interpreting its statements one by one.
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            argnames = func_m.group('args').split(',')

                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)

        initial_function = extract_function(funcname)
        # The signature function takes a single string argument.
        return lambda s: initial_function([s])
    def _parse_sig_swf(self, file_contents):
        """Build the signature-decryption function from an SWF (Flash) player:
        parse the ABC (AVM2 bytecode) block, locate the SignatureDecipher
        class, and interpret its 'decipher' method."""
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'CWS' header: body after the 8-byte header is zlib-compressed.
            content = zlib.decompress(file_contents[8:])
            raise NotImplementedError(u'Unsupported compression format %r' %

        def extract_tags(content):
            # Yield (tag_code, tag_body) pairs of the SWF tag stream.
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                    # Long tag: real length follows in the next 4 bytes.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])

            for tag_code, tag in extract_tags(content)
        p = code_tag.index(b'\0', 4) + 1

        # Parse ABC (AVM2 ByteCode)
        def read_int(data=None, pos=None):
            # Variable-length integer, read either from a file-like object or
            # from code_tag at *pos* (then returns (value, new_pos)).
            if hasattr(data, 'read'):
                    b = struct.unpack('<B', buf)[0]
                    res = res | ((b & 0x7f) << shift)
                b = struct.unpack('<B', data[pos:pos+1])[0]
                res = res | ((b & 0x7f) << shift)
        # Self-tests of the variable-length integer decoder.
        assert read_int(b'\x00', 0) == (0, 1)
        assert read_int(b'\x10', 0) == (16, 1)
        assert read_int(b'\x34', 0) == (0x34, 1)
        assert read_int(b'\xb4\x12', 0) == (0x12 * 0x80 + 0x34, 2)
        assert read_int(b'\xff\xff\xff\x00', 0) == (0x1fffff, 4)

        def u30(*args, **kwargs):
            # Unsigned 30-bit integer: top bits of the u32 must be clear.
            res = read_int(*args, **kwargs)
            if isinstance(res, tuple):
                assert res[0] & 0xf0000000 == 0
                assert res & 0xf0000000 == 0

        def s32(data=None, pos=None):
            # Signed 32-bit integer (two's complement).
            v, pos = read_int(data, pos)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
        assert s32(b'\xff\xff\xff\xff\x0f', 0) == (-1, 5)

        # NOTE(review): string-reader definition partially missing here.
            return (code_tag[p:p+slen].decode('utf-8'), p + slen)

        def read_byte(data=None, pos=None):
            res = struct.unpack('<B', data[pos:pos+1])[0]
            return (res, pos + 1)

        # minor_version + major_version
        # --- constant pool (ints/uints/doubles skipped, strings kept) ---
        for _c in range(1, int_count):
        uint_count, p = u30()
        for _c in range(1, uint_count):
        double_count, p = u30()
        p += (double_count-1) * 8
        string_count, p = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            constant_strings.append(s)
        namespace_count, p = u30()
        for _c in range(1, namespace_count):
        ns_set_count, p = u30()
        for _c in range(1, ns_set_count):
            for _c2 in range(count):
        multiname_count, p = u30()
            0x0e: 2, # MultinameA
            0x1b: 1, # MultinameL
            0x1c: 1, # MultinameLA
        for _c in range(1, multiname_count):
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
                namespace_idx, p = u30()
                multinames.append(constant_strings[name_idx])
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):

        # --- method signatures ---
        method_count, p = u30()
        MethodInfo = collections.namedtuple(
            ['NEED_ARGUMENTS', 'NEED_REST'])
        for method_id in range(method_count):
            param_count, p = u30()
            _, p = u30() # return type
            for _ in range(param_count):
                _, p = u30() # param type
            _, p = u30() # name index (always 0 for youtube)
            flags, p = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count, p = u30()
                for c in range(option_count):
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    _, p = u30() # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # --- metadata (skipped) ---
        metadata_count, p = u30()
        for _c in range(metadata_count):
            item_count, p = u30()
            for _c2 in range(item_count):

        def parse_traits_info(pos=None):
            # Parse one trait entry; method/function traits are recorded in
            # `methods`, everything else is skipped.  Returns (methods, pos).
            trait_name_idx, pos = u30(pos=pos)
            kind_full, pos = read_byte(pos=pos)
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            if kind in [0x00, 0x06]: # Slot or Const
                _, pos = u30(pos=pos) # Slot id
                type_name_idx, pos = u30(pos=pos)
                vindex, pos = u30(pos=pos)
                    _, pos = read_byte(pos=pos) # vkind
            elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
                _, pos = u30(pos=pos) # disp_id
                method_idx, pos = u30(pos=pos)
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04: # Class
                _, pos = u30(pos=pos) # slot_id
                _, pos = u30(pos=pos) # classi
            elif kind == 0x05: # Function
                _, pos = u30(pos=pos) # slot_id
                function_idx, pos = u30(pos=pos)
                methods[function_idx] = multinames[trait_name_idx]
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0: # Metadata present
                metadata_count, pos = u30(pos=pos)
                for _c3 in range(metadata_count):
                    _, pos = u30(pos=pos)

            return (methods, pos)

        # --- instance info: locate the SignatureDecipher class ---
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count, p = u30()
        for class_id in range(class_count):
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            _, p = u30() # super_name idx
            flags, p = read_byte()
            if flags & 0x08 != 0: # Protected namespace is present
                protected_ns_idx, p = u30()
            intrf_count, p = u30()
            for _c2 in range(intrf_count):
            trait_count, p = u30()
            for _c2 in range(trait_count):
                _, p = parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %

        # --- class info: collect the target class's method traits ---
        for class_id in range(class_count):
            trait_count, p = u30()
            for _c2 in range(trait_count):
                trait_methods, p = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        for name, idx in trait_methods.items()))

        # --- scripts (skipped) ---
        script_count, p = u30()
        for _c in range(script_count):
            trait_count, p = u30()
            for _c2 in range(trait_count):
                _, p = parse_traits_info()

        # --- method bodies: keep bytecode only for the methods we need ---
        method_body_count, p = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            method_idx, p = u30()
            local_count, p = u30()
            init_scope_depth, p = u30()
            max_scope_depth, p = u30()
            code_length, p = u30()
            if method_idx in method_idxs:
                m = Method(code_tag[p:p+code_length], local_count)
                methods[method_idxs[method_idx]] = m
            exception_count, p = u30()
            for _c2 in range(exception_count):
                _, p = u30() # target
                _, p = u30() # exc_type
                _, p = u30() # var_name
            trait_count, p = u30()
            for _c2 in range(trait_count):
                _, p = parse_traits_info()

        assert p == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Compile (memoized) the named ABC method into a Python callable
            # by interpreting its AVM2 bytecode with a small stack machine.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

                # NOTE(review): unconditional debug print — probably meant to
                # be behind a debug flag; confirm against upstream.
                print('Entering function %s(%r)' % (func_name, args))
                registers = ['(this)'] + list(args) + [None] * m.local_count
                coder = io.BytesIO(m.code)
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 208: # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209: # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210: # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 36: # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                    elif opcode == 44: # pushstring
                        stack.append(constant_strings[idx])
                    elif opcode == 48: # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                    elif opcode == 70: # callproperty
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            res = obj.split(args[0])
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                    elif opcode == 93: # findpropstrict
                        mname = multinames[index]
                        res = extract_function(mname)
                    elif opcode == 97: # setproperty
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                    elif opcode == 98: # getlocal
                        stack.append(registers[index])
                    elif opcode == 99: # setlocal
                        registers[index] = value
                    elif opcode == 102: # getproperty
                        pname = multinames[index]
                        if pname == u'length':
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else: # Assume attribute access
                            assert isinstance(idx, int)
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128: # coerce
                    elif opcode == 133: # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164: # modulo
                        res = value1 % value2
                    elif opcode == 214: # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215: # setlocal_3
                        registers[3] = stack.pop()
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc

        initial_function = extract_function(u'decipher')
        # The decipher function takes the signature string as its only arg.
        return lambda s: initial_function([s])
    def _decrypt_signature(self, s, video_id, jsplayer_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if jsplayer_url is not None:
            # Preferred path: extract the exact algorithm from the player the
            # page references; cache the resulting callable per player URL.
                if jsplayer_url not in self._jsplayer_cache:
                    self._jsplayer_cache[jsplayer_url] = self._extract_signature_function(
                        video_id, jsplayer_url
                return self._jsplayer_cache[jsplayer_url]([s])
            except Exception as e:
                tb = traceback.format_exc()
                self._downloader.report_warning(u'Automatic signature extraction failed: ' + tb)

            self._downloader.report_warning(u'Warning: Falling back to static signature algorithm')

        # Static fallback: one hard-coded permutation per signature length.
        # NOTE(review): the 'if len(s) == NN:' dispatch lines are missing from
        # this copy; only the permutation expressions remain.
            # The videos with age protection use another player, so the
            # algorithms can be different.
            return s[2:63] + s[82] + s[64:82] + s[63]
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[81:36:-1] + s[0] + s[35:2:-1]
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

            raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _decrypt_signature_age_gate(self, s):
        # The videos with age protection use another player, so the algorithms
        # can be different.
            return s[2:63] + s[82] + s[64:82] + s[63]
            # Fallback to the other algorithms
            # FIXME(review): _decrypt_signature is defined above as
            # (self, s, video_id, jsplayer_url, age_gate=False); calling it
            # with only `s` raises TypeError — missing arguments.
            return self._decrypt_signature(s)
    def _get_available_subtitles(self, video_id):
        """Return a dict mapping subtitle language -> timedtext URL; failures
        are reported as warnings, not raised."""
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        # (name, lang_code) pairs from the subtitle list XML.
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
        return sub_lang_list
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL lives inside the inline ytplayer.config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
            player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            # The 'track' node with kind="asr" marks auto-generated captions.
            original_lang_node = caption_list.find('track')
            if original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']

            # Build one caption URL per available translation target.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _print_formats(self, formats):
        """Print a table of itag / extension / dimensions for each format."""
        print('Available formats:')
        # NOTE(review): the 'for x in formats:' line appears to be missing
        # from this copy.
            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                        self._video_dimensions.get(x, '???'),
                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
    def _extract_id(self, url):
        """Return the 11-character video ID (group 2 of _VALID_URL) from
        *url*; raises ExtractorError when the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Quality ordering depends on the prefer_free_formats option.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimited sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if rf in self._video_formats_map:
                    for srf in self._video_formats_map[rf]:
                            video_url_list = [(srf, url_map[srf])]
        if video_url_list is None:
            raise ExtractorError(u'requested format not available')
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Download an m3u8 manifest and return a mapping itag -> stream URL
        (the itag is parsed out of each stream URL's path)."""
        def _get_urls(_manifest):
            lines = _manifest.split('\n')
            # Keep only non-empty, non-comment lines — the stream URLs.
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
1171 def _real_extract(self, url):
1172 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1173 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1175 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1176 mobj = re.search(self._NEXT_URL_RE, url)
1178 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1179 video_id = self._extract_id(url)
1182 self.report_video_webpage_download(video_id)
1183 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1184 request = compat_urllib_request.Request(url)
1186 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1187 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1188 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1190 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1192 # Attempt to extract SWF player URL
1193 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1194 if mobj is not None:
1195 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1200 self.report_video_info_webpage_download(video_id)
1201 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1202 self.report_age_confirmation()
1204 # We simulate the access to the video from www.youtube.com/v/{video_id}
1205 # this can be viewed without login into Youtube
1206 data = compat_urllib_parse.urlencode({'video_id': video_id,
1210 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1214 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1215 video_info_webpage = self._download_webpage(video_info_url, video_id,
1217 errnote='unable to download video info webpage')
1218 video_info = compat_parse_qs(video_info_webpage)
1221 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1222 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1223 % (video_id, el_type))
1224 video_info_webpage = self._download_webpage(video_info_url, video_id,
1226 errnote='unable to download video info webpage')
1227 video_info = compat_parse_qs(video_info_webpage)
1228 if 'token' in video_info:
1230 if 'token' not in video_info:
1231 if 'reason' in video_info:
1232 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1234 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1236 # Check for "rental" videos
1237 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1238 raise ExtractorError(u'"rental" videos not supported')
1240 # Start extracting information
1241 self.report_information_extraction(video_id)
1244 if 'author' not in video_info:
1245 raise ExtractorError(u'Unable to extract uploader name')
1246 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1249 video_uploader_id = None
1250 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1251 if mobj is not None:
1252 video_uploader_id = mobj.group(1)
1254 self._downloader.report_warning(u'unable to extract uploader nickname')
1257 if 'title' not in video_info:
1258 raise ExtractorError(u'Unable to extract video title')
1259 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1262 # We try first to get a high quality image:
1263 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1264 video_webpage, re.DOTALL)
1265 if m_thumb is not None:
1266 video_thumbnail = m_thumb.group(1)
1267 elif 'thumbnail_url' not in video_info:
1268 self._downloader.report_warning(u'unable to extract video thumbnail')
1269 video_thumbnail = ''
1270 else: # don't panic if we can't find it
1271 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1275 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1276 if mobj is not None:
1277 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1278 upload_date = unified_strdate(upload_date)
1281 video_description = get_element_by_id("eow-description", video_webpage)
1282 if video_description:
1283 video_description = clean_html(video_description)
1285 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1287 video_description = unescapeHTML(fd_mobj.group(1))
1289 video_description = u''
1292 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1294 if self._downloader.params.get('listsubtitles', False):
1295 self._list_available_subtitles(video_id, video_webpage)
1298 if 'length_seconds' not in video_info:
1299 self._downloader.report_warning(u'unable to extract video duration')
1302 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1304 # Decide which formats to download
1307 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1309 raise ValueError('Could not find vevo ID')
1310 info = json.loads(mobj.group(1))
1312 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1313 # this signatures are encrypted
1314 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1316 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1317 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1318 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1320 if 'url_encoded_fmt_stream_map' in video_info:
1321 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1323 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1324 elif 'adaptive_fmts' in video_info:
1325 if 'url_encoded_fmt_stream_map' in video_info:
1326 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1328 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1332 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1333 self.report_rtmp_download()
1334 video_url_list = [(None, video_info['conn'][0])]
1335 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1336 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1337 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1339 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1340 url_data = compat_parse_qs(url_data_str)
1341 if 'itag' in url_data and 'url' in url_data:
1342 url = url_data['url'][0]
1343 if 'sig' in url_data:
1344 url += '&signature=' + url_data['sig'][0]
1345 elif 's' in url_data:
1346 encrypted_sig = url_data['s'][0]
1347 if self._downloader.params.get('verbose'):
1349 player_version = self._search_regex(r'-(.+)\.swf$',
1350 player_url if player_url else 'NOT FOUND',
1351 'flash player', fatal=False)
1352 player_desc = 'flash player %s' % player_version
1354 player_version = self._search_regex(r'html5player-(.+?)\.js', video_webpage,
1355 'html5 player', fatal=False)
1356 player_desc = u'html5 player %s' % player_version
1358 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1359 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1360 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1365 jsplayer_url_json = self._search_regex(
1366 r'"assets":.+?"js":\s*("[^"]+")',
1367 video_webpage, u'JS player URL')
1368 jsplayer_url = json.loads(jsplayer_url_json)
1370 signature = self._decrypt_signature(encrypted_sig, video_id, jsplayer_url, age_gate)
1371 url += '&signature=' + signature
1372 if 'ratebypass' not in url:
1373 url += '&ratebypass=yes'
1374 url_map[url_data['itag'][0]] = url
1375 video_url_list = self._get_video_url_list(url_map)
1376 if not video_url_list:
1378 elif video_info.get('hlsvp'):
1379 manifest_url = video_info['hlsvp'][0]
1380 url_map = self._extract_from_m3u8(manifest_url, video_id)
1381 video_url_list = self._get_video_url_list(url_map)
1382 if not video_url_list:
1386 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1389 for format_param, video_real_url in video_url_list:
1391 video_extension = self._video_extensions.get(format_param, 'flv')
1393 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1394 self._video_dimensions.get(format_param, '???'),
1395 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1399 'url': video_real_url,
1400 'uploader': video_uploader,
1401 'uploader_id': video_uploader_id,
1402 'upload_date': upload_date,
1403 'title': video_title,
1404 'ext': video_extension,
1405 'format': video_format,
1406 'thumbnail': video_thumbnail,
1407 'description': video_description,
1408 'player_url': player_url,
1409 'subtitles': video_subtitles,
1410 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract all videos of a YouTube playlist through the GData API.

    Returns a single playlist_result whose entries are url_results
    pointing each video at the Youtube IE.
    """
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    # Page size of the GData playlist feed (also its per-request maximum).
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # (which matches without that flag) cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The GData API rejects start indexes of 1000 and above.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Sort by the playlist position reported by the API, then drop it.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos of a YouTube channel.

    The first page is scraped from the channel's HTML listing; subsequent
    pages are fetched through the JSON-based c4_browse_ajax endpoint.
    """
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present in a page whenever more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the deduplicated list of video ids linked from *page*."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The widget HTML tells us whether another page follows.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user through the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum number of results the GData uploads feed returns per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; this regex is too permissive and it would match.
        # NOTE: scans module globals for sibling *IE classes — assumes all
        # YouTube extractors live in this module.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._GDATA_PAGE_SIZE
                break

            # Extract video identifiers
            ids_in_page = []
            for entry in response['feed']['entry']:
                # The entry id looks like '.../videos/<video_id>'.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the GData API ("ytsearch" keyword)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # 'limit' is refined from the API's totalItems once the first
        # response arrives, so we never request past the real result count.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Expand a YouTube show page into one playlist entry per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist; collect
        # every playlist link on the page.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            season_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(season_url, 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Number of entries the feed_ajax endpoint advances per page.
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for the feed; '%s' placeholder takes the paging offset."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds require an authenticated session (see _LOGIN_REQUIRED).
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher,
        # so compute the paging offset by hand.
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the last page of the feed.
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extracts the authenticated user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extracts the authenticated user's recommended-videos feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extracts the authenticated user's "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # The watch-later list is account-specific, so the base class must use
    # the personal-feed AJAX action rather than the system-feed one.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolves the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are only visible from an authenticated session.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds a 'list=...' playlist id; hand the id to
        # YoutubePlaylistIE via url_result rather than scraping videos here.
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')