15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
22 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account login endpoint used by _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # URL that forces the site language to English so later regex-based
    # scraping sees stable markup (see _set_language()).
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Age-verification form endpoint used by _confirm_age().
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False
def report_lang(self):
    """Announce on screen that the site language is being set."""
    message = u'Setting language'
    self.to_screen(message)
def _set_language(self):
    # Hit _LANG_URL so youtube serves English pages (stable for scraping).
    request = compat_urllib_request.Request(self._LANG_URL)
    # NOTE(review): the `try:` that should pair with the `except` below is
    # not present in this snapshot — verify against the upstream source.
    compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # Failure to force the language is non-fatal; only warn.
        self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# --- body of _login() (the `def` line and several interior lines are
# missing from this snapshot; surviving code is kept verbatim) ---
(username, password) = self._get_login_info()
# No authentication to be performed
if self._LOGIN_REQUIRED:
    raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

# Fetch the login page first: it carries hidden form tokens (GALX, dsh)
# that must be echoed back with the credentials.
request = compat_urllib_request.Request(self._LOGIN_URL)
login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
    self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

# Fragments of the login form dict (opening/closing lines missing here):
u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
u'PersistentCookie': u'yes',
u'bgresponse': u'js_disabled',
u'checkConnection': u'',
u'checkedDomains': u'youtube',
u'signIn': u'Sign in',
u'service': u'youtube',

# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the response still contains the login form, authentication failed.
if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
    self._downloader.report_warning(u'unable to log in: bad username or password')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
    self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
def _confirm_age(self):
    # Submit the age-verification form at _AGE_URL. The form dict's opening
    # and the `try:` for the `except` below are missing from this snapshot.
    'action_confirm': 'Confirm',
    request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
    self.report_age_confirmation()
    compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # Unlike _set_language, failure here is fatal for the extraction.
        raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
def _real_initialize(self):
    # Hook called by the framework before extraction: sets site language
    # and performs the optional login. NOTE(review): the early-return
    # bodies of these guards are missing from this snapshot.
    if self._downloader is None:
    if not self._set_language():
    if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    # NOTE(review): this region of the snapshot is heavily truncated — the
    # `_VALID_URL = r'''...'''` assignment line, parts of the itag tables,
    # the `_video_extensions`/`_video_dimensions` dict bodies and much of
    # the _TESTS fixtures are missing. Surviving code is kept verbatim.
    IE_DESC = u'YouTube.com'
    (?:https?://)? # http(s):// (optional)
    (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/|
    youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
    (?:.*?\#/)? # handle anchor (#/) redirect urls
    (?: # the various things that can precede the ID:
    (?:(?:v|embed|e)/) # v/ or embed/ or e/
    |(?: # or the v= param in all its forms
    (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
    (?:\?|\#!?) # the params delimiter ? or # or #!
    (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
    |youtu\.be/ # just youtu.be/xxxx
    )? # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
    (?(1).+)? # if we found the ID, everything can follow
    # Regex extracting the original URL out of redirect-style watch URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
    # Apple HTTP Live Streaming
    '96', '95', '94', '93', '92', '132', '151',
    '85', '84', '102', '83', '101', '82', '100',
    '138', '137', '248', '136', '247', '135', '246',
    '245', '244', '134', '243', '133', '242', '160',
    '141', '172', '140', '171', '139',
    # Same itags, but free (webm) containers ranked ahead of non-free ones.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
    # Apple HTTP Live Streaming
    '96', '95', '94', '93', '92', '132', '151',
    '85', '102', '84', '101', '83', '100', '82',
    '138', '248', '137', '247', '136', '246', '245',
    '244', '135', '243', '134', '242', '133', '160',
    '172', '141', '171', '140', '139',
    # Container -> itags, best first (consumed by _get_video_url_list).
    _video_formats_map = {
    'flv': ['35', '34', '6', '5'],
    '3gp': ['36', '17', '13'],
    'mp4': ['38', '37', '22', '18'],
    'webm': ['46', '45', '44', '43'],
    _video_extensions = {
    # Apple HTTP Live Streaming
    _video_dimensions = {
    # Fragments of the _TESTS fixtures:
    u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
    u"file": u"BaW_jenozKc.mp4",
    u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
    u"uploader": u"Philipp Hagemeister",
    u"uploader_id": u"phihag",
    u"upload_date": u"20121002",
    u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
    u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
    u"file": u"1ltcDfZMA3U.flv",
    u"note": u"Test VEVO video (#897)",
    u"upload_date": u"20070518",
    u"title": u"Maps - It Will Find You",
    u"description": u"Music video by Maps performing It Will Find You.",
    u"uploader": u"MuteUSA",
    u"uploader_id": u"MuteUSA"
    u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
    u"file": u"UxxajLWwzqY.mp4",
    u"note": u"Test generic use_cipher_signature video (#897)",
    u"upload_date": u"20120506",
    u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
    u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
    u"uploader": u"Icona Pop",
    u"uploader_id": u"IconaPop"
    u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
    u"file": u"07FYdnEawAQ.mp4",
    u"note": u"Test VEVO video with age protection (#956)",
    u"upload_date": u"20130703",
    u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
    u"description": u"md5:64249768eec3bc4276236606ea996373",
    u"uploader": u"justintimberlakeVEVO",
    u"uploader_id": u"justintimberlakeVEVO"
    u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
    u'file': u'TGi3HqYrWHE.mp4',
    u'note': u'm3u8 video',
    u'title': u'Triathlon - Men - London 2012 Olympic Games',
    u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
    u'uploader': u'olympic',
    u'upload_date': u'20120807',
    u'uploader_id': u'olympic',
    u'skip_download': True,
def suitable(cls, url):
    """Return True if *url* should be handled by this extractor.

    Playlist links are rejected up front so that YoutubePlaylistIE
    gets a chance to claim them instead.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def __init__(self, *args, **kwargs):
    """Initialize the extractor and the per-player signature cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps player_url -> deciphering function (see _decrypt_signature).
    self._player_cache = {}
def report_video_webpage_download(self, video_id):
    """Announce that the watch page for *video_id* is being fetched."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the get_video_info page is being fetched."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce that a requested format is not offered for this video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Announce that the download will go over RTMP."""
    message = u'RTMP download detected'
    self.to_screen(message)
def _extract_signature_function(self, video_id, player_url):
    """Download the player referenced by *player_url* and build a
    signature-deciphering function from it.

    The player id and type (js/swf) are parsed from the URL's filename;
    JS players go through _parse_sig_js, SWF players through
    _parse_sig_swf. Returns a callable mapping an encrypted signature
    string to the clear signature.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # TODO read from filesystem cache

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    # BUG FIX: this branch compared the misspelled name `player_tpye`,
    # which would raise NameError whenever an SWF player was encountered.
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        assert False, 'Invalid player type %r' % player_type

    return res
def _parse_sig_js(self, jscode):
    """Build a Python callable out of the JS player's signature routine.

    NOTE(review): this snapshot is missing many interior lines (helper
    definitions, branch bodies, `return` statements); the surviving code
    is reproduced verbatim, gaps and all.
    """
    funcname = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')
        return string.lowercase.index(varname)

    def interpret_statement(stmt, local_vars, allow_recursion=20):
        if allow_recursion < 0:
            # NOTE(review): `ExctractorError` is misspelled — hitting the
            # recursion limit would raise NameError instead.
            raise ExctractorError(u'Recursion limit reached')
        if stmt.startswith(u'var '):
            stmt = stmt[len(u'var '):]
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
        if ass_m.groupdict().get('index'):
            lvar = local_vars[ass_m.group('out')]
            idx = interpret_expression(ass_m.group('index'),
                                       local_vars, allow_recursion)
            assert isinstance(idx, int)
            expr = ass_m.group('expr')
            local_vars[ass_m.group('out')] = val
            expr = ass_m.group('expr')
        elif stmt.startswith(u'return '):
            expr = stmt[len(u'return '):]
            raise ExtractorError(
                u'Cannot determine left side of statement in %r' % stmt)
        v = interpret_expression(expr, local_vars, allow_recursion)

    def interpret_expression(expr, local_vars, allow_recursion):
            return local_vars[expr]
        # Member access: <var>.<member>
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
        member = m.group('member')
        val = local_vars[m.group('in')]
        if member == 'split("")':
        if member == 'join("")':
        if member == 'length':
        if member == 'reverse()':
        slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
        idx = interpret_expression(
            slice_m.group('idx'), local_vars, allow_recursion-1)
        # Indexing: <var>[<idx>]
        m = re.match(
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
        val = local_vars[m.group('in')]
        idx = interpret_expression(m.group('idx'), local_vars,
        # Binary modulo expression: <a>%<b>
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
        a = interpret_expression(m.group('a'),
                                 local_vars, allow_recursion)
        b = interpret_expression(m.group('b'),
                                 local_vars, allow_recursion)
        # Function call: <func>(<args>), compiled lazily via extract_function.
        m = re.match(
            r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
        fname = m.group('func')
        if fname not in functions:
            functions[fname] = extract_function(fname)
        argvals = [int(v) if v.isdigit() else local_vars[v]
                   for v in m.group('args').split(',')]
        return functions[fname](argvals)
        raise ExtractorError(u'Unsupported JS expression %r' % expr)

    def extract_function(funcname):
        # Locate the JS function's argument list and body in the source.
        r'function ' + re.escape(funcname) +
        r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
        argnames = func_m.group('args').split(',')
        local_vars = dict(zip(argnames, args))
        for stmt in func_m.group('code').split(';'):
            res = interpret_statement(stmt, local_vars)

    initial_function = extract_function(funcname)
    # The deciphering entry point takes a single string argument.
    return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
    """Extract the signature 'decipher' routine from the SWF player.

    Parses the SWF container, then walks the embedded ABC (AVM2)
    bytecode looking for class SignatureDecipher, and interprets its
    methods as Python callables.

    NOTE(review): many interior lines are missing from this snapshot
    (loop bodies, `try`/`else` arms, several assignments); the surviving
    code is reproduced verbatim, gaps and all.
    """
    # --- SWF container header: b'CWS' = zlib-compressed, b'FWS' = plain ---
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
        raise NotImplementedError(u'Unsupported compression format %r' %

    def extract_tags(content):
        # Yield (tag_code, payload) pairs from the SWF tag stream.
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            # A short length of 0x3f means a 32-bit length field follows.
            tag_len = struct.unpack('<I', content[pos:pos+4])[0]
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])

    for tag_code, tag in extract_tags(content)
    p = code_tag.index(b'\0', 4) + 1

    # Parse ABC (AVM2 ByteCode)
    def read_int(data=None, pos=None):
        # Variable-length unsigned int, 7 payload bits per byte,
        # high bit = continuation. Works on streams and on (data, pos).
        if hasattr(data, 'read'):
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
        b = struct.unpack('<B', data[pos:pos+1])[0]
        res = res | ((b & 0x7f) << shift)

    # Self-tests for the varint reader.
    assert read_int(b'\x00', 0) == (0, 1)
    assert read_int(b'\x10', 0) == (16, 1)
    assert read_int(b'\x34', 0) == (0x34, 1)
    assert read_int(b'\xb4\x12', 0) == (0x12 * 0x80 + 0x34, 2)
    assert read_int(b'\xff\xff\xff\x00', 0) == (0x1fffff, 4)

    def u30(*args, **kwargs):
        # u30: a varint whose top bits must be clear.
        res = read_int(*args, **kwargs)
        if isinstance(res, tuple):
            assert res[0] & 0xf0000000 == 0
        assert res & 0xf0000000 == 0

    def s32(data=None, pos=None):
        # Signed 32-bit varint: fix up two's-complement negatives.
        v, pos = read_int(data, pos)
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)
    assert s32(b'\xff\xff\xff\xff\x0f', 0) == (-1, 5)

    return (code_tag[p:p+slen].decode('utf-8'), p + slen)

    def read_byte(data=None, pos=None):
        res = struct.unpack('<B', data[pos:pos+1])[0]
        return (res, pos + 1)

    # minor_version + major_version

    # --- constant pool: ints, uints, doubles, strings, namespaces ---
    for _c in range(1, int_count):
    uint_count, p = u30()
    for _c in range(1, uint_count):
    double_count, p = u30()
    p += (double_count-1) * 8
    string_count, p = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        constant_strings.append(s)
    namespace_count, p = u30()
    for _c in range(1, namespace_count):
    ns_set_count, p = u30()
    for _c in range(1, ns_set_count):
        for _c2 in range(count):
    multiname_count, p = u30()
    0x0e: 2, # MultinameA
    0x1b: 1, # MultinameL
    0x1c: 1, # MultinameLA
    for _c in range(1, multiname_count):
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        namespace_idx, p = u30()
        multinames.append(constant_strings[name_idx])
        multinames.append('[MULTINAME kind: %d]' % kind)
        for _c2 in range(MULTINAME_SIZES[kind]):

    # --- method signatures ---
    method_count, p = u30()
    MethodInfo = collections.namedtuple(
        ['NEED_ARGUMENTS', 'NEED_REST'])
    for method_id in range(method_count):
        param_count, p = u30()
        _, p = u30() # return type
        for _ in range(param_count):
            _, p = u30() # param type
        _, p = u30() # name index (always 0 for youtube)
        flags, p = read_byte()
        if flags & 0x08 != 0:
            option_count, p = u30()
            for c in range(option_count):
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                _, p = u30() # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # --- metadata (parsed only to advance the cursor) ---
    metadata_count, p = u30()
    for _c in range(metadata_count):
        item_count, p = u30()
        for _c2 in range(item_count):

    def parse_traits_info(pos=None):
        # Parse one trait entry; method/function traits are recorded
        # into `methods` keyed by their multiname.
        trait_name_idx, pos = u30(pos=pos)
        kind_full, pos = read_byte(pos=pos)
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        if kind in [0x00, 0x06]: # Slot or Const
            _, pos = u30(pos=pos) # Slot id
            type_name_idx, pos = u30(pos=pos)
            vindex, pos = u30(pos=pos)
            _, pos = read_byte(pos=pos) # vkind
        elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
            _, pos = u30(pos=pos) # disp_id
            method_idx, pos = u30(pos=pos)
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04: # Class
            _, pos = u30(pos=pos) # slot_id
            _, pos = u30(pos=pos) # classi
        elif kind == 0x05: # Function
            _, pos = u30(pos=pos) # slot_id
            function_idx, pos = u30(pos=pos)
            methods[function_idx] = multinames[trait_name_idx]
        raise ExtractorError(u'Unsupported trait kind %d' % kind)
        if attrs & 0x4 != 0: # Metadata present
            metadata_count, pos = u30(pos=pos)
            for _c3 in range(metadata_count):
                _, pos = u30(pos=pos)
        return (methods, pos)

    # --- classes: locate the SignatureDecipher class ---
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    class_count, p = u30()
    for class_id in range(class_count):
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        _, p = u30() # super_name idx
        flags, p = read_byte()
        if flags & 0x08 != 0: # Protected namespace is present
            protected_ns_idx, p = u30()
        intrf_count, p = u30()
        for _c2 in range(intrf_count):
        trait_count, p = u30()
        for _c2 in range(trait_count):
            _, p = parse_traits_info()
    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %

    for class_id in range(class_count):
        trait_count, p = u30()
        for _c2 in range(trait_count):
            trait_methods, p = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                method_idxs.update(dict(
                    for name, idx in trait_methods.items()))

    # --- scripts (parsed only to advance the cursor) ---
    script_count, p = u30()
    for _c in range(script_count):
        trait_count, p = u30()
        for _c2 in range(trait_count):
            _, p = parse_traits_info()

    # --- method bodies: keep only those of the target class ---
    method_body_count, p = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    for _c in range(method_body_count):
        method_idx, p = u30()
        local_count, p = u30()
        init_scope_depth, p = u30()
        max_scope_depth, p = u30()
        code_length, p = u30()
        if method_idx in method_idxs:
            m = Method(code_tag[p:p+code_length], local_count)
            methods[method_idxs[method_idx]] = m
        exception_count, p = u30()
        for _c2 in range(exception_count):
            _, p = u30() # target
            _, p = u30() # exc_type
            _, p = u30() # var_name
        trait_count, p = u30()
        for _c2 in range(trait_count):
            _, p = parse_traits_info()

    assert p == len(code_tag)
    assert len(methods) == len(method_idxs)

    method_pyfunctions = {}

    def extract_function(func_name):
        # Compile (memoized) one AVM2 method into a Python function by
        # interpreting its bytecode with a small stack machine.
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        registers = ['(this)'] + list(args) + [None] * m.local_count
        coder = io.BytesIO(m.code)
        opcode = struct.unpack('!B', coder.read(1))[0]
        if opcode == 36: # pushbyte
            v = struct.unpack('!B', coder.read(1))[0]
        elif opcode == 44: # pushstring
            stack.append(constant_strings[idx])
        elif opcode == 48: # pushscope
            # We don't implement the scope register, so we'll just
            # ignore the popped value
        elif opcode == 70: # callproperty
            mname = multinames[index]
            arg_count = u30(coder)
            args = list(reversed(
                [stack.pop() for _ in range(arg_count)]))
            if mname == u'split':
                assert len(args) == 1
                assert isinstance(args[0], compat_str)
                assert isinstance(obj, compat_str)
                res = obj.split(args[0])
            elif mname == u'slice':
                assert len(args) == 1
                assert isinstance(args[0], int)
                assert isinstance(obj, list)
            elif mname == u'join':
                assert len(args) == 1
                assert isinstance(args[0], compat_str)
                assert isinstance(obj, list)
                res = args[0].join(obj)
            elif mname in method_pyfunctions:
                stack.append(method_pyfunctions[mname](args))
            raise NotImplementedError(
                u'Unsupported property %r on %r'
        elif opcode == 72: # returnvalue
        elif opcode == 79: # callpropvoid
            mname = multinames[index]
            arg_count = u30(coder)
            args = list(reversed(
                [stack.pop() for _ in range(arg_count)]))
            if mname == u'reverse':
                assert isinstance(obj, list)
            raise NotImplementedError(
                u'Unsupported (void) property %r on %r'
        elif opcode == 93: # findpropstrict
            mname = multinames[index]
            res = extract_function(mname)
        elif opcode == 97: # setproperty
            assert isinstance(obj, list)
            assert isinstance(idx, int)
        elif opcode == 98: # getlocal
            stack.append(registers[index])
        elif opcode == 99: # setlocal
            registers[index] = value
        elif opcode == 102: # getproperty
            pname = multinames[index]
            if pname == u'length':
                assert isinstance(obj, list)
                stack.append(len(obj))
            else: # Assume attribute access
                assert isinstance(idx, int)
                assert isinstance(obj, list)
                stack.append(obj[idx])
        elif opcode == 128: # coerce
        elif opcode == 133: # coerce_s
            assert isinstance(stack[-1], (type(None), compat_str))
        elif opcode == 164: # modulo
            res = value1 % value2
        elif opcode == 208: # getlocal_0
            stack.append(registers[0])
        elif opcode == 209: # getlocal_1
            stack.append(registers[1])
        elif opcode == 210: # getlocal_2
            stack.append(registers[2])
        elif opcode == 211: # getlocal_3
            stack.append(registers[3])
        elif opcode == 214: # setlocal_2
            registers[2] = stack.pop()
        elif opcode == 215: # setlocal_3
            registers[3] = stack.pop()
        raise NotImplementedError(
            u'Unsupported opcode %d' % opcode)
        method_pyfunctions[func_name] = resfunc

    initial_function = extract_function(u'decipher')
    # The deciphering entry point takes a single string argument.
    return lambda s: initial_function([s])
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature"""
    # Preferred path: derive the decipher function from the actual player
    # and memoize it per player_url in self._player_cache.
    if player_url is not None:
        # NOTE(review): the `try:` paired with the `except` below is
        # missing from this snapshot — verify against upstream.
        if player_url not in self._player_cache:
            func = self._extract_signature_function(
                video_id, player_url
            self._player_cache[player_url] = func
        return self._player_cache[player_url](s)
        except Exception as e:
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Automatic signature extraction failed: ' + tb)

    # Last resort: the hard-coded per-length permutations below.
    self._downloader.report_warning(
        u'Warning: Falling back to static signature algorithm')
    return self._static_decrypt_signature(s)
def _static_decrypt_signature(self, s):
    """Hard-coded fallback: permute *s* according to its length.

    NOTE(review): the `if len(s) == NN:` guard lines that select among
    the return statements below are missing from this snapshot.
    """
    # The videos with age protection use another player, so the
    # algorithms can be different.
    return s[2:63] + s[82] + s[64:82] + s[63]
    return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
    return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
    return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
    return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
    return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
    return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
    return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
    return s[81:36:-1] + s[0] + s[35:2:-1]
    return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
    return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
    return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
    return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
    return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

    raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
def _decrypt_signature_age_gate(self, s):
    # The videos with age protection use another player, so the algorithms
    return s[2:63] + s[82] + s[64:82] + s[63]
    # Fallback to the other algorithms
    # NOTE(review): _decrypt_signature is declared with (s, video_id,
    # player_url, age_gate=False) — this call passes only `s` and would
    # raise TypeError if reached; verify against upstream.
    return self._decrypt_signature(s)
def _get_available_subtitles(self, video_id):
    """Return a {lang: timedtext-url} map of manual subtitles.

    NOTE(review): the `try:` for the `except` below and parts of the
    per-language loop are missing from this snapshot.
    """
    sub_list = self._download_webpage(
        'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
        video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
    # Each entry in the list page carries a track name and a lang_code.
    lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    params = compat_urllib_parse.urlencode({
        'fmt': self._downloader.params.get('subtitlesformat'),
    url = u'http://www.youtube.com/api/timedtext?' + params
    sub_lang_list[lang] = url
    if not sub_lang_list:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
    return sub_lang_list
def _get_available_automatic_caption(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process."""
    sub_format = self._downloader.params.get('subtitlesformat')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    # The caption base url lives inside the inline ytplayer.config JSON.
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    self._downloader.report_warning(err_msg)
    player_config = json.loads(mobj.group(1))
    args = player_config[u'args']
    caption_url = args[u'ttsurl']
    timestamp = args[u'timestamp']
    # We get the available subtitles
    list_params = compat_urllib_parse.urlencode({
    list_url = caption_url + '&' + list_params
    list_page = self._download_webpage(list_url, video_id)
    caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
    # 'asr' (automatic speech recognition) marks the auto-caption track.
    original_lang_node = caption_list.find('track')
    if original_lang_node.attrib.get('kind') != 'asr' :
        self._downloader.report_warning(u'Video doesn\'t have automatic captions')
    original_lang = original_lang_node.attrib['lang_code']
    # Build one translated-caption URL per available target language.
    for lang_node in caption_list.findall('target'):
        sub_lang = lang_node.attrib['lang_code']
        params = compat_urllib_parse.urlencode({
            'lang': original_lang,
        sub_lang_list[sub_lang] = caption_url + '&' + params
    return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
def _print_formats(self, formats):
    """Print itag, container and dimensions for each available format."""
    print('Available formats:')
    # NOTE(review): the `for x in formats:` line is missing from this
    # snapshot; `x` below is otherwise unbound.
    print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                self._video_dimensions.get(x, '???'),
                                ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
def _extract_id(self, url):
    """Return the 11-character video id matched by _VALID_URL group 2."""
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    # NOTE(review): the `if mobj is None:` guard and the trailing
    # `return video_id` are missing from this snapshot.
    raise ExtractorError(u'Invalid URL: %s' % url)
    video_id = mobj.group(2)
def _get_video_url_list(self, url_map):
    Transform a dictionary in the format {itag:url} to a list of (itag, url)
    with the requested formats.
    req_format = self._downloader.params.get('format', None)
    format_limit = self._downloader.params.get('format_limit', None)
    # Quality ordering depends on whether free containers are preferred.
    available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
    if format_limit is not None and format_limit in available_formats:
        format_list = available_formats[available_formats.index(format_limit):]
        format_list = available_formats
    existing_formats = [x for x in format_list if x in url_map]
    if len(existing_formats) == 0:
        raise ExtractorError(u'no known formats available for video')
    if self._downloader.params.get('listformats', None):
        self._print_formats(existing_formats)
    if req_format is None or req_format == 'best':
        video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
    elif req_format == 'worst':
        video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
    elif req_format in ('-1', 'all'):
        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        # Specific formats. We pick the first in a slash-delimited sequence.
        # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
        # available in the specified format. For example,
        # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
        # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
        # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
        req_formats = req_format.split('/')
        video_url_list = None
        for rf in req_formats:
            video_url_list = [(rf, url_map[rf])]
            if rf in self._video_formats_map:
                for srf in self._video_formats_map[rf]:
                    video_url_list = [(srf, url_map[srf])]
        if video_url_list is None:
            raise ExtractorError(u'requested format not available')
    return video_url_list
def _extract_from_m3u8(self, manifest_url, video_id):
    # Build an {itag: url} map out of an HLS (m3u8) master manifest.
    def _get_urls(_manifest):
        # Non-comment, non-empty lines of an m3u8 playlist are media URLs.
        lines = _manifest.split('\n')
        urls = filter(lambda l: l and not l.startswith('#'),
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    formats_urls = _get_urls(manifest)
    for format_url in formats_urls:
        # The itag is embedded in the variant URL path, e.g. .../itag/22/...
        itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
        url_map[itag] = format_url
1213 def _real_extract(self, url):
1214 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1215 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1217 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1218 mobj = re.search(self._NEXT_URL_RE, url)
1220 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1221 video_id = self._extract_id(url)
1224 self.report_video_webpage_download(video_id)
1225 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1226 request = compat_urllib_request.Request(url)
1228 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1229 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1230 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1232 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1234 # Attempt to extract SWF player URL
1235 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1236 if mobj is not None:
1237 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1242 self.report_video_info_webpage_download(video_id)
1243 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1244 self.report_age_confirmation()
1246 # We simulate the access to the video from www.youtube.com/v/{video_id}
1247 # this can be viewed without login into Youtube
1248 data = compat_urllib_parse.urlencode({'video_id': video_id,
1252 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1256 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1257 video_info_webpage = self._download_webpage(video_info_url, video_id,
1259 errnote='unable to download video info webpage')
1260 video_info = compat_parse_qs(video_info_webpage)
1263 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1264 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1265 % (video_id, el_type))
1266 video_info_webpage = self._download_webpage(video_info_url, video_id,
1268 errnote='unable to download video info webpage')
1269 video_info = compat_parse_qs(video_info_webpage)
1270 if 'token' in video_info:
1272 if 'token' not in video_info:
1273 if 'reason' in video_info:
1274 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1276 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1278 # Check for "rental" videos
1279 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1280 raise ExtractorError(u'"rental" videos not supported')
1282 # Start extracting information
1283 self.report_information_extraction(video_id)
1286 if 'author' not in video_info:
1287 raise ExtractorError(u'Unable to extract uploader name')
1288 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1291 video_uploader_id = None
1292 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1293 if mobj is not None:
1294 video_uploader_id = mobj.group(1)
1296 self._downloader.report_warning(u'unable to extract uploader nickname')
1299 if 'title' not in video_info:
1300 raise ExtractorError(u'Unable to extract video title')
1301 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1304 # We try first to get a high quality image:
1305 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1306 video_webpage, re.DOTALL)
1307 if m_thumb is not None:
1308 video_thumbnail = m_thumb.group(1)
1309 elif 'thumbnail_url' not in video_info:
1310 self._downloader.report_warning(u'unable to extract video thumbnail')
1311 video_thumbnail = ''
1312 else: # don't panic if we can't find it
1313 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1317 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1318 if mobj is not None:
1319 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1320 upload_date = unified_strdate(upload_date)
1323 video_description = get_element_by_id("eow-description", video_webpage)
1324 if video_description:
1325 video_description = clean_html(video_description)
1327 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1329 video_description = unescapeHTML(fd_mobj.group(1))
1331 video_description = u''
1334 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1336 if self._downloader.params.get('listsubtitles', False):
1337 self._list_available_subtitles(video_id, video_webpage)
1340 if 'length_seconds' not in video_info:
1341 self._downloader.report_warning(u'unable to extract video duration')
1344 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1346 # Decide which formats to download
1349 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1351 raise ValueError('Could not find vevo ID')
1352 info = json.loads(mobj.group(1))
1354 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1355 # this signatures are encrypted
1356 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1358 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1359 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1360 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1362 if 'url_encoded_fmt_stream_map' in video_info:
1363 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1365 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1366 elif 'adaptive_fmts' in video_info:
1367 if 'url_encoded_fmt_stream_map' in video_info:
1368 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1370 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1374 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1375 self.report_rtmp_download()
1376 video_url_list = [(None, video_info['conn'][0])]
1377 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1378 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1379 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1381 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1382 url_data = compat_parse_qs(url_data_str)
1383 if 'itag' in url_data and 'url' in url_data:
1384 url = url_data['url'][0]
1385 if 'sig' in url_data:
1386 url += '&signature=' + url_data['sig'][0]
1387 elif 's' in url_data:
1388 encrypted_sig = url_data['s'][0]
1389 if self._downloader.params.get('verbose'):
1391 player_version = self._search_regex(
1393 player_url if player_url else None,
1394 'flash player', fatal=False)
1395 player_desc = 'flash player %s' % player_version
1397 player_version = self._search_regex(
1398 r'html5player-(.+?)\.js', video_webpage,
1399 'html5 player', fatal=False)
1400 player_desc = u'html5 player %s' % player_version
1402 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1403 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1404 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1407 jsplayer_url_json = self._search_regex(
1408 r'"assets":.+?"js":\s*("[^"]+")',
1409 video_webpage, u'JS player URL')
1410 player_url = json.loads(jsplayer_url_json)
1412 signature = self._decrypt_signature(
1413 encrypted_sig, video_id, player_url, age_gate)
1414 url += '&signature=' + signature
1415 if 'ratebypass' not in url:
1416 url += '&ratebypass=yes'
1417 url_map[url_data['itag'][0]] = url
1418 video_url_list = self._get_video_url_list(url_map)
1419 if not video_url_list:
1421 elif video_info.get('hlsvp'):
1422 manifest_url = video_info['hlsvp'][0]
1423 url_map = self._extract_from_m3u8(manifest_url, video_id)
1424 video_url_list = self._get_video_url_list(url_map)
1425 if not video_url_list:
1429 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1432 for format_param, video_real_url in video_url_list:
1434 video_extension = self._video_extensions.get(format_param, 'flv')
1436 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1437 self._video_dimensions.get(format_param, '???'),
1438 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1442 'url': video_real_url,
1443 'uploader': video_uploader,
1444 'uploader_id': video_uploader_id,
1445 'upload_date': upload_date,
1446 'title': video_title,
1447 'ext': video_extension,
1448 'format': video_format,
1449 'thumbnail': video_thumbnail,
1450 'description': video_description,
1451 'player_url': player_url,
1452 'subtitles': video_subtitles,
1453 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract all entries of a YouTube playlist via the GData API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |    p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # The pattern above is written in verbose mode, so the flag is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id: either capture group may have matched,
        # depending on which URL form was used.
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # Page through the playlist feed, collecting (position, watch-url) pairs.
        videos = []
        page_num = 0
        while True:
            page_num += 1
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The GData API refuses start indices beyond this point.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'],
                    ))

        # Order by playlist position, then keep only the watch URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos listed on a YouTube channel page."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, first-seen order, deduplicated."""
        ids_in_page = []
        # A set gives O(1) membership tests; scanning the list on every hit
        # made this quadratic in the number of links on the page.
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download the first channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The widget html tells us whether another ajax page follows.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Hand every id over to the regular Youtube extractor.
        # (renamed loop variable: `id` shadowed the builtin)
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a YouTube user via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Our regex is deliberately permissive, so defer to every other
        # youtube extractor that claims the URL before accepting it here.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The Data API caps every response (currently at 50 entries), so we
        # request consecutive slices until a short page signals the end.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Each entry's id URI ends in the plain video id.
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A page shorter than _GDATA_PAGE_SIZE must be the last one:
            # there are no more ids on further pages, no need to query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle "ytsearchN:query" searches through the GData jsonc API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Each API page holds at most 50 results; keep fetching until we have
        # enough or the API reports fewer total items than requested.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # idiomatic membership test (was: `not 'items' in ...`)
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            # list comprehension instead of list(generator); also avoids
            # shadowing the `id` builtin below.
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Resolve a YouTube show page into one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in m_seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # feed pages advance in steps of this size; subclasses may override
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with a single %s placeholder left for the paging offset.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds require an authenticated session.
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            # renamed generator variable: `id` shadowed the builtin
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # name of the feed_ajax feed queried by the base class
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # name of the feed_ajax feed queried by the base class
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's Watch Later list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    # name of the feed_ajax feed queried by the base class
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch Later is per-user, so use the personal-feed ajax action
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of an ordinary playlist:
        # extract that id and delegate to the playlist extractor.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')