15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
22 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # NOTE(review): this excerpt is elided — several structural lines
    # ('try:' headers, 'def _login', dict literals, if-bodies) are missing
    # from view; only the visible lines are reproduced.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        # Best-effort request so later pages come back in English; a failure
        # only produces a warning.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the enclosing 'try:' is elided in this excerpt.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # NOTE(review): the 'def _login(self):' header is elided; the lines
        # below belong to the login helper.
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            # Extractors that require login fail loudly when no credentials exist.
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Google's login form embeds anti-forgery tokens (GALX / dsh) that
        # must be echoed back in the POST.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Visible fields of the login form dict (delimiters elided).
        u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _confirm_age(self):
        # POST the age-verification form; unlike _set_language, failure here
        # is fatal (raises).
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_initialize(self):
        # Standard InfoExtractor hook: set language, then log in if possible
        # (bodies of the guards are elided in this excerpt).
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    # Extractor for single YouTube videos; inherits login/age helpers and
    # subtitle support from its bases.
    # NOTE(review): this excerpt is elided — parts of _VALID_URL, the itag
    # tables, _video_extensions/_video_dimensions and the _TESTS fixtures
    # are not visible; only the visible lines are reproduced.
    IE_DESC = u'YouTube.com'
    # Verbose regex covering the many URL shapes a video ID can appear in.
        (?:https?://)?                                       # http(s):// (optional)
        (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/|
           youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
         (?:                                                  # the various things that can precede the ID:
             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
             |(?:                                             # or the v= param in all its forms
                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
         |youtu\.be/                                          # just youtu.be/xxxx
         )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]{11})                                   # here is it! the YouTube video ID
        (?(1).+)?                                             # if we found the ID, everything can follow
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, but free (WebM) containers ranked above proprietary ones.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                     # Apple HTTP Live Streaming
                                     '96', '95', '94', '93', '92', '132', '151',
                                     '85', '102', '84', '101', '83', '100', '82',
                                     '138', '248', '137', '247', '136', '246', '245',
                                     '244', '135', '243', '134', '242', '133', '160',
                                     '172', '141', '171', '140', '139',
    # Container name -> itags in that container, best quality first.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    _video_extensions = {
        # Apple HTTP Live Streaming
    _video_dimensions = {
    # Visible fragments of the _TESTS fixture list (delimiters elided).
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
        u"file": u"1ltcDfZMA3U.flv",
        u"note": u"Test VEVO video (#897)",
        u"upload_date": u"20070518",
        u"title": u"Maps - It Will Find You",
        u"description": u"Music video by Maps performing It Will Find You.",
        u"uploader": u"MuteUSA",
        u"uploader_id": u"MuteUSA"
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
        u'file': u'TGi3HqYrWHE.mp4',
        u'note': u'm3u8 video',
        u'title': u'Triathlon - Men - London 2012 Olympic Games',
        u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
        u'uploader': u'olympic',
        u'upload_date': u'20120807',
        u'uploader_id': u'olympic',
        u'skip_download': True,
396 def suitable(cls, url):
397 """Receives a URL and returns True if suitable for this IE."""
398 if YoutubePlaylistIE.suitable(url): return False
399 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Cache of player URL -> signature-decryption callable, so each
        # player binary is downloaded and parsed at most once per run.
        self._jsplayer_cache = {}
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
425 def _extract_signature_function(self, video_id, player_url):
426 id_m = re.match(r'.*-(?P<id>[^.]+)\.(?P<ext>[^.]+)$', player_url)
427 player_type = id_m.group('ext')
428 player_id = id_m.group('id')
430 if player_type == 'js':
431 code = self._download_webpage(
432 player_url, video_id,
433 note=u'Downloading %s player %s' % (player_type, jsplayer_id),
434 errnote=u'Download of %s failed' % player_url)
435 return self._parse_sig_js(code)
436 elif player_tpye == 'swf':
437 urlh = self._request_webpage(
438 player_url, video_id,
439 note=u'Downloading %s player %s' % (player_type, jsplayer_id),
440 errnote=u'Download of %s failed' % player_url)
442 return self._parse_sig_swf(code)
444 assert False, 'Invalid player type %r' % player_type
    def _parse_sig_js(self, jscode):
        # Locate the name of the JS function that computes the signature,
        # then build a Python callable that mimics it (helpers follow).
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')
        # NOTE(review): excerpt elided — the helper that maps a one-letter
        # variable name to an argument slot is missing; only its return
        # line is visible.
            return string.lowercase.index(varname)
456 def interpret_statement(stmt, local_vars, allow_recursion=20):
457 if allow_recursion < 0:
458 raise ExctractorError(u'Recursion limit reached')
460 if stmt.startswith(u'var '):
461 stmt = stmt[len(u'var '):]
462 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
463 r'=(?P<expr>.*)$', stmt)
465 if ass_m.groupdict().get('index'):
467 lvar = local_vars[ass_m.group('out')]
468 idx = interpret_expression(ass_m.group('index'),
469 local_vars, allow_recursion)
470 assert isinstance(idx, int)
473 expr = ass_m.group('expr')
476 local_vars[ass_m.group('out')] = val
478 expr = ass_m.group('expr')
479 elif stmt.startswith(u'return '):
481 expr = stmt[len(u'return '):]
483 raise ExtractorError(
484 u'Cannot determine left side of statement in %r' % stmt)
486 v = interpret_expression(expr, local_vars, allow_recursion)
        def interpret_expression(expr, local_vars, allow_recursion):
            # Evaluate a JS expression: a bare variable, member access
            # (split/join/length/reverse/slice), indexing, the % operator,
            # or a call to another extracted function.
            # NOTE(review): excerpt elided — the match guards and several
            # return statements between the cases are not visible.
                return local_vars[expr]

            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                if member == 'join("")':
                if member == 'length':
                if member == 'reverse()':
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)

                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,

            # Binary modulo: a % b
            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)

            # Function call f(arg1,arg2,...); functions are extracted lazily.
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Locate the named function body in the JS source and wrap it in
            # a Python callable that interprets its ';'-separated statements.
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            argnames = func_m.group('args').split(',')

                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)

        initial_function = extract_function(funcname)
        # The deciphered signature of s is initial_function([s]).
        return lambda s: initial_function([s])
    def _parse_sig_swf(self, file_contents):
        # Parse the Flash player binary and derive the signature-decryption
        # function from its ActionScript (ABC) bytecode.
        # NOTE(review): this method is heavily elided in this excerpt; only
        # the visible lines are reproduced throughout.
        if file_contents[1:3] != b'WS':
            # SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed);
            # bytes 1:3 are 'WS' in both.
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # 'C' prefix: body after the 8-byte header is zlib-compressed.
            content = zlib.decompress(file_contents[8:])
            raise NotImplementedError(u'Unsupported compression format %r' %

        def extract_tags(content):
            # Yield (tag_code, payload) pairs of the SWF tag stream.
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                # Long-form tag: length 0x3f is a sentinel, real u32 follows.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])

            for tag_code, tag in extract_tags(content)
        # Skip the name (NUL-terminated) after the 4-byte prefix.
        p = code_tag.index(b'\0', 4) + 1

        # Parse ABC (AVM2 ByteCode)
        def read_int(data=None, pos=None):
            # Variable-length integer: 7 payload bits per byte, high bit set
            # means another byte follows.  Works on a stream or (data, pos).
            if hasattr(data, 'read'):
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)
                b = struct.unpack('<B', data[pos:pos+1])[0]
                res = res | ((b & 0x7f) << shift)

        # Self-tests for the varint reader.
        assert read_int(b'\x00', 0) == (0, 1)
        assert read_int(b'\x10', 0) == (16, 1)
        assert read_int(b'\x34', 0) == (0x34, 1)
        assert read_int(b'\xb4\x12', 0) == (0x12 * 0x80 + 0x34, 2)
        assert read_int(b'\xff\xff\xff\x00', 0) == (0x1fffff, 4)
        def u30(*args, **kwargs):
            # u30: like read_int, but the top bits beyond 30 must be clear.
            res = read_int(*args, **kwargs)
            if isinstance(res, tuple):
                assert res[0] & 0xf0000000 == 0
            assert res & 0xf0000000 == 0

        def s32(data=None, pos=None):
            # Signed 32-bit varint: two's-complement of the unsigned reading.
            v, pos = read_int(data, pos)
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)
        assert s32(b'\xff\xff\xff\xff\x0f', 0) == (-1, 5)

            # (string reader fragment) decode slen UTF-8 bytes at p.
            return (code_tag[p:p+slen].decode('utf-8'), p + slen)

        def read_byte(data=None, pos=None):
            # Single unsigned byte at pos.
            res = struct.unpack('<B', data[pos:pos+1])[0]
            return (res, pos + 1)

        # minor_version + major_version
        # Constant pool: counts are u30 and entry 0 is implicit, hence the
        # range(1, count) loops below.
        for _c in range(1, int_count):
        uint_count, p = u30()
        for _c in range(1, uint_count):
        double_count, p = u30()
        p += (double_count-1) * 8
        string_count, p = u30()
        constant_strings = [u'']
        for _c in range(1, string_count):
            constant_strings.append(s)
        namespace_count, p = u30()
        for _c in range(1, namespace_count):
        ns_set_count, p = u30()
        for _c in range(1, ns_set_count):
            for _c2 in range(count):
        multiname_count, p = u30()
        # Number of extra u30 fields per multiname kind (fragment).
        0x0e: 2, # MultinameA
        0x1b: 1, # MultinameL
        0x1c: 1, # MultinameLA
        for _c in range(1, multiname_count):
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
                namespace_idx, p = u30()
                multinames.append(constant_strings[name_idx])
                # Non-QName kinds keep a placeholder; only names matter here.
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):

        # Method signatures: we only need the flags, everything else is
        # skipped field by field.
        method_count, p = u30()
        MethodInfo = collections.namedtuple(
            ['NEED_ARGUMENTS', 'NEED_REST'])
        for method_id in range(method_count):
            param_count, p = u30()
            _, p = u30() # return type
            for _ in range(param_count):
                _, p = u30() # param type
            _, p = u30() # name index (always 0 for youtube)
            flags, p = read_byte()
            if flags & 0x08 != 0:
                # HAS_OPTIONAL: skip the default-value table.
                option_count, p = u30()
                for c in range(option_count):
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    _, p = u30() # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata: skipped entirely.
        metadata_count, p = u30()
        for _c in range(metadata_count):
            item_count, p = u30()
            for _c2 in range(item_count):
        def parse_traits_info(pos=None):
            # Parse one trait entry, recording Method/Getter/Setter and
            # Function traits into the enclosing 'methods' mapping.
            trait_name_idx, pos = u30(pos=pos)
            kind_full, pos = read_byte(pos=pos)
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            if kind in [0x00, 0x06]: # Slot or Const
                _, pos = u30(pos=pos) # Slot id
                type_name_idx, pos = u30(pos=pos)
                vindex, pos = u30(pos=pos)
                    _, pos = read_byte(pos=pos) # vkind
            elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
                _, pos = u30(pos=pos) # disp_id
                method_idx, pos = u30(pos=pos)
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04: # Class
                _, pos = u30(pos=pos) # slot_id
                _, pos = u30(pos=pos) # classi
            elif kind == 0x05: # Function
                _, pos = u30(pos=pos) # slot_id
                function_idx, pos = u30(pos=pos)
                # NOTE(review): key/value order is inverted relative to the
                # Method branch above (maps idx -> name) — verify intent.
                methods[function_idx] = multinames[trait_name_idx]
                raise ExtractorError(u'Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0: # Metadata present
                metadata_count, pos = u30(pos=pos)
                for _c3 in range(metadata_count):
                    _, pos = u30(pos=pos)

            return (methods, pos)
        # Locate the class that carries the signature-decryption code.
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        class_count, p = u30()
        for class_id in range(class_count):
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            _, p = u30() # super_name idx
            flags, p = read_byte()
            if flags & 0x08 != 0: # Protected namespace is present
                protected_ns_idx, p = u30()
            intrf_count, p = u30()
            for _c2 in range(intrf_count):
            trait_count, p = u30()
            for _c2 in range(trait_count):
                _, p = parse_traits_info()

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %

        # Second pass: collect the target class's method names and indexes.
        for class_id in range(class_count):
            trait_count, p = u30()
            for _c2 in range(trait_count):
                trait_methods, p = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        for name, idx in trait_methods.items()))

        # Scripts: skipped, only their traits are walked for position.
        script_count, p = u30()
        for _c in range(script_count):
            trait_count, p = u30()
            for _c2 in range(trait_count):
                _, p = parse_traits_info()

        # Method bodies: keep only bytecode belonging to the target class.
        method_body_count, p = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            method_idx, p = u30()
            local_count, p = u30()
            init_scope_depth, p = u30()
            max_scope_depth, p = u30()
            code_length, p = u30()
            if method_idx in method_idxs:
                m = Method(code_tag[p:p+code_length], local_count)
                methods[method_idxs[method_idx]] = m
            exception_count, p = u30()
            for _c2 in range(exception_count):
                _, p = u30() # target
                _, p = u30() # exc_type
                _, p = u30() # var_name
            trait_count, p = u30()
            for _c2 in range(trait_count):
                _, p = parse_traits_info()

        # The whole code tag must have been consumed exactly.
        assert p == len(code_tag)
        assert len(methods) == len(method_idxs)
        method_pyfunctions = {}

        def extract_function(func_name):
            # Translate the named ABC method into a Python callable by
            # interpreting its AVM2 bytecode; results are memoized in
            # method_pyfunctions.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

                # Interpreter body (the 'def resfunc(args):' header and the
                # opcode-read loop header are elided in this excerpt).
                registers = ['(this)'] + list(args) + [None] * m.local_count
                coder = io.BytesIO(m.code)
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36: # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                    elif opcode == 44: # pushstring
                        stack.append(constant_strings[idx])
                    elif opcode == 48: # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                    elif opcode == 70: # callproperty
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        # Only the handful of String/Array methods the
                        # decipher routine uses are implemented.
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            res = obj.split(args[0])
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                    elif opcode == 72: # returnvalue
                    elif opcode == 79: # callpropvoid
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                    elif opcode == 93: # findpropstrict
                        # Calls to sibling methods are extracted recursively.
                        mname = multinames[index]
                        res = extract_function(mname)
                    elif opcode == 97: # setproperty
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                    elif opcode == 98: # getlocal
                        stack.append(registers[index])
                    elif opcode == 99: # setlocal
                        registers[index] = value
                    elif opcode == 102: # getproperty
                        pname = multinames[index]
                        if pname == u'length':
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else: # Assume attribute access
                            assert isinstance(idx, int)
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128: # coerce
                    elif opcode == 133: # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164: # modulo
                        res = value1 % value2
                    elif opcode == 208: # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209: # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210: # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211: # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214: # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215: # setlocal_3
                        registers[3] = stack.pop()
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc

        # The SWF's entry point for signatures is the 'decipher' method.
        initial_function = extract_function(u'decipher')
        return lambda s: initial_function([s])
    def _decrypt_signature(self, s, video_id, jsplayer_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""
        if jsplayer_url is not None:
            # Preferred path: extract the real algorithm from the current
            # player, cached per player URL.
            # NOTE(review): the enclosing 'try:' is elided in this excerpt.
            if jsplayer_url not in self._jsplayer_cache:
                self._jsplayer_cache[jsplayer_url] = self._extract_signature_function(
                    video_id, jsplayer_url
                return self._jsplayer_cache[jsplayer_url]([s])
            except Exception as e:
                tb = traceback.format_exc()
                self._downloader.report_warning(u'Automatic signature extraction failed: ' + tb)

            self._downloader.report_warning(u'Warning: Falling back to static signature algorithm')

        # The videos with age protection use another player, so the
        # algorithms can be different.
            return s[2:63] + s[82] + s[64:82] + s[63]

        # Static per-length fallbacks; the 'elif len(s) == N:' headers are
        # elided in this excerpt.
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[81:36:-1] + s[0] + s[35:2:-1]
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        # No static algorithm known for this signature length.
        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _decrypt_signature_age_gate(self, s):
        # The videos with age protection use another player, so the algorithms
        # can be different.
            return s[2:63] + s[82] + s[64:82] + s[63]
        # Fallback to the other algorithms
        # NOTE(review): this call passes only 's', but _decrypt_signature
        # (as declared above) also requires video_id and jsplayer_url —
        # this would raise TypeError if reached; verify against the current
        # signature of _decrypt_signature.
        return self._decrypt_signature(s)
    def _get_available_subtitles(self, video_id):
        # Query the caption-track list and build {language: timedtext url}.
        # NOTE(review): excerpt elided — the 'try:' and parts of the loop
        # that builds sub_lang_list are not visible.
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            # Non-fatal: warn and report no subtitles.
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
        return sub_lang_list
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption endpoint and timestamp live in the embedded player
        # config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        # NOTE(review): excerpt elided — the guard around this warning and
        # the 'try:' header are not visible.
            self._downloader.report_warning(err_msg)
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            # Only ASR ('automatic speech recognition') tracks count here.
            original_lang_node = caption_list.find('track')
            if original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']
            # One translated caption URL per available target language.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _print_formats(self, formats):
        # Human-readable itag listing used by --list-formats.
        print('Available formats:')
        # NOTE(review): the 'for x in formats:' header is elided in this view.
        print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                    self._video_dimensions.get(x, '???'),
                                    ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
    def _extract_id(self, url):
        # Pull the 11-character video ID (group 2 of _VALID_URL) out of any
        # supported URL shape.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the 'if mobj is None:' guard is elided in this view.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        """
        # NOTE(review): excerpt elided — several 'else:'/'if rf in url_map:'
        # lines in the selection logic below are not visible.
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Quality ordering depends on the prefer_free_formats option.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Cap the list at the requested quality ceiling.
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimited sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if rf in self._video_formats_map:
                    # Container name: take the best available itag in it.
                    for srf in self._video_formats_map[rf]:
                            video_url_list = [(srf, url_map[srf])]
        if video_url_list is None:
            raise ExtractorError(u'requested format not available')
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        # Build {itag: stream url} from an HLS master manifest; the itag is
        # embedded in each variant URL's path.
        def _get_urls(_manifest):
            # Variant URLs are the non-comment, non-empty lines.
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
1201 def _real_extract(self, url):
1202 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1203 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1205 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1206 mobj = re.search(self._NEXT_URL_RE, url)
1208 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1209 video_id = self._extract_id(url)
1212 self.report_video_webpage_download(video_id)
1213 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1214 request = compat_urllib_request.Request(url)
1216 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1217 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1218 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1220 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1222 # Attempt to extract SWF player URL
1223 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1224 if mobj is not None:
1225 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1230 self.report_video_info_webpage_download(video_id)
1231 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1232 self.report_age_confirmation()
1234 # We simulate the access to the video from www.youtube.com/v/{video_id}
1235 # this can be viewed without login into Youtube
1236 data = compat_urllib_parse.urlencode({'video_id': video_id,
1240 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1244 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1245 video_info_webpage = self._download_webpage(video_info_url, video_id,
1247 errnote='unable to download video info webpage')
1248 video_info = compat_parse_qs(video_info_webpage)
1251 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1252 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1253 % (video_id, el_type))
1254 video_info_webpage = self._download_webpage(video_info_url, video_id,
1256 errnote='unable to download video info webpage')
1257 video_info = compat_parse_qs(video_info_webpage)
1258 if 'token' in video_info:
1260 if 'token' not in video_info:
1261 if 'reason' in video_info:
1262 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1264 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1266 # Check for "rental" videos
1267 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1268 raise ExtractorError(u'"rental" videos not supported')
1270 # Start extracting information
1271 self.report_information_extraction(video_id)
1274 if 'author' not in video_info:
1275 raise ExtractorError(u'Unable to extract uploader name')
1276 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1279 video_uploader_id = None
1280 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1281 if mobj is not None:
1282 video_uploader_id = mobj.group(1)
1284 self._downloader.report_warning(u'unable to extract uploader nickname')
1287 if 'title' not in video_info:
1288 raise ExtractorError(u'Unable to extract video title')
1289 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1292 # We try first to get a high quality image:
1293 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1294 video_webpage, re.DOTALL)
1295 if m_thumb is not None:
1296 video_thumbnail = m_thumb.group(1)
1297 elif 'thumbnail_url' not in video_info:
1298 self._downloader.report_warning(u'unable to extract video thumbnail')
1299 video_thumbnail = ''
1300 else: # don't panic if we can't find it
1301 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1305 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1306 if mobj is not None:
1307 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1308 upload_date = unified_strdate(upload_date)
1311 video_description = get_element_by_id("eow-description", video_webpage)
1312 if video_description:
1313 video_description = clean_html(video_description)
1315 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1317 video_description = unescapeHTML(fd_mobj.group(1))
1319 video_description = u''
1322 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1324 if self._downloader.params.get('listsubtitles', False):
1325 self._list_available_subtitles(video_id, video_webpage)
1328 if 'length_seconds' not in video_info:
1329 self._downloader.report_warning(u'unable to extract video duration')
1332 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1334 # Decide which formats to download
1337 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1339 raise ValueError('Could not find vevo ID')
1340 info = json.loads(mobj.group(1))
1342 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1343 # this signatures are encrypted
1344 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1346 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1347 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1348 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1350 if 'url_encoded_fmt_stream_map' in video_info:
1351 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1353 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1354 elif 'adaptive_fmts' in video_info:
1355 if 'url_encoded_fmt_stream_map' in video_info:
1356 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1358 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1362 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1363 self.report_rtmp_download()
1364 video_url_list = [(None, video_info['conn'][0])]
1365 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1366 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1367 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1369 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1370 url_data = compat_parse_qs(url_data_str)
1371 if 'itag' in url_data and 'url' in url_data:
1372 url = url_data['url'][0]
1373 if 'sig' in url_data:
1374 url += '&signature=' + url_data['sig'][0]
1375 elif 's' in url_data:
1376 encrypted_sig = url_data['s'][0]
1377 if self._downloader.params.get('verbose'):
1379 player_version = self._search_regex(r'-(.+)\.swf$',
1380 player_url if player_url else 'NOT FOUND',
1381 'flash player', fatal=False)
1382 player_desc = 'flash player %s' % player_version
1384 player_version = self._search_regex(r'html5player-(.+?)\.js', video_webpage,
1385 'html5 player', fatal=False)
1386 player_desc = u'html5 player %s' % player_version
1388 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1389 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1390 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1395 jsplayer_url_json = self._search_regex(
1396 r'"assets":.+?"js":\s*("[^"]+")',
1397 video_webpage, u'JS player URL')
1398 jsplayer_url = json.loads(jsplayer_url_json)
1400 signature = self._decrypt_signature(encrypted_sig, video_id, jsplayer_url, age_gate)
1401 url += '&signature=' + signature
1402 if 'ratebypass' not in url:
1403 url += '&ratebypass=yes'
1404 url_map[url_data['itag'][0]] = url
1405 video_url_list = self._get_video_url_list(url_map)
1406 if not video_url_list:
1408 elif video_info.get('hlsvp'):
1409 manifest_url = video_info['hlsvp'][0]
1410 url_map = self._extract_from_m3u8(manifest_url, video_id)
1411 video_url_list = self._get_video_url_list(url_map)
1412 if not video_url_list:
1416 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1419 for format_param, video_real_url in video_url_list:
1421 video_extension = self._video_extensions.get(format_param, 'flv')
1423 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1424 self._video_dimensions.get(format_param, '???'),
1425 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1429 'url': video_real_url,
1430 'uploader': video_uploader,
1431 'uploader_id': video_uploader_id,
1432 'upload_date': upload_date,
1433 'title': video_title,
1434 'ext': video_extension,
1435 'format': video_format,
1436 'thumbnail': video_thumbnail,
1437 'description': video_description,
1438 'player_url': player_url,
1439 'subtitles': video_subtitles,
1440 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract all videos of a YouTube playlist via the GData API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE must be passed.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The GData API refuses to serve results past index 1000.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Collect (position, url) pairs so the result can be ordered below.
            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos uploaded to a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present in a page's HTML whenever more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from a channel page, in order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Fixed: loop variable previously shadowed the builtin `id`.
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors: this regex is too permissive and would match them too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            ids_in_page = []
            for entry in response['feed']['entry']:
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube via the GData API ("ytsearchN:query")."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Each API page holds up to 50 results; keep fetching until we have
        # either n ids or every available result (totalItems).
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # Fixed: `if not 'items' in ...` -> idiomatic `not in`.
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            # Fixed: list comprehension instead of list(genexp).
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # Fixed: loop variable previously shadowed the builtin `id`.
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extract every season playlist of a (multi-season) YouTube show."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in m_seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Paging offset added per request; subclasses may override.
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            # Fixed: generator variable previously shadowed the builtin `id`.
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Logged-in user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Logged-in user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Logged-in user's "Watch Later" list (a personal feed)."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # NOTE(review): larger paging step for this feed — confirm against upstream.
    _PAGING_STEP = 100
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Logged-in user's favourites, resolved to their backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # Favourites are an ordinary playlist: scrape its id from the
        # logged-in page and hand off to the playlist extractor.
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')