14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
37 class YoutubeBaseInfoExtractor(InfoExtractor):
38 """Provide base functions for Youtube extractors"""
# NOTE(review): this listing is fragmentary (original line numbers are embedded in
# each line and several lines are missing, e.g. the `try:` headers matching the
# `except` clauses below). Comments describe only what is visible.
39 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
40 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
41 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name used to look up credentials in ~/.netrc.
42 _NETRC_MACHINE = 'youtube'
43 # If True it will raise an error if no login info is provided
44 _LOGIN_REQUIRED = False
# Log a one-line status message before the language-setting request.
46 def report_lang(self):
47 """Report attempt to set language."""
48 self.to_screen(u'Setting language')
# Best-effort: fetch _LANG_URL to pin the site language/locale; network errors
# are downgraded to a warning rather than aborting extraction.
50 def _set_language(self):
51 request = compat_urllib_request.Request(self._LANG_URL)
54 compat_urllib_request.urlopen(request).read()
55 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
56 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# Login flow (fragment): fetch the login page, scrape the GALX/dsh hidden
# fields, POST the assembled form, and inspect the result for failure.
61 (username, password) = self._get_login_info()
62 # No authentication to be performed
64 if self._LOGIN_REQUIRED:
65 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
68 request = compat_urllib_request.Request(self._LOGIN_URL)
70 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
71 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
72 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# Scrape anti-forgery tokens from hidden <input> fields on the login page.
77 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
80 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Fragment of the login form dictionary (login_form_strs); only some keys are
# visible here.
86 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
90 u'PersistentCookie': u'yes',
92 u'bgresponse': u'js_disabled',
93 u'checkConnection': u'',
94 u'checkedDomains': u'youtube',
100 u'signIn': u'Sign in',
102 u'service': u'youtube',
106 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
108 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
109 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
110 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
113 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the response still contains the login form, the credentials were rejected.
114 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
115 self._downloader.report_warning(u'unable to log in: bad username or password')
117 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
118 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# POST the age-confirmation form; unlike login, a failure here is fatal
# (raises ExtractorError) because age-gated pages cannot be parsed otherwise.
122 def _confirm_age(self):
125 'action_confirm': 'Confirm',
127 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
129 self.report_age_confirmation()
130 compat_urllib_request.urlopen(request).read().decode('utf-8')
131 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
132 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Runs once before extraction: set language, log in, confirm age (fragment —
# the bodies of the visible `if` guards are not in view).
135 def _real_initialize(self):
136 if self._downloader is None:
138 if not self._set_language():
140 if not self._login():
145 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
146 IE_DESC = u'YouTube.com'
# NOTE(review): fragmentary listing — embedded original line numbers, many lines
# missing (e.g. the _VALID_URL assignment header and the bulk of the
# _video_extensions / _video_dimensions tables). Comments cover visible code only.
# _VALID_URL body (re.VERBOSE): matches the many URL shapes that can carry an
# 11-character video ID, or the naked ID itself.
149 (?:https?://)? # http(s):// (optional)
150 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
151 tube\.majestyc\.net/|
152 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
153 (?:.*?\#/)? # handle anchor (#/) redirect urls
154 (?: # the various things that can precede the ID:
155 (?:(?:v|embed|e)/) # v/ or embed/ or e/
156 |(?: # or the v= param in all its forms
157 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
158 (?:\?|\#!?) # the params delimiter ? or # or #!
159 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
163 |youtu\.be/ # just youtu.be/xxxx
165 )? # all until now is optional -> you can pass the naked ID
166 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
167 (?(1).+)? # if we found the ID, everything can follow
# Regex that pulls the original URL out of a redirect's next_url parameter.
169 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
170 # Listed in order of quality
171 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
172 # Apple HTTP Live Streaming
173 '96', '95', '94', '93', '92', '132', '151',
175 '85', '84', '102', '83', '101', '82', '100',
177 '138', '137', '248', '136', '247', '135', '246',
178 '245', '244', '134', '243', '133', '242', '160',
180 '141', '172', '140', '171', '139',
# Same itags, but free (WebM) formats ranked ahead of proprietary ones at
# comparable quality.
182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
183 # Apple HTTP Live Streaming
184 '96', '95', '94', '93', '92', '132', '151',
186 '85', '102', '84', '101', '83', '100', '82',
188 '138', '248', '137', '247', '136', '246', '245',
189 '244', '135', '243', '134', '242', '133', '160',
191 '172', '141', '171', '140', '139',
# Container name -> itags in that container, best first (used when the user
# requests e.g. "mp4" instead of a numeric itag; see _get_video_url_list).
193 _video_formats_map = {
194 'flv': ['35', '34', '6', '5'],
195 '3gp': ['36', '17', '13'],
196 'mp4': ['38', '37', '22', '18'],
197 'webm': ['46', '45', '44', '43'],
# itag -> file extension table (body largely elided in this listing).
199 _video_extensions = {
221 # Apple HTTP Live Streaming
# itag -> resolution string table (body elided in this listing).
253 _video_dimensions = {
# _TESTS fragments: expected metadata for the test-suite downloads.
335 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
336 u"file": u"BaW_jenozKc.mp4",
338 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339 u"uploader": u"Philipp Hagemeister",
340 u"uploader_id": u"phihag",
341 u"upload_date": u"20121002",
342 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
346 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347 u"file": u"1ltcDfZMA3U.flv",
348 u"note": u"Test VEVO video (#897)",
350 u"upload_date": u"20070518",
351 u"title": u"Maps - It Will Find You",
352 u"description": u"Music video by Maps performing It Will Find You.",
353 u"uploader": u"MuteUSA",
354 u"uploader_id": u"MuteUSA"
358 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359 u"file": u"UxxajLWwzqY.mp4",
360 u"note": u"Test generic use_cipher_signature video (#897)",
362 u"upload_date": u"20120506",
363 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
364 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
365 u"uploader": u"Icona Pop",
366 u"uploader_id": u"IconaPop"
370 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371 u"file": u"07FYdnEawAQ.mp4",
372 u"note": u"Test VEVO video with age protection (#956)",
374 u"upload_date": u"20130703",
375 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376 u"description": u"md5:64249768eec3bc4276236606ea996373",
377 u"uploader": u"justintimberlakeVEVO",
378 u"uploader_id": u"justintimberlakeVEVO"
382 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
383 u'file': u'TGi3HqYrWHE.mp4',
384 u'note': u'm3u8 video',
386 u'title': u'Triathlon - Men - London 2012 Olympic Games',
387 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
388 u'uploader': u'olympic',
389 u'upload_date': u'20120807',
390 u'uploader_id': u'olympic',
# m3u8 streams are not downloaded during tests, only probed.
393 u'skip_download': True,
400 def suitable(cls, url):
401 """Receives a URL and returns True if suitable for this IE."""
# Playlist URLs can also match _VALID_URL, so defer to YoutubePlaylistIE first.
402 if YoutubePlaylistIE.suitable(url): return False
# _VALID_URL is written with whitespace/comments, hence re.VERBOSE.
403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
405 def __init__(self, *args, **kwargs):
406 super(YoutubeIE, self).__init__(*args, **kwargs)
# Per-instance cache of extracted signature functions, keyed by player URL
# (populated/consulted in _decrypt_signature).
407 self._player_cache = {}
# Progress message: fetching the watch-page HTML.
409 def report_video_webpage_download(self, video_id):
410 """Report attempt to download video webpage."""
411 self.to_screen(u'%s: Downloading video webpage' % video_id)
# Progress message: fetching the get_video_info endpoint.
413 def report_video_info_webpage_download(self, video_id):
414 """Report attempt to download video info webpage."""
415 self.to_screen(u'%s: Downloading video info webpage' % video_id)
# Progress message: parsing metadata out of the downloaded pages.
417 def report_information_extraction(self, video_id):
418 """Report attempt to extract video information."""
419 self.to_screen(u'%s: Extracting video information' % video_id)
# Progress message: a requested itag was not present in the URL map.
421 def report_unavailable_format(self, video_id, format):
422 """Report extracted video URL."""
423 self.to_screen(u'%s: Format %s not available' % (video_id, format))
# Progress message: the video is served over RTMP rather than HTTP.
425 def report_rtmp_download(self):
426 """Indicate the download will use the RTMP protocol."""
427 self.to_screen(u'RTMP download detected')
# Build (and cache on disk) a Python function that deciphers a scrambled
# signature of length `slen`, by downloading and analysing the JS or SWF
# player at `player_url`.
# NOTE(review): fragmentary listing — the try/except lines wrapping the cache
# read/write are missing from view.
429 def _extract_signature_function(self, video_id, player_url, slen):
430 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
432 player_type = id_m.group('ext')
433 player_id = id_m.group('id')
435 # Read from filesystem cache
# Cache key combines player type, player id and signature length.
436 func_id = '%s_%s_%d' % (player_type, player_id, slen)
# Guard against path traversal via a crafted player URL.
437 assert os.path.basename(func_id) == func_id
438 cache_dir = self._downloader.params.get('cachedir',
439 u'~/.youtube-dl/cache')
441 cache_enabled = cache_dir is not None
443 cache_fn = os.path.join(os.path.expanduser(cache_dir),
447 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
# Cached spec is a permutation: output char i comes from input index spec[i].
448 cache_spec = json.load(cachef)
449 return lambda s: u''.join(s[i] for i in cache_spec)
451 pass # No cache available
# Cache miss: download the player and extract the algorithm from its code.
453 if player_type == 'js':
454 code = self._download_webpage(
455 player_url, video_id,
456 note=u'Downloading %s player %s' % (player_type, player_id),
457 errnote=u'Download of %s failed' % player_url)
458 res = self._parse_sig_js(code)
459 elif player_type == 'swf':
460 urlh = self._request_webpage(
461 player_url, video_id,
462 note=u'Downloading %s player %s' % (player_type, player_id),
463 errnote=u'Download of %s failed' % player_url)
465 res = self._parse_sig_swf(code)
467 assert False, 'Invalid player type %r' % player_type
# Derive the permutation by running the function on a known test string,
# then persist it; a failed write is only a warning.
471 test_string = u''.join(map(compat_chr, range(slen)))
472 cache_res = res(test_string)
473 cache_spec = [ord(c) for c in cache_res]
475 os.makedirs(os.path.dirname(cache_fn))
476 except OSError as ose:
477 if ose.errno != errno.EEXIST:
479 write_json_file(cache_spec, cache_fn)
481 tb = traceback.format_exc()
482 self._downloader.report_warning(
483 u'Writing cache to %r failed: %s' % (cache_fn, tb))
# Debug helper: run `func` on a probe string and print equivalent Python
# slice-expression code (for pasting into _static_decrypt_signature).
# NOTE(review): fragmentary listing — parts of the run-length grouping loop
# in gen_sig_code are missing from view.
487 def _print_sig_code(self, func, slen):
488 def gen_sig_code(idxs):
# Render a run of consecutive indices as a single slice expression.
489 def _genslice(start, end, step):
490 starts = u'' if start == 0 else str(start)
491 ends = u':%d' % (end+step)
492 steps = u'' if step == 1 else (':%d' % step)
493 return u's[%s%s%s]' % (starts, ends, steps)
496 start = '(Never used)' # Quelch pyflakes warnings - start will be
497 # set as soon as step is set
# Walk adjacent index pairs, emitting slices for arithmetic runs and
# single s[i] terms otherwise.
498 for i, prev in zip(idxs[1:], idxs[:-1]):
502 yield _genslice(start, prev, step)
505 if i - prev in [-1, 1]:
510 yield u's[%d]' % prev
514 yield _genslice(start, i, step)
# Probe with the identity string chr(0)..chr(slen-1) to recover the permutation.
516 test_string = u''.join(map(compat_chr, range(slen)))
517 cache_res = func(test_string)
518 cache_spec = [ord(c) for c in cache_res]
519 expr_code = u' + '.join(gen_sig_code(cache_spec))
520 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
521 self.to_screen(u'Extracted signature function:\n' + code)
# Minimal JavaScript interpreter: locate the player's signature function in
# `jscode` and return a Python callable implementing it.
# NOTE(review): fragmentary listing — several branches (assignment handling,
# slice/index returns, the operator dispatch) are missing lines; comments
# describe only what is visible.
523 def _parse_sig_js(self, jscode):
524 funcname = self._search_regex(
525 r'signature=([a-zA-Z]+)', jscode,
526 u'Initial JS player signature function name')
531 return string.lowercase.index(varname)
# Execute one JS statement (assignment or return) against local_vars;
# recursion depth is bounded to avoid runaway interpretation.
533 def interpret_statement(stmt, local_vars, allow_recursion=20):
534 if allow_recursion < 0:
535 raise ExtractorError(u'Recursion limit reached')
537 if stmt.startswith(u'var '):
538 stmt = stmt[len(u'var '):]
539 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
540 r'=(?P<expr>.*)$', stmt)
# Indexed assignment: out[idx] = expr
542 if ass_m.groupdict().get('index'):
544 lvar = local_vars[ass_m.group('out')]
545 idx = interpret_expression(ass_m.group('index'),
546 local_vars, allow_recursion)
547 assert isinstance(idx, int)
550 expr = ass_m.group('expr')
# Plain assignment: out = expr
553 local_vars[ass_m.group('out')] = val
555 expr = ass_m.group('expr')
556 elif stmt.startswith(u'return '):
558 expr = stmt[len(u'return '):]
560 raise ExtractorError(
561 u'Cannot determine left side of statement in %r' % stmt)
563 v = interpret_expression(expr, local_vars, allow_recursion)
# Evaluate a JS expression: variable lookup, member access (split/join/
# length/reverse/slice), indexing, the % operator, or a function call.
566 def interpret_expression(expr, local_vars, allow_recursion):
571 return local_vars[expr]
573 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
575 member = m.group('member')
576 val = local_vars[m.group('in')]
577 if member == 'split("")':
579 if member == 'join("")':
581 if member == 'length':
583 if member == 'reverse()':
585 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
587 idx = interpret_expression(
588 slice_m.group('idx'), local_vars, allow_recursion-1)
# Indexing: in[idx]
592 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
594 val = local_vars[m.group('in')]
595 idx = interpret_expression(m.group('idx'), local_vars,
# Binary modulo: a % b (the only operator the player code uses here).
599 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
601 a = interpret_expression(m.group('a'),
602 local_vars, allow_recursion)
603 b = interpret_expression(m.group('b'),
604 local_vars, allow_recursion)
# Function call: lazily extract the callee, then evaluate its arguments.
608 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
610 fname = m.group('func')
611 if fname not in functions:
612 functions[fname] = extract_function(fname)
613 argvals = [int(v) if v.isdigit() else local_vars[v]
614 for v in m.group('args').split(',')]
615 return functions[fname](argvals)
616 raise ExtractorError(u'Unsupported JS expression %r' % expr)
# Find `function <name>(args){...}` in the source and wrap its body in a
# Python closure that interprets it statement by statement.
618 def extract_function(funcname):
620 r'function ' + re.escape(funcname) +
621 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
623 argnames = func_m.group('args').split(',')
626 local_vars = dict(zip(argnames, args))
627 for stmt in func_m.group('code').split(';'):
628 res = interpret_statement(stmt, local_vars)
632 initial_function = extract_function(funcname)
# The extracted function takes an argument list; adapt to a 1-arg callable.
633 return lambda s: initial_function([s])
# Extract the signature-decipher routine from a Flash (SWF) player: parse the
# SWF container, find the DoABC tag, parse the ABC (AVM2 bytecode) constant
# pools and class/method tables, locate the 'SignatureDecipher' class, and
# interpret its 'decipher' method's bytecode in Python.
# NOTE(review): heavily fragmentary listing — many structural lines (try
# headers, loop bodies, pool reads) are missing from view; comments are
# limited to what the visible lines establish.
635 def _parse_sig_swf(self, file_contents):
# SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed body).
636 if file_contents[1:3] != b'WS':
637 raise ExtractorError(
638 u'Not an SWF file; header is %r' % file_contents[:3])
639 if file_contents[:1] == b'C':
# 'CWS': everything after the 8-byte header is zlib-compressed.
640 content = zlib.decompress(file_contents[8:])
642 raise NotImplementedError(u'Unsupported compression format %r' %
# Generator over SWF tags: each record header packs code (10 bits) and
# length (6 bits); length 0x3f means a 32-bit extended length follows.
645 def extract_tags(content):
647 while pos < len(content):
648 header16 = struct.unpack('<H', content[pos:pos+2])[0]
650 tag_code = header16 >> 6
651 tag_len = header16 & 0x3f
653 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
655 assert pos+tag_len <= len(content)
656 yield (tag_code, content[pos:pos+tag_len])
660 for tag_code, tag in extract_tags(content)
# Skip the DoABC tag's flags + NUL-terminated name; bytecode follows.
662 p = code_tag.index(b'\0', 4) + 1
663 code_reader = io.BytesIO(code_tag[p:])
665 # Parse ABC (AVM2 ByteCode)
# Variable-length integer (LEB128-style, 7 bits per byte).
666 def read_int(reader=None):
674 b = struct.unpack('<B', buf)[0]
675 res = res | ((b & 0x7f) << shift)
# u30: unsigned variable-length int restricted to 30 bits.
681 def u30(reader=None):
682 res = read_int(reader)
683 assert res & 0xf0000000 == 0
# s32: same encoding, sign-extended from bit 31.
687 def s32(reader=None):
689 if v & 0x80000000 != 0:
690 v = - ((v ^ 0xffffffff) + 1)
# Length-prefixed UTF-8 string from the constant pool.
693 def read_string(reader=None):
697 resb = reader.read(slen)
698 assert len(resb) == slen
699 return resb.decode('utf-8')
701 def read_bytes(count, reader=None):
704 resb = reader.read(count)
705 assert len(resb) == count
708 def read_byte(reader=None):
709 resb = read_bytes(1, reader=reader)
710 res = struct.unpack('<B', resb)[0]
# --- Constant pools. ABC pools are 1-based: index 0 is implicit, hence
# the range(1, count) loops and the [u''] seed for strings.
713 # minor_version + major_version
718 for _c in range(1, int_count):
721 for _c in range(1, uint_count):
724 read_bytes((double_count-1) * 8)
726 constant_strings = [u'']
727 for _c in range(1, string_count):
729 constant_strings.append(s)
730 namespace_count = u30()
731 for _c in range(1, namespace_count):
735 for _c in range(1, ns_set_count):
737 for _c2 in range(count):
739 multiname_count = u30()
# Trailing u30 field counts for each multiname kind we must skip.
748 0x0e: 2, # MultinameA
749 0x1b: 1, # MultinameL
750 0x1c: 1, # MultinameLA
753 for _c in range(1, multiname_count):
755 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
757 u30() # namespace_idx
759 multinames.append(constant_strings[name_idx])
# Placeholder for kinds we don't resolve to a plain name.
761 multinames.append('[MULTINAME kind: %d]' % kind)
762 for _c2 in range(MULTINAME_SIZES[kind]):
# --- Method signatures: only the two flags we care about are kept.
767 MethodInfo = collections.namedtuple(
769 ['NEED_ARGUMENTS', 'NEED_REST'])
771 for method_id in range(method_count):
774 for _ in range(param_count):
776 u30() # name index (always 0 for youtube)
778 if flags & 0x08 != 0:
# HAS_OPTIONAL: skip default-value entries.
781 for c in range(option_count):
784 if flags & 0x80 != 0:
785 # Param names present
786 for _ in range(param_count):
788 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
789 method_infos.append(mi)
# --- Metadata: skipped entirely.
792 metadata_count = u30()
793 for _c in range(metadata_count):
796 for _c2 in range(item_count):
# Parse one trait record; records Method/Getter/Setter and Function traits
# into `methods`, skips slots/consts/classes, rejects unknown kinds.
800 def parse_traits_info():
801 trait_name_idx = u30()
802 kind_full = read_byte()
803 kind = kind_full & 0x0f
804 attrs = kind_full >> 4
806 if kind in [0x00, 0x06]: # Slot or Const
808 u30() # type_name_idx
812 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
815 methods[multinames[trait_name_idx]] = method_idx
816 elif kind == 0x04: # Class
819 elif kind == 0x05: # Function
822 methods[function_idx] = multinames[trait_name_idx]
824 raise ExtractorError(u'Unsupported trait kind %d' % kind)
826 if attrs & 0x4 != 0: # Metadata present
827 metadata_count = u30()
828 for _c3 in range(metadata_count):
829 u30() # metadata index
# --- Classes: scan instance_info records for the decipher class by name.
834 TARGET_CLASSNAME = u'SignatureDecipher'
835 searched_idx = multinames.index(TARGET_CLASSNAME)
836 searched_class_id = None
838 for class_id in range(class_count):
840 if name_idx == searched_idx:
841 # We found the class we're looking for!
842 searched_class_id = class_id
843 u30() # super_name idx
845 if flags & 0x08 != 0: # Protected namespace is present
846 u30() # protected_ns_idx
848 for _c2 in range(intrf_count):
852 for _c2 in range(trait_count):
855 if searched_class_id is None:
856 raise ExtractorError(u'Target class %r not found' %
# Second pass over class_info records: collect the target class's traits,
# building name->idx and idx->name maps for its methods.
861 for class_id in range(class_count):
864 for _c2 in range(trait_count):
865 trait_methods = parse_traits_info()
866 if class_id == searched_class_id:
867 method_names.update(trait_methods.items())
868 method_idxs.update(dict(
870 for name, idx in trait_methods.items()))
# --- Scripts: traits parsed and discarded.
874 for _c in range(script_count):
877 for _c2 in range(trait_count):
# --- Method bodies: keep the bytecode of the methods we mapped above.
881 method_body_count = u30()
882 Method = collections.namedtuple('Method', ['code', 'local_count'])
884 for _c in range(method_body_count):
888 u30() # init_scope_depth
889 u30() # max_scope_depth
891 code = read_bytes(code_length)
892 if method_idx in method_idxs:
893 m = Method(code, local_count)
894 methods[method_idxs[method_idx]] = m
895 exception_count = u30()
896 for _c2 in range(exception_count):
903 for _c2 in range(trait_count):
# Sanity: entire DoABC payload consumed, and a body found for every method.
906 assert p + code_reader.tell() == len(code_tag)
907 assert len(methods) == len(method_idxs)
909 method_pyfunctions = {}
# Compile one ABC method into a Python function by interpreting the subset
# of AVM2 opcodes the decipher code actually uses (stack machine).
911 def extract_function(func_name):
912 if func_name in method_pyfunctions:
913 return method_pyfunctions[func_name]
914 if func_name not in methods:
915 raise ExtractorError(u'Cannot find function %r' % func_name)
916 m = methods[func_name]
# Register 0 is `this`; args follow; locals are zero-initialised.
919 registers = ['(this)'] + list(args) + [None] * m.local_count
921 coder = io.BytesIO(m.code)
923 opcode = struct.unpack('!B', coder.read(1))[0]
924 if opcode == 36: # pushbyte
925 v = struct.unpack('!B', coder.read(1))[0]
927 elif opcode == 44: # pushstring
929 stack.append(constant_strings[idx])
930 elif opcode == 48: # pushscope
931 # We don't implement the scope register, so we'll just
932 # ignore the popped value
934 elif opcode == 70: # callproperty
936 mname = multinames[index]
937 arg_count = u30(coder)
# Arguments were pushed left-to-right; pop and reverse to restore order.
938 args = list(reversed(
939 [stack.pop() for _ in range(arg_count)]))
941 if mname == u'split':
942 assert len(args) == 1
943 assert isinstance(args[0], compat_str)
944 assert isinstance(obj, compat_str)
948 res = obj.split(args[0])
950 elif mname == u'slice':
951 assert len(args) == 1
952 assert isinstance(args[0], int)
953 assert isinstance(obj, list)
956 elif mname == u'join':
957 assert len(args) == 1
958 assert isinstance(args[0], compat_str)
959 assert isinstance(obj, list)
960 res = args[0].join(obj)
# Calls to sibling decipher helpers resolved earlier.
962 elif mname in method_pyfunctions:
963 stack.append(method_pyfunctions[mname](args))
965 raise NotImplementedError(
966 u'Unsupported property %r on %r'
968 elif opcode == 72: # returnvalue
971 elif opcode == 79: # callpropvoid
973 mname = multinames[index]
974 arg_count = u30(coder)
975 args = list(reversed(
976 [stack.pop() for _ in range(arg_count)]))
# reverse() mutates in place, so nothing is pushed back.
978 if mname == u'reverse':
979 assert isinstance(obj, list)
982 raise NotImplementedError(
983 u'Unsupported (void) property %r on %r'
985 elif opcode == 93: # findpropstrict
987 mname = multinames[index]
988 res = extract_function(mname)
990 elif opcode == 97: # setproperty
995 assert isinstance(obj, list)
996 assert isinstance(idx, int)
998 elif opcode == 98: # getlocal
1000 stack.append(registers[index])
1001 elif opcode == 99: # setlocal
1004 registers[index] = value
1005 elif opcode == 102: # getproperty
1007 pname = multinames[index]
1008 if pname == u'length':
1010 assert isinstance(obj, list)
1011 stack.append(len(obj))
1012 else: # Assume attribute access
1014 assert isinstance(idx, int)
1016 assert isinstance(obj, list)
1017 stack.append(obj[idx])
# Type coercions are no-ops for our purposes.
1018 elif opcode == 128: # coerce
1020 elif opcode == 133: # coerce_s
1021 assert isinstance(stack[-1], (type(None), compat_str))
1022 elif opcode == 164: # modulo
1023 value2 = stack.pop()
1024 value1 = stack.pop()
1025 res = value1 % value2
# Shorthand one-byte register accessors.
1027 elif opcode == 208: # getlocal_0
1028 stack.append(registers[0])
1029 elif opcode == 209: # getlocal_1
1030 stack.append(registers[1])
1031 elif opcode == 210: # getlocal_2
1032 stack.append(registers[2])
1033 elif opcode == 211: # getlocal_3
1034 stack.append(registers[3])
1035 elif opcode == 214: # setlocal_2
1036 registers[2] = stack.pop()
1037 elif opcode == 215: # setlocal_3
1038 registers[3] = stack.pop()
1040 raise NotImplementedError(
1041 u'Unsupported opcode %d' % opcode)
1043 method_pyfunctions[func_name] = resfunc
# Entry point is the class's 'decipher' method; adapt to a 1-arg callable.
1046 initial_function = extract_function(u'decipher')
1047 return lambda s: initial_function([s])
1049 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1050 """Turn the encrypted s field into a working signature"""
# NOTE(review): fragmentary listing — the try/except wrapping the dynamic
# extraction path is missing from view.
1052 if player_url is not None:
# Signature functions depend on (player URL, signature length); cache per
# player URL so repeated videos reuse the extracted function.
1054 if player_url not in self._player_cache:
1055 func = self._extract_signature_function(
1056 video_id, player_url, len(s)
1058 self._player_cache[player_url] = func
1059 func = self._player_cache[player_url]
# Optional debug output of the reverse-engineered algorithm.
1060 if self._downloader.params.get('youtube_print_sig_code'):
1061 self._print_sig_code(func, len(s))
# Dynamic extraction failed: warn with the traceback, then fall back to the
# hard-coded per-length algorithms.
1064 tb = traceback.format_exc()
1065 self._downloader.report_warning(
1066 u'Automatic signature extraction failed: ' + tb)
1068 self._downloader.report_warning(
1069 u'Warning: Falling back to static signature algorithm')
1070 return self._static_decrypt_signature(
1071 s, video_id, player_url, age_gate)
# Hard-coded fallback descrambler: one fixed permutation per signature
# length, expressed as string slicing/concatenation.
# NOTE(review): fragmentary listing — the `if len(s) == N:` guard preceding
# each return (and the age_gate branch header) is missing from view, so the
# length each permutation applies to is not visible here.
1073 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1075 # The videos with age protection use another player, so the
1076 # algorithms can be different.
1078 return s[2:63] + s[82] + s[64:82] + s[63]
1081 return s[86:29:-1] + s[88] + s[28:5:-1]
1083 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1085 return s[84:27:-1] + s[86] + s[26:5:-1]
1087 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1089 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1091 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1093 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1095 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
1097 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1099 return s[81:36:-1] + s[0] + s[35:2:-1]
1101 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
1103 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1105 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1107 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1109 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
# No algorithm known for this signature length.
1112 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
# Query the timedtext listing endpoint and return {language: subtitle URL}.
# NOTE(review): fragmentary listing — the try header and the loop over
# lang_list building `params` are missing from view.
1114 def _get_available_subtitles(self, video_id):
1116 sub_list = self._download_webpage(
1117 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1118 video_id, note=False)
# Best-effort: a failed listing download is only a warning.
1119 except ExtractorError as err:
1120 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
# Each <track> element carries a name and a lang_code attribute.
1122 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1127 params = compat_urllib_parse.urlencode({
# Format (srt/vtt/…) comes from the user's --sub-format option.
1130 'fmt': self._downloader.params.get('subtitlesformat'),
1132 url = u'http://www.youtube.com/api/timedtext?' + params
1133 sub_lang_list[lang] = url
1134 if not sub_lang_list:
1135 self._downloader.report_warning(u'video doesn\'t have subtitles')
1137 return sub_lang_list
1139 def _get_available_automatic_caption(self, video_id, webpage):
1140 """We need the webpage for getting the captions url, pass it as an
1141 argument to speed up the process."""
1142 sub_format = self._downloader.params.get('subtitlesformat')
1143 self.to_screen(u'%s: Looking for automatic captions' % video_id)
# The caption base URL lives inside the inline ytplayer.config JSON blob.
1144 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1145 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1147 self._downloader.report_warning(err_msg)
1149 player_config = json.loads(mobj.group(1))
1151 args = player_config[u'args']
1152 caption_url = args[u'ttsurl']
1153 timestamp = args[u'timestamp']
1154 # We get the available subtitles
1155 list_params = compat_urllib_parse.urlencode({
1160 list_url = caption_url + '&' + list_params
1161 list_page = self._download_webpage(list_url, video_id)
1162 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
# Only ASR ("kind=asr") tracks count as automatic captions.
1163 original_lang_node = caption_list.find('track')
1164 if original_lang_node.attrib.get('kind') != 'asr' :
1165 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1167 original_lang = original_lang_node.attrib['lang_code']
# Build a translated-caption URL for every available target language.
1170 for lang_node in caption_list.findall('target'):
1171 sub_lang = lang_node.attrib['lang_code']
1172 params = compat_urllib_parse.urlencode({
1173 'lang': original_lang,
1179 sub_lang_list[sub_lang] = caption_url + '&' + params
1180 return sub_lang_list
1181 # An extractor error can be raised by the download process if there are
1182 # no automatic captions but there are subtitles
1183 except (KeyError, ExtractorError):
1184 self._downloader.report_warning(err_msg)
# Print a table of itags with their extension, resolution and any special
# annotation (used for --list-formats).
# NOTE(review): the loop header over `formats` binding `x` is missing from view.
1187 def _print_formats(self, formats):
1188 print('Available formats:')
1190 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1191 self._video_dimensions.get(x, '???'),
1192 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
# Pull the 11-character video ID out of a URL via _VALID_URL.
# NOTE(review): the `if mobj is None:` guard and the return are missing from view.
1194 def _extract_id(self, url):
1195 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1197 raise ExtractorError(u'Invalid URL: %s' % url)
# Group 2 of _VALID_URL is the ([0-9A-Za-z_-]{11}) ID group.
1198 video_id = mobj.group(2)
1201 def _get_video_url_list(self, url_map):
# Docstring body (opening/closing quotes not in view):
1203 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1204 with the requested formats.
1206 req_format = self._downloader.params.get('format', None)
1207 format_limit = self._downloader.params.get('format_limit', None)
# Honor --prefer-free-formats by switching quality-ranking tables.
1208 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
# --format-limit: cap quality by truncating the ranking at the limit itag.
1209 if format_limit is not None and format_limit in available_formats:
1210 format_list = available_formats[available_formats.index(format_limit):]
1212 format_list = available_formats
# Keep only itags the server actually offered, in quality order.
1213 existing_formats = [x for x in format_list if x in url_map]
1214 if len(existing_formats) == 0:
1215 raise ExtractorError(u'no known formats available for video')
1216 if self._downloader.params.get('listformats', None):
1217 self._print_formats(existing_formats)
1219 if req_format is None or req_format == 'best':
1220 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1221 elif req_format == 'worst':
1222 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1223 elif req_format in ('-1', 'all'):
1224 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1226 # Specific formats. We pick the first in a slash-delimited sequence.
1227 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1228 # available in the specified format. For example,
1229 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1230 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1231 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1232 req_formats = req_format.split('/')
1233 video_url_list = None
1234 for rf in req_formats:
1236 video_url_list = [(rf, url_map[rf])]
# Container-name request: map e.g. 'mp4' to its itags, best first.
1238 if rf in self._video_formats_map:
1239 for srf in self._video_formats_map[rf]:
1241 video_url_list = [(srf, url_map[srf])]
1246 if video_url_list is None:
1247 raise ExtractorError(u'requested format not available')
1248 return video_url_list
# Download an HLS (m3u8) manifest and build {itag: stream URL} from its
# non-comment lines (the itag is embedded in each variant URL's path).
1250 def _extract_from_m3u8(self, manifest_url, video_id):
1252 def _get_urls(_manifest):
1253 lines = _manifest.split('\n')
# m3u8 comment/metadata lines start with '#'; the rest are stream URLs.
1254 urls = filter(lambda l: l and not l.startswith('#'),
1257 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1258 formats_urls = _get_urls(manifest)
1259 for format_url in formats_urls:
1260 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1261 url_map[itag] = format_url
1264 def _real_extract(self, url):
1265 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1266 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1268 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1269 mobj = re.search(self._NEXT_URL_RE, url)
1271 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1272 video_id = self._extract_id(url)
1275 self.report_video_webpage_download(video_id)
1276 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1277 request = compat_urllib_request.Request(url)
1279 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1280 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1281 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1283 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1285 # Attempt to extract SWF player URL
1286 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1287 if mobj is not None:
1288 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1293 self.report_video_info_webpage_download(video_id)
1294 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1295 self.report_age_confirmation()
1297 # We simulate the access to the video from www.youtube.com/v/{video_id}
1298 # this can be viewed without login into Youtube
1299 data = compat_urllib_parse.urlencode({'video_id': video_id,
1303 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1307 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1308 video_info_webpage = self._download_webpage(video_info_url, video_id,
1310 errnote='unable to download video info webpage')
1311 video_info = compat_parse_qs(video_info_webpage)
1314 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1315 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1316 % (video_id, el_type))
1317 video_info_webpage = self._download_webpage(video_info_url, video_id,
1319 errnote='unable to download video info webpage')
1320 video_info = compat_parse_qs(video_info_webpage)
1321 if 'token' in video_info:
1323 if 'token' not in video_info:
1324 if 'reason' in video_info:
1325 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1327 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1329 # Check for "rental" videos
1330 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1331 raise ExtractorError(u'"rental" videos not supported')
1333 # Start extracting information
1334 self.report_information_extraction(video_id)
1337 if 'author' not in video_info:
1338 raise ExtractorError(u'Unable to extract uploader name')
1339 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1342 video_uploader_id = None
1343 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1344 if mobj is not None:
1345 video_uploader_id = mobj.group(1)
1347 self._downloader.report_warning(u'unable to extract uploader nickname')
1350 if 'title' not in video_info:
1351 raise ExtractorError(u'Unable to extract video title')
1352 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1355 # We try first to get a high quality image:
1356 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1357 video_webpage, re.DOTALL)
1358 if m_thumb is not None:
1359 video_thumbnail = m_thumb.group(1)
1360 elif 'thumbnail_url' not in video_info:
1361 self._downloader.report_warning(u'unable to extract video thumbnail')
1362 video_thumbnail = ''
1363 else: # don't panic if we can't find it
1364 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1368 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1369 if mobj is not None:
1370 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1371 upload_date = unified_strdate(upload_date)
1374 video_description = get_element_by_id("eow-description", video_webpage)
1375 if video_description:
1376 video_description = clean_html(video_description)
1378 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1380 video_description = unescapeHTML(fd_mobj.group(1))
1382 video_description = u''
1385 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1387 if self._downloader.params.get('listsubtitles', False):
1388 self._list_available_subtitles(video_id, video_webpage)
1391 if 'length_seconds' not in video_info:
1392 self._downloader.report_warning(u'unable to extract video duration')
1395 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1397 # Decide which formats to download
1400 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1402 raise ValueError('Could not find vevo ID')
1403 info = json.loads(mobj.group(1))
1405 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1406 # this signatures are encrypted
1407 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1409 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1410 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1411 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1413 if 'url_encoded_fmt_stream_map' in video_info:
1414 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1416 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1417 elif 'adaptive_fmts' in video_info:
1418 if 'url_encoded_fmt_stream_map' in video_info:
1419 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1421 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1425 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1426 self.report_rtmp_download()
1427 video_url_list = [(None, video_info['conn'][0])]
1428 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1429 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1430 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1432 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1433 url_data = compat_parse_qs(url_data_str)
1434 if 'itag' in url_data and 'url' in url_data:
1435 url = url_data['url'][0]
1436 if 'sig' in url_data:
1437 url += '&signature=' + url_data['sig'][0]
1438 elif 's' in url_data:
1439 encrypted_sig = url_data['s'][0]
1440 if self._downloader.params.get('verbose'):
1442 if player_url is None:
1443 player_version = 'unknown'
1445 player_version = self._search_regex(
1446 r'-(.+)\.swf$', player_url,
1447 u'flash player', fatal=False)
1448 player_desc = 'flash player %s' % player_version
1450 player_version = self._search_regex(
1451 r'html5player-(.+?)\.js', video_webpage,
1452 'html5 player', fatal=False)
1453 player_desc = u'html5 player %s' % player_version
1455 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1456 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1457 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1460 jsplayer_url_json = self._search_regex(
1461 r'"assets":.+?"js":\s*("[^"]+")',
1462 video_webpage, u'JS player URL')
1463 player_url = json.loads(jsplayer_url_json)
1465 signature = self._decrypt_signature(
1466 encrypted_sig, video_id, player_url, age_gate)
1467 url += '&signature=' + signature
1468 if 'ratebypass' not in url:
1469 url += '&ratebypass=yes'
1470 url_map[url_data['itag'][0]] = url
1471 video_url_list = self._get_video_url_list(url_map)
1472 if not video_url_list:
1474 elif video_info.get('hlsvp'):
1475 manifest_url = video_info['hlsvp'][0]
1476 url_map = self._extract_from_m3u8(manifest_url, video_id)
1477 video_url_list = self._get_video_url_list(url_map)
1478 if not video_url_list:
1482 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1485 for format_param, video_real_url in video_url_list:
1487 video_extension = self._video_extensions.get(format_param, 'flv')
1489 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1490 self._video_dimensions.get(format_param, '???'),
1491 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1495 'url': video_real_url,
1496 'uploader': video_uploader,
1497 'uploader_id': video_uploader_id,
1498 'upload_date': upload_date,
1499 'title': video_title,
1500 'ext': video_extension,
1501 'format': video_format,
1502 'thumbnail': video_thumbnail,
1503 'description': video_description,
1504 'player_url': player_url,
1505 'subtitles': video_subtitles,
1506 'duration': video_duration
# NOTE(review): this listing is elided (the embedded source line numbers skip),
# so several guard/`try:` lines are not visible; comments below describe only
# what the visible lines establish.
1510 class YoutubePlaylistIE(InfoExtractor):
# Extractor for YouTube playlists; pages through the GData v2 JSON API.
1511 IE_DESC = u'YouTube.com playlists'
# _VALID_URL is written in verbose form — both suitable() and _real_extract
# compile it with re.VERBOSE below.
1512 _VALID_URL = r"""(?:
1517 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1518 \? (?:.*?&)*? (?:p|a|list)=
# Capture group 1: optional PL/EC/UU/FL prefix followed by >= 10 id chars.
1521 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
# Capture group 2: same shape but the prefix is mandatory.
1524 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
# GData v2 JSON feed URL; filled with (playlist_id, max_results, start_index).
1526 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1528 IE_NAME = u'youtube:playlist'
# presumably a @classmethod (decorator line elided) — TODO confirm.
1531 def suitable(cls, url):
1532 """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required because _VALID_URL contains layout whitespace.
1533 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1535 def _real_extract(self, url):
1536 # Extract playlist id
1537 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# Raised when the URL does not match (the `if mobj is None:` guard is elided).
1539 raise ExtractorError(u'Invalid URL: %s' % url)
1541 # Download playlist videos from API
# Whichever capture group matched carries the id.
1542 playlist_id = mobj.group(1) or mobj.group(2)
# Page through the API; page numbering is 1-based.
1545 for page_num in itertools.count(1):
1546 start_index = self._MAX_RESULTS * (page_num - 1) + 1
# The GData API rejects large start-index values; warn and stop paging.
1547 if start_index >= 1000:
1548 self._downloader.report_warning(u'Max number of results reached')
1550 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
1551 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
# Surface malformed JSON as an ExtractorError (the `try:` line is elided).
1554 response = json.loads(page)
1555 except ValueError as err:
1556 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1558 if 'feed' not in response:
1559 raise ExtractorError(u'Got a malformed response from YouTube API')
1560 playlist_title = response['feed']['title']['$t']
# A page with no 'entry' key means the previous page was the last full one.
1561 if 'entry' not in response['feed']:
1562 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch-URL) pairs; entries without media$group/yt$videoid
# (e.g. deleted videos) are skipped.
1565 for entry in response['feed']['entry']:
1566 index = entry['yt$position']['$t']
1567 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
1570 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
# Restore playlist order by position, then drop the position index.
1573 videos = [v[1] for v in sorted(videos)]
1575 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
1576 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# NOTE(review): elided listing — some initializers/guards are not visible.
1579 class YoutubeChannelIE(InfoExtractor):
# Extractor for YouTube channel pages: scrapes the first HTML page, then pages
# through the c4_browse_ajax JSON endpoint while the "load more" marker appears.
1580 IE_DESC = u'YouTube.com channels'
1581 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
# First (HTML) page; filled with (channel_id, page_number).
1582 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Substring whose presence in a page means more pages are available.
1583 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
# JSON pagination endpoint; filled with (paging_value, channel_id).
1584 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1585 IE_NAME = u'youtube:channel'
# Scrape watch-URL video ids from an HTML fragment, de-duplicated while
# preserving first-seen order. (The `ids_in_page = []` init and the `return`
# are elided in this listing.)
1587 def extract_videos_from_page(self, page):
1589 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1590 if mobj.group(1) not in ids_in_page:
1591 ids_in_page.append(mobj.group(1))
1594 def _real_extract(self, url):
1595 # Extract channel id
1596 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (the guard line is elided).
1598 raise ExtractorError(u'Invalid URL: %s' % url)
1600 # Download channel page
1601 channel_id = mobj.group(1)
1605 url = self._TEMPLATE_URL % (channel_id, pagenum)
1606 page = self._download_webpage(url, channel_id,
1607 u'Downloading page #%s' % pagenum)
1609 # Extract video identifiers
1610 ids_in_page = self.extract_videos_from_page(page)
1611 video_ids.extend(ids_in_page)
1613 # Download any subsequent channel pages using the json-based channel_ajax query
1614 if self._MORE_PAGES_INDICATOR in page:
1615 for pagenum in itertools.count(1):
1616 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1617 page = self._download_webpage(url, channel_id,
1618 u'Downloading page #%s' % pagenum)
# The AJAX endpoint returns JSON carrying two HTML fragments.
1620 page = json.loads(page)
1622 ids_in_page = self.extract_videos_from_page(page['content_html'])
1623 video_ids.extend(ids_in_page)
# Stop when the "load more" widget no longer advertises further pages.
1625 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1628 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
# Wrap every id as a url_result delegated to the Youtube video extractor.
1630 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1631 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
1632 return [self.playlist_result(url_entries, channel_id)]
# NOTE(review): elided listing — initializers and some guards are not visible.
1635 class YoutubeUserIE(InfoExtractor):
# Extractor for a user's uploads, fetched via the GData API in fixed-size pages.
1636 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
1637 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
1638 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; page size drives the pagination below.
1639 _GDATA_PAGE_SIZE = 50
1640 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
1641 IE_NAME = u'youtube:user'
# presumably a @classmethod (decorator line elided) — TODO confirm.
1644 def suitable(cls, url):
1645 # Don't return True if the url can be extracted with other youtube
1646 # extractor, the regex would is too permissive and it would match.
# Defers to every other *IE class in this module before claiming the URL,
# because _VALID_URL would otherwise shadow more specific extractors.
1647 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1648 if any(ie.suitable(url) for ie in other_ies): return False
1649 else: return super(YoutubeUserIE, cls).suitable(url)
1651 def _real_extract(self, url):
1653 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (the guard line is elided).
1655 raise ExtractorError(u'Invalid URL: %s' % url)
1657 username = mobj.group(1)
1659 # Download video ids using YouTube Data API. Result size per
1660 # query is limited (currently to 50 videos) so we need to query
1661 # page by page until there are no video ids - it means we got
# Zero-based page counter; start-index is 1-based.
1666 for pagenum in itertools.count(0):
1667 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1669 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1670 page = self._download_webpage(gdata_url, username,
1671 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
# Surface malformed JSON as an ExtractorError (the `try:` line is elided).
1674 response = json.loads(page)
1675 except ValueError as err:
1676 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1677 if 'entry' not in response['feed']:
1678 # Number of videos is a multiple of self._MAX_RESULTS
1681 # Extract video identifiers
# Each entry id ends in ".../<video_id>"; keep the last path component.
1683 for entry in response['feed']['entry']:
1684 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1685 video_ids.extend(ids_in_page)
1687 # A little optimization - if current page is not
1688 # "full", ie. does not contain PAGE_SIZE video ids then
1689 # we can assume that this page is the last one - there
1690 # are no more ids on further pages - no need to query
1693 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1696 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1697 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
1698 return [self.playlist_result(url_results, playlist_title = username)]
# NOTE(review): elided listing — loop initializers and `try:` lines are not
# visible.
1700 class YoutubeSearchIE(SearchInfoExtractor):
# Search extractor ("ytsearchN:query"); queries the GData search API in pages
# of 50 until enough ids are collected.
1701 IE_DESC = u'YouTube.com searches'
# Filled with (quoted_query, start_index); jsonc response format.
1702 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1704 IE_NAME = u'youtube:search'
1705 _SEARCH_KEY = 'ytsearch'
1707 def report_download_page(self, query, pagenum):
1708 """Report attempt to download search page with given number."""
1709 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1711 def _get_n_results(self, query, n):
1712 """Get a specified number of results for a query"""
# `limit` starts at n and is tightened to the API's totalItems below; the
# initialization lines are elided in this listing.
1718 while (50 * pagenum) < limit:
1719 self.report_download_page(query, pagenum+1)
# start-index is 1-based, hence the +1.
1720 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1721 request = compat_urllib_request.Request(result_url)
# Network errors become ExtractorError (the `try:` line is elided).
1723 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1724 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1725 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1726 api_response = json.loads(data)['data']
1728 if not 'items' in api_response:
1729 raise ExtractorError(u'[youtube] No video results')
1731 new_ids = list(video['id'] for video in api_response['items'])
1732 video_ids += new_ids
# Never ask for more than the API reports as available.
1734 limit = min(n, api_response['totalItems'])
# Trim any overshoot from the last 50-item page.
1737 if len(video_ids) > n:
1738 video_ids = video_ids[:n]
1739 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1740 return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube multi-season shows.

    A show page links one playlist per season; each season is returned
    as a url_result delegated to the playlist extractor.
    """
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show is published as its own playlist.
        season_links = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_links)))
        results = []
        for season in season_links:
            playlist_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(playlist_url, 'YoutubePlaylist'))
        return results
# NOTE(review): elided listing — the docstring delimiters, @property
# decorators, IE_NAME property header, _real_initialize body, the loop's
# `break`, and the `feed_entries` initializer are not visible.
1758 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1760 Base class for extractors that fetch info from
1761 http://www.youtube.com/feed_ajax
1762 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
# Feeds are per-account, so credentials are mandatory.
1764 _LOGIN_REQUIRED = True
1766 # use action_load_personal_feed instead of action_load_system_feed
1767 _PERSONAL_FEED = False
# presumably a @property (decorator line elided) — TODO confirm.
# Builds the feed_ajax URL; note the escaped %%s leaves one %s placeholder
# for the paging value filled in by _real_extract.
1770 def _FEED_TEMPLATE(self):
1771 action = 'action_load_system_feed'
1772 if self._PERSONAL_FEED:
1773 action = 'action_load_personal_feed'
1774 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
# IE_NAME is derived from the feed name (property header elided above).
1778 return u'youtube:%s' % self._FEED_NAME
1780 def _real_initialize(self):
1783 def _real_extract(self, url):
1785 # The step argument is available only in 2.7 or higher
1786 for i in itertools.count(0):
1787 paging = i*self._PAGING_STEP
1788 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1789 u'%s feed' % self._FEED_NAME,
1790 u'Downloading page %s' % i)
1791 info = json.loads(info)
1792 feed_html = info['feed_html']
# Pull the video ids out of the rendered feed HTML, de-duplicated in order.
1793 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
1794 ids = orderedSet(m.group(1) for m in m_ids)
1795 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
# A null 'paging' value marks the last page (the `break` is elided).
1796 if info['paging'] is None:
1798 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommended videos."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's Watch Later list."""
    # Watch Later is account-scoped, so the personal-feed action is used.
    _PERSONAL_FEED = True
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos.

    Resolves the favourites page to its backing playlist id and hands
    extraction off to the playlist extractor.
    """
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are account-scoped; extraction fails without credentials.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_playlist = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        # Delegate to YoutubePlaylistIE for the actual video extraction.
        return self.url_result(favourites_playlist, 'YoutubePlaylist')