14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account sign-in endpoint used by _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Fetched once to pin the site language/locale to English (US),
    # so scraped pages have predictable text.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Age-verification confirmation endpoint used by _confirm_age().
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in ~/.netrc for stored credentials.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False
46 def report_lang(self):
47 """Report attempt to set language."""
48 self.to_screen(u'Setting language')
    def _set_language(self):
        """Request _LANG_URL so later pages are served in English; best-effort."""
        request = compat_urllib_request.Request(self._LANG_URL)
            # Response body is discarded; only the server-side cookie matters.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Failing to set the language is non-fatal: only warn.
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
        # _login(): authenticate against the Google account service.
        # Credentials come from the CLI/netrc via the base class helper.
        (username, password) = self._get_login_info()
        # No authentication to be performed
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Best-effort: a failed fetch of the login page only warns.
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
        # Scrape the anti-CSRF tokens (GALX, dsh) from the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # Hidden form fields mimicking a browser submission of the sign-in form.
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
    def _confirm_age(self):
        """POST the age-verification form; raises ExtractorError on failure."""
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Unlike _set_language/_login, age confirmation failure is fatal.
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
    def _real_initialize(self):
        """Set site language and log in before any extraction starts."""
        if self._downloader is None:
        # Each step aborts initialization if it reports failure.
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose-mode URL pattern; group 2 captures the 11-char video ID.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/|
                        youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                     |youtu\.be/                                          # just youtu.be/xxxx
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                  # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    # Captures the original URL out of a redirect (e.g. age verification).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, but free (WebM) formats ranked ahead of proprietary ones.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                      # Apple HTTP Live Streaming
                                      '96', '95', '94', '93', '92', '132', '151',
                                      '85', '102', '84', '101', '83', '100', '82',
                                      '138', '248', '137', '247', '136', '246', '245',
                                      '244', '135', '243', '134', '242', '133', '160',
                                      '172', '141', '171', '140', '139',
    # Container -> itags (best first), used for requests like "-f mp4".
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    # itag -> file extension.
    _video_extensions = {
        # Apple HTTP Live Streaming
    # itag -> display resolution string, used by _print_formats().
    _video_dimensions = {
    # Self-test fixtures consumed by the test suite.
            u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
            u"file": u"BaW_jenozKc.mp4",
                u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
                u"uploader": u"Philipp Hagemeister",
                u"uploader_id": u"phihag",
                u"upload_date": u"20121002",
                u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
            u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
            u"file": u"1ltcDfZMA3U.flv",
            u"note": u"Test VEVO video (#897)",
                u"upload_date": u"20070518",
                u"title": u"Maps - It Will Find You",
                u"description": u"Music video by Maps performing It Will Find You.",
                u"uploader": u"MuteUSA",
                u"uploader_id": u"MuteUSA"
            u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
            u"file": u"UxxajLWwzqY.mp4",
            u"note": u"Test generic use_cipher_signature video (#897)",
                u"upload_date": u"20120506",
                u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
                u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
                u"uploader": u"Icona Pop",
                u"uploader_id": u"IconaPop"
            u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
            u"file": u"07FYdnEawAQ.mp4",
            u"note": u"Test VEVO video with age protection (#956)",
                u"upload_date": u"20130703",
                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
                u"description": u"md5:64249768eec3bc4276236606ea996373",
                u"uploader": u"justintimberlakeVEVO",
                u"uploader_id": u"justintimberlakeVEVO"
            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
            u'file': u'TGi3HqYrWHE.mp4',
            u'note': u'm3u8 video',
                u'title': u'Triathlon - Men - London 2012 Olympic Games',
                u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
                u'uploader': u'olympic',
                u'upload_date': u'20120807',
                u'uploader_id': u'olympic',
                u'skip_download': True,
400 def suitable(cls, url):
401 """Receives a URL and returns True if suitable for this IE."""
402 if YoutubePlaylistIE.suitable(url): return False
403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
405 def __init__(self, *args, **kwargs):
406 super(YoutubeIE, self).__init__(*args, **kwargs)
407 self._player_cache = {}
409 def report_video_webpage_download(self, video_id):
410 """Report attempt to download video webpage."""
411 self.to_screen(u'%s: Downloading video webpage' % video_id)
413 def report_video_info_webpage_download(self, video_id):
414 """Report attempt to download video info webpage."""
415 self.to_screen(u'%s: Downloading video info webpage' % video_id)
417 def report_information_extraction(self, video_id):
418 """Report attempt to extract video information."""
419 self.to_screen(u'%s: Extracting video information' % video_id)
421 def report_unavailable_format(self, video_id, format):
422 """Report extracted video URL."""
423 self.to_screen(u'%s: Format %s not available' % (video_id, format))
425 def report_rtmp_download(self):
426 """Indicate the download will use the RTMP protocol."""
427 self.to_screen(u'RTMP download detected')
    def _extract_signature_function(self, video_id, player_url, slen):
        """Build a signature-deciphering function for the given player.

        Downloads the JS or SWF player, parses its decipher routine, and
        caches the resulting index permutation (one per player/sig-length)
        on disk so subsequent runs skip the download+parse entirely.
        """
        # Player URLs end in "-<id>.<ext>"; ext selects the JS or SWF parser.
        id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%d' % (player_type, player_id, slen)
        # func_id is used as a filename; ensure it contains no path parts.
        assert os.path.basename(func_id) == func_id
        cache_dir = self._downloader.params.get('cachedir',
                                                u'~/.youtube-dl/cache')
        # The sentinel value u'NONE' disables the cache entirely.
        cache_enabled = cache_dir != u'NONE'
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            # The cached spec is just a list of indices: s -> permutation of s.
            return lambda s: u''.join(s[i] for i in cache_spec)
            pass # No cache available

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=u'Downloading %s player %s' % (player_type, player_id),
                errnote=u'Download of %s failed' % player_url)
            res = self._parse_sig_swf(code)
            assert False, 'Invalid player type %r' % player_type

            # Derive the index permutation by running the function on the
            # identity string chr(0)..chr(slen-1), then persist it.
            cache_res = res(map(compat_chr, range(slen)))
            cache_spec = [ord(c) for c in cache_res]
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                # A pre-existing cache directory is fine; anything else is not.
                if ose.errno != errno.EEXIST:
            write_json_file(cache_spec, cache_fn)
            # Cache-write failures are non-fatal: warn with the traceback.
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))
    def _print_sig_code(self, func, slen):
        """Print *func* (a deciphering function for length *slen*) as Python.

        Runs func on the identity string, then compresses the resulting
        index permutation into slice expressions for human consumption.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a contiguous run of indices as a Python slice literal.
                starts = u'' if start == 0 else str(start)
                ends = u':%d' % (end+step)
                steps = u'' if step == 1 else (':%d' % step)
                return u's[%s%s%s]' % (starts, ends, steps)

            start = '(Never used)' # Quelch pyflakes warnings - start will be
                                   # set as soon as step is set
            # Walk consecutive index pairs, emitting a slice whenever a
            # +1/-1 run ends and single lookups otherwise.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                    yield _genslice(start, prev, step)
                if i - prev in [-1, 1]:
                yield u's[%d]' % prev
                yield _genslice(start, i, step)

        cache_res = func(map(compat_chr, range(slen)))
        cache_spec = [ord(c) for c in cache_res]
        expr_code = u' + '.join(gen_sig_code(cache_spec))
        code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
        self.to_screen(u'Extracted signature function:\n' + code)
    def _parse_sig_js(self, jscode):
        """Extract the signature function from JS player code.

        Implements a tiny interpreter for the restricted JavaScript subset
        the player's decipher function uses (assignments, returns, string
        and array member access, %, and calls to sibling functions).
        Returns a Python callable s -> deciphered signature.
        """
        funcname = self._search_regex(
            r'signature=([a-zA-Z]+)', jscode,
            u'Initial JS player signature function name')

            # Single-letter argument names map to positions a=0, b=1, ...
            return string.lowercase.index(varname)

        def interpret_statement(stmt, local_vars, allow_recursion=20):
            # Guard against pathological/cyclic player code.
            if allow_recursion < 0:
                raise ExtractorError(u'Recursion limit reached')

            if stmt.startswith(u'var '):
                stmt = stmt[len(u'var '):]
            # Assignment: out = expr  or  out[index] = expr.
            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                             r'=(?P<expr>.*)$', stmt)
                if ass_m.groupdict().get('index'):
                        # Indexed assignment: evaluate the index, store in-place.
                        lvar = local_vars[ass_m.group('out')]
                        idx = interpret_expression(ass_m.group('index'),
                                                   local_vars, allow_recursion)
                        assert isinstance(idx, int)
                    expr = ass_m.group('expr')
                        # Plain assignment into the local scope.
                        local_vars[ass_m.group('out')] = val
                    expr = ass_m.group('expr')
            elif stmt.startswith(u'return '):
                expr = stmt[len(u'return '):]
                raise ExtractorError(
                    u'Cannot determine left side of statement in %r' % stmt)

            v = interpret_expression(expr, local_vars, allow_recursion)

        def interpret_expression(expr, local_vars, allow_recursion):
                return local_vars[expr]

            # Member access: var.split("")/join("")/length/reverse()/slice(i).
            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
                member = m.group('member')
                val = local_vars[m.group('in')]
                if member == 'split("")':
                if member == 'join("")':
                if member == 'length':
                if member == 'reverse()':
                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                    idx = interpret_expression(
                        slice_m.group('idx'), local_vars, allow_recursion-1)

            # Indexing: var[expr].
                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
                val = local_vars[m.group('in')]
                idx = interpret_expression(m.group('idx'), local_vars,

            # Binary modulo: a % b.
            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
                a = interpret_expression(m.group('a'),
                                         local_vars, allow_recursion)
                b = interpret_expression(m.group('b'),
                                         local_vars, allow_recursion)

            # Call to a sibling function, extracted (and memoized) on demand.
                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
                fname = m.group('func')
                if fname not in functions:
                    functions[fname] = extract_function(fname)
                argvals = [int(v) if v.isdigit() else local_vars[v]
                           for v in m.group('args').split(',')]
                return functions[fname](argvals)
            raise ExtractorError(u'Unsupported JS expression %r' % expr)

        def extract_function(funcname):
            # Locate "function name(args){code}" in the player source.
                r'function ' + re.escape(funcname) +
                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
            argnames = func_m.group('args').split(',')
                # Execute the body statement by statement in a fresh scope.
                local_vars = dict(zip(argnames, args))
                for stmt in func_m.group('code').split(';'):
                    res = interpret_statement(stmt, local_vars)

        initial_function = extract_function(funcname)
        # The decipher function takes a single argument: the signature string.
        return lambda s: initial_function([s])
    def _parse_sig_swf(self, file_contents):
        """Extract the signature function from an SWF (Flash) player.

        Parses the SWF container, locates the DoABC tag (AVM2 bytecode),
        reads the ABC constant pool / class / method tables to find the
        'SignatureDecipher' class, then interprets just enough AVM2
        opcodes to run its 'decipher' method in Python.
        Returns a Python callable s -> deciphered signature.
        """
        # SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed).
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                u'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            # zlib payload starts after the 8-byte SWF header.
            content = zlib.decompress(file_contents[8:])
            raise NotImplementedError(u'Unsupported compression format %r' %

        def extract_tags(content):
            # Iterate (tag_code, tag_body) pairs of the SWF tag stream.
            while pos < len(content):
                header16 = struct.unpack('<H', content[pos:pos+2])[0]
                # Upper 10 bits: code; lower 6 bits: short length.
                tag_code = header16 >> 6
                tag_len = header16 & 0x3f
                    # Long form: 32-bit length follows the header.
                    tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                assert pos+tag_len <= len(content)
                yield (tag_code, content[pos:pos+tag_len])

                        for tag_code, tag in extract_tags(content)
        # Skip flags + the NUL-terminated name that precede the ABC data.
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)
        def read_int(reader=None):
            # Variable-length little-endian int: 7 bits per byte,
            # high bit set means "more bytes follow".
                b = struct.unpack('<B', buf)[0]
                res = res | ((b & 0x7f) << shift)

        def u30(reader=None):
            # Unsigned 30-bit int (upper bits of the varint must be clear).
            res = read_int(reader)
            assert res & 0xf0000000 == 0

        def s32(reader=None):
            # Signed 32-bit int: undo two's complement by hand.
            if v & 0x80000000 != 0:
                v = - ((v ^ 0xffffffff) + 1)

        def read_string(reader=None):
            # Length-prefixed UTF-8 string.
            resb = reader.read(slen)
            assert len(resb) == slen
            return resb.decode('utf-8')

        def read_bytes(count, reader=None):
            resb = reader.read(count)
            assert len(resb) == count

        def read_byte(reader=None):
            resb = read_bytes(1, reader=reader)
            res = struct.unpack('<B', resb)[0]

        # minor_version + major_version
        # Constant pool: ints, uints, doubles, strings, namespaces, ns-sets.
        # Entry 0 of each pool is implicit, hence range(1, count).
        for _c in range(1, int_count):
        for _c in range(1, uint_count):
        read_bytes((double_count-1) * 8)
        constant_strings = [u'']
        for _c in range(1, string_count):
            constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
        for _c in range(1, ns_set_count):
            for _c2 in range(count):
        multiname_count = u30()
        # Number of extra u30 fields to skip per multiname kind.
            0x0e: 2, # MultinameA
            0x1b: 1, # MultinameL
            0x1c: 1, # MultinameLA
        for _c in range(1, multiname_count):
            assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
                u30() # namespace_idx
                multinames.append(constant_strings[name_idx])
                # Placeholder for kinds we don't need to resolve.
                multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):

        # Method signatures: only the two flag bits we care about are kept.
        MethodInfo = collections.namedtuple(
            ['NEED_ARGUMENTS', 'NEED_REST'])
        for method_id in range(method_count):
            for _ in range(param_count):
            u30() # name index (always 0 for youtube)
            if flags & 0x08 != 0:
                # Options (default values) present: skip them.
                for c in range(option_count):
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata: skipped entirely.
        metadata_count = u30()
        for _c in range(metadata_count):
            for _c2 in range(item_count):

        def parse_traits_info():
            # Parse one trait; returns {trait name: method index} for the
            # method-like kinds, skipping everything else.
            trait_name_idx = u30()
            kind_full = read_byte()
            # Low nibble: trait kind; high nibble: attribute flags.
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            if kind in [0x00, 0x06]: # Slot or Const
                u30() # type_name_idx
            elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
                methods[multinames[trait_name_idx]] = method_idx
            elif kind == 0x04: # Class
            elif kind == 0x05: # Function
                methods[function_idx] = multinames[trait_name_idx]
                raise ExtractorError(u'Unsupported trait kind %d' % kind)
            if attrs & 0x4 != 0: # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30() # metadata index

        # Classes: scan instance infos for the decipher class by name.
        TARGET_CLASSNAME = u'SignatureDecipher'
        searched_idx = multinames.index(TARGET_CLASSNAME)
        searched_class_id = None
        for class_id in range(class_count):
            if name_idx == searched_idx:
                # We found the class we're looking for!
                searched_class_id = class_id
            u30() # super_name idx
            if flags & 0x08 != 0: # Protected namespace is present
                u30() # protected_ns_idx
            for _c2 in range(intrf_count):
            for _c2 in range(trait_count):

        if searched_class_id is None:
            raise ExtractorError(u'Target class %r not found' %

        # Class infos: collect name<->index maps for the target class only.
        for class_id in range(class_count):
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                if class_id == searched_class_id:
                    method_names.update(trait_methods.items())
                    method_idxs.update(dict(
                        for name, idx in trait_methods.items()))

        # Scripts: skipped.
        for _c in range(script_count):
            for _c2 in range(trait_count):

        # Method bodies: keep the bytecode of the methods we identified.
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            u30() # init_scope_depth
            u30() # max_scope_depth
            code = read_bytes(code_length)
            if method_idx in method_idxs:
                m = Method(code, local_count)
                methods[method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
            for _c2 in range(trait_count):

        # Sanity: we consumed the whole tag and found every wanted method.
        assert p + code_reader.tell() == len(code_tag)
        assert len(methods) == len(method_idxs)

        method_pyfunctions = {}

        def extract_function(func_name):
            # Compile (memoized) an AVM2 method into a Python callable.
            if func_name in method_pyfunctions:
                return method_pyfunctions[func_name]
            if func_name not in methods:
                raise ExtractorError(u'Cannot find function %r' % func_name)
            m = methods[func_name]

                # Register 0 is "this"; then the arguments, then locals.
                registers = ['(this)'] + list(args) + [None] * m.local_count
                coder = io.BytesIO(m.code)
                    # Minimal AVM2 interpreter: only the opcodes the
                    # decipher routine actually uses are implemented.
                    opcode = struct.unpack('!B', coder.read(1))[0]
                    if opcode == 36: # pushbyte
                        v = struct.unpack('!B', coder.read(1))[0]
                    elif opcode == 44: # pushstring
                        stack.append(constant_strings[idx])
                    elif opcode == 48: # pushscope
                        # We don't implement the scope register, so we'll just
                        # ignore the popped value
                    elif opcode == 70: # callproperty
                        mname = multinames[index]
                        arg_count = u30(coder)
                        # Arguments are pushed left-to-right, so popping
                        # yields them reversed.
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, compat_str)
                            res = obj.split(args[0])
                        elif mname == u'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            assert isinstance(obj, list)
                        elif mname == u'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            assert isinstance(obj, list)
                            res = args[0].join(obj)
                        elif mname in method_pyfunctions:
                            stack.append(method_pyfunctions[mname](args))
                            raise NotImplementedError(
                                u'Unsupported property %r on %r'
                    elif opcode == 72: # returnvalue
                    elif opcode == 79: # callpropvoid
                        mname = multinames[index]
                        arg_count = u30(coder)
                        args = list(reversed(
                            [stack.pop() for _ in range(arg_count)]))
                        if mname == u'reverse':
                            assert isinstance(obj, list)
                            raise NotImplementedError(
                                u'Unsupported (void) property %r on %r'
                    elif opcode == 93: # findpropstrict
                        mname = multinames[index]
                        # Resolve sibling methods lazily, on first reference.
                        res = extract_function(mname)
                    elif opcode == 97: # setproperty
                        assert isinstance(obj, list)
                        assert isinstance(idx, int)
                    elif opcode == 98: # getlocal
                        stack.append(registers[index])
                    elif opcode == 99: # setlocal
                        registers[index] = value
                    elif opcode == 102: # getproperty
                        pname = multinames[index]
                        if pname == u'length':
                            assert isinstance(obj, list)
                            stack.append(len(obj))
                        else: # Assume attribute access
                            assert isinstance(idx, int)
                            assert isinstance(obj, list)
                            stack.append(obj[idx])
                    elif opcode == 128: # coerce
                    elif opcode == 133: # coerce_s
                        assert isinstance(stack[-1], (type(None), compat_str))
                    elif opcode == 164: # modulo
                        value2 = stack.pop()
                        value1 = stack.pop()
                        res = value1 % value2
                    elif opcode == 208: # getlocal_0
                        stack.append(registers[0])
                    elif opcode == 209: # getlocal_1
                        stack.append(registers[1])
                    elif opcode == 210: # getlocal_2
                        stack.append(registers[2])
                    elif opcode == 211: # getlocal_3
                        stack.append(registers[3])
                    elif opcode == 214: # setlocal_2
                        registers[2] = stack.pop()
                    elif opcode == 215: # setlocal_3
                        registers[3] = stack.pop()
                        raise NotImplementedError(
                            u'Unsupported opcode %d' % opcode)

            method_pyfunctions[func_name] = resfunc

        initial_function = extract_function(u'decipher')
        # The decipher method takes a single argument: the signature string.
        return lambda s: initial_function([s])
    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
        """Turn the encrypted s field into a working signature"""

        if player_url is not None:
                # One deciphering function per player URL; extraction is
                # expensive, so cache the result for the whole session.
                if player_url not in self._player_cache:
                    func = self._extract_signature_function(
                        video_id, player_url, len(s)
                    self._player_cache[player_url] = func
                func = self._player_cache[player_url]
                if self._downloader.params.get('youtube_print_sig_code'):
                    self._print_sig_code(func, len(s))
                # Any failure in automatic extraction only warns; we fall
                # through to the hard-coded static algorithm below.
                tb = traceback.format_exc()
                self._downloader.report_warning(
                    u'Automatic signature extraction failed: ' + tb)

            self._downloader.report_warning(
                u'Warning: Falling back to static signature algorithm')
        return self._static_decrypt_signature(
            s, video_id, player_url, age_gate)
    def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
        """Hard-coded fallback: each branch below handles one known
        signature length with a fixed slice/concatenation permutation.
        """
        # The videos with age protection use another player, so the
        # algorithms can be different.
            return s[2:63] + s[82] + s[64:82] + s[63]

            return s[86:29:-1] + s[88] + s[28:5:-1]
            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
            return s[84:27:-1] + s[86] + s[26:5:-1]
            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
            return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
            return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
            return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
            return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
            return s[81:36:-1] + s[0] + s[35:2:-1]
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

        # Unknown length: the static table cannot help; a retry may hit a
        # player whose signature we do know how to handle.
        raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
    def _get_available_subtitles(self, video_id):
        """Return {language: subtitle URL} for the video's manual subtitles."""
            sub_list = self._download_webpage(
                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            # Best-effort: warn and treat the video as having no subtitles.
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        # The listing endpoint returns XML with (name, lang_code) per track.
        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
            params = compat_urllib_parse.urlencode({
                'fmt': self._downloader.params.get('subtitlesformat'),
            url = u'http://www.youtube.com/api/timedtext?' + params
            sub_lang_list[lang] = url
        if not sub_lang_list:
            self._downloader.report_warning(u'video doesn\'t have subtitles')
        return sub_lang_list
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process."""
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        # The caption base URL lives inside the inline player config JSON.
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
            self._downloader.report_warning(err_msg)
        player_config = json.loads(mobj.group(1))
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
            list_url = caption_url + '&' + list_params
            list_page = self._download_webpage(list_url, video_id)
            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
            # Only ASR ("automatic speech recognition") tracks count as
            # automatic captions; anything else means there are none.
            original_lang_node = caption_list.find('track')
            if original_lang_node.attrib.get('kind') != 'asr' :
                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            original_lang = original_lang_node.attrib['lang_code']

            # Build a translated-caption URL for every target language.
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
    def _print_formats(self, formats):
        """Print one line per itag: extension, dimensions, special tag."""
        print('Available formats:')
            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                        self._video_dimensions.get(x, '???'),
                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
    def _extract_id(self, url):
        """Return the 11-character video ID from *url* or raise ExtractorError."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 of _VALID_URL is the video ID.
        video_id = mobj.group(2)
    def _get_video_url_list(self, url_map):
        """
        Transform a dictionary in the format {itag:url} to a list of (itag, url)
        with the requested formats.
        """
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)
        # Quality ordering depends on whether free formats are preferred.
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            # Cap quality: keep only formats at or below the limit.
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        # Keep quality ordering, but only formats the video actually has.
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
            # available in the specified format. For example,
            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                # Container name: try its itags in quality order.
                if rf in self._video_formats_map:
                    for srf in self._video_formats_map[rf]:
                            video_url_list = [(srf, url_map[srf])]
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        return video_url_list
    def _extract_from_m3u8(self, manifest_url, video_id):
        """Build {itag: url} from an HLS master manifest."""
        def _get_urls(_manifest):
            # Non-comment, non-empty lines of an m3u8 playlist are URLs.
            lines = _manifest.split('\n')
            urls = filter(lambda l: l and not l.startswith('#'),
        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
        formats_urls = _get_urls(manifest)
        for format_url in formats_urls:
            # The itag is embedded in the variant URL path.
            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
            url_map[itag] = format_url
1262 def _real_extract(self, url):
1263 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1264 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1266 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1267 mobj = re.search(self._NEXT_URL_RE, url)
1269 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1270 video_id = self._extract_id(url)
1273 self.report_video_webpage_download(video_id)
1274 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1275 request = compat_urllib_request.Request(url)
1277 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1278 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1279 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1281 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1283 # Attempt to extract SWF player URL
1284 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1285 if mobj is not None:
1286 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1291 self.report_video_info_webpage_download(video_id)
1292 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1293 self.report_age_confirmation()
1295 # We simulate the access to the video from www.youtube.com/v/{video_id}
1296 # this can be viewed without login into Youtube
1297 data = compat_urllib_parse.urlencode({'video_id': video_id,
1301 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1305 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1306 video_info_webpage = self._download_webpage(video_info_url, video_id,
1308 errnote='unable to download video info webpage')
1309 video_info = compat_parse_qs(video_info_webpage)
1312 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1313 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1314 % (video_id, el_type))
1315 video_info_webpage = self._download_webpage(video_info_url, video_id,
1317 errnote='unable to download video info webpage')
1318 video_info = compat_parse_qs(video_info_webpage)
1319 if 'token' in video_info:
1321 if 'token' not in video_info:
1322 if 'reason' in video_info:
1323 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1325 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1327 # Check for "rental" videos
1328 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1329 raise ExtractorError(u'"rental" videos not supported')
1331 # Start extracting information
1332 self.report_information_extraction(video_id)
1335 if 'author' not in video_info:
1336 raise ExtractorError(u'Unable to extract uploader name')
1337 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1340 video_uploader_id = None
1341 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1342 if mobj is not None:
1343 video_uploader_id = mobj.group(1)
1345 self._downloader.report_warning(u'unable to extract uploader nickname')
1348 if 'title' not in video_info:
1349 raise ExtractorError(u'Unable to extract video title')
1350 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1353 # We try first to get a high quality image:
1354 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1355 video_webpage, re.DOTALL)
1356 if m_thumb is not None:
1357 video_thumbnail = m_thumb.group(1)
1358 elif 'thumbnail_url' not in video_info:
1359 self._downloader.report_warning(u'unable to extract video thumbnail')
1360 video_thumbnail = ''
1361 else: # don't panic if we can't find it
1362 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1366 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1367 if mobj is not None:
1368 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1369 upload_date = unified_strdate(upload_date)
1372 video_description = get_element_by_id("eow-description", video_webpage)
1373 if video_description:
1374 video_description = clean_html(video_description)
1376 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1378 video_description = unescapeHTML(fd_mobj.group(1))
1380 video_description = u''
1383 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1385 if self._downloader.params.get('listsubtitles', False):
1386 self._list_available_subtitles(video_id, video_webpage)
1389 if 'length_seconds' not in video_info:
1390 self._downloader.report_warning(u'unable to extract video duration')
1393 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1395 # Decide which formats to download
1398 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1400 raise ValueError('Could not find vevo ID')
1401 info = json.loads(mobj.group(1))
1403 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1404 # this signatures are encrypted
1405 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1407 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1408 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1409 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1411 if 'url_encoded_fmt_stream_map' in video_info:
1412 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1414 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1415 elif 'adaptive_fmts' in video_info:
1416 if 'url_encoded_fmt_stream_map' in video_info:
1417 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1419 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1423 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1424 self.report_rtmp_download()
1425 video_url_list = [(None, video_info['conn'][0])]
1426 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1427 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1428 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1430 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1431 url_data = compat_parse_qs(url_data_str)
1432 if 'itag' in url_data and 'url' in url_data:
1433 url = url_data['url'][0]
1434 if 'sig' in url_data:
1435 url += '&signature=' + url_data['sig'][0]
1436 elif 's' in url_data:
1437 encrypted_sig = url_data['s'][0]
1438 if self._downloader.params.get('verbose'):
1440 if player_url is None:
1441 player_version = 'unknown'
1443 player_version = self._search_regex(
1444 r'-(.+)\.swf$', player_url,
1445 u'flash player', fatal=False)
1446 player_desc = 'flash player %s' % player_version
1448 player_version = self._search_regex(
1449 r'html5player-(.+?)\.js', video_webpage,
1450 'html5 player', fatal=False)
1451 player_desc = u'html5 player %s' % player_version
1453 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1454 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1455 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1458 jsplayer_url_json = self._search_regex(
1459 r'"assets":.+?"js":\s*("[^"]+")',
1460 video_webpage, u'JS player URL')
1461 player_url = json.loads(jsplayer_url_json)
1463 signature = self._decrypt_signature(
1464 encrypted_sig, video_id, player_url, age_gate)
1465 url += '&signature=' + signature
1466 if 'ratebypass' not in url:
1467 url += '&ratebypass=yes'
1468 url_map[url_data['itag'][0]] = url
1469 video_url_list = self._get_video_url_list(url_map)
1470 if not video_url_list:
1472 elif video_info.get('hlsvp'):
1473 manifest_url = video_info['hlsvp'][0]
1474 url_map = self._extract_from_m3u8(manifest_url, video_id)
1475 video_url_list = self._get_video_url_list(url_map)
1476 if not video_url_list:
1480 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1483 for format_param, video_real_url in video_url_list:
1485 video_extension = self._video_extensions.get(format_param, 'flv')
1487 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1488 self._video_dimensions.get(format_param, '???'),
1489 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1493 'url': video_real_url,
1494 'uploader': video_uploader,
1495 'uploader_id': video_uploader_id,
1496 'upload_date': upload_date,
1497 'title': video_title,
1498 'ext': video_extension,
1499 'format': video_format,
1500 'thumbnail': video_thumbnail,
1501 'description': video_description,
1502 'player_url': player_url,
1503 'subtitles': video_subtitles,
1504 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract every video of a YouTube playlist via the gdata API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # The playlist id may land in either capture group of _VALID_URL.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Page through the gdata API, collecting (position, watch-URL) pairs.
        indexed_urls = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    indexed_urls.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']))

        # Restore playlist order, then wrap each watch URL as a result entry.
        ordered_urls = [pair[1] for pair in sorted(indexed_urls)]
        url_results = [self.url_result(vurl, 'Youtube') for vurl in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all uploads listed on a YouTube channel page."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids referenced by watch links on *page*,
        in first-seen order."""
        ids_in_page = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # The first page is plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Any further pages come from the json-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the gdata API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Our regex is very permissive, so defer to any other youtube
        # extractor that claims the URL first.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # Result size per gdata query is limited (currently to 50 videos),
        # so request successive pages until a short or empty one shows up.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A page that is not "full" (fewer than PAGE_SIZE ids) must be
            # the last one, so we can stop without one extra empty query.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title=username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the gdata videos API ("ytsearchN:query")."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the API 50 results at a time; the effective limit
        shrinks to the API-reported total so we never request past the end.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # Idiomatic membership test (was: `not 'items' in api_response`).
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            # Comprehension instead of list(generator).
            video_ids += [video['id'] for video in api_response['items']]

            # The API reports the true total; cap the paging loop with it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Resolve a YouTube show page into one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as its own playlist.
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in m_seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for page_idx in itertools.count(0):
            paging = page_idx * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % page_idx)
            info = json.loads(info)
            feed_html = info['feed_html']
            id_matches = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            video_ids = orderedSet(m.group(1) for m in id_matches)
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in video_ids)
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommendations."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's "watch later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PAGING_STEP = 100
    # Watch-later is per-account, so it must use the personal-feed action.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds a regular playlist; delegate to it.
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')