Merge pull request #7599 from lalinsky/fix-youtube

author Sergey M <dstftw@gmail.com>

Mon, 23 Nov 2015 14:52:23 +0000 (20:52 +0600)

committer Sergey M <dstftw@gmail.com>

Mon, 23 Nov 2015 14:52:23 +0000 (20:52 +0600)
author Sergey M <dstftw@gmail.com>
Mon, 23 Nov 2015 14:52:23 +0000 (20:52 +0600)
committer Sergey M <dstftw@gmail.com>
Mon, 23 Nov 2015 14:52:23 +0000 (20:52 +0600)
diff --combined youtube_dl/extractor/youtube.py

index 4a0ff6e9c82810143a450394ed0849493d3cd942,247769067c4789bee293aa240de4215359c3a261..1580c54fe779d9d300481fabfcfa4d2b5174ce06
--- 1/youtube_dl/extractor/youtube.py
--- 2/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@@ -178,13 -178,15 +178,13 @@@ class YoutubeBaseInfoExtractor(InfoExtr
               return
   
   
- -class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
- -    # Extract the video ids from the playlist pages
+ +class YoutubeEntryListBaseInfoExtractor(InfoExtractor):
+ +    # Extract entries from page with "Load more" button
       def _entries(self, page, playlist_id):
           more_widget_html = content_html = page
           for page_num in itertools.count(1):
- -            for video_id, video_title in self.extract_videos_from_page(content_html):
- -                yield self.url_result(
- -                    video_id, 'Youtube', video_id=video_id,
- -                    video_title=video_title)
+ +            for entry in self._process_page(content_html):
+ +                yield entry
   
               mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
               if not mobj:
@@@ -201,12 -203,6 +201,12 @@@
                   break
               more_widget_html = more['load_more_widget_html']
   
+ +
+ +class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+ +    def _process_page(self, content):
+ +        for video_id, video_title in self.extract_videos_from_page(content):
+ +            yield self.url_result(video_id, 'Youtube', video_id, video_title)
+ +
       def extract_videos_from_page(self, page):
           ids_in_page = []
           titles_in_page = []
@@@ -228,19 -224,6 +228,19 @@@
           return zip(ids_in_page, titles_in_page)
   
   
+ +class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+ +    def _process_page(self, content):
+ +        for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content):
+ +            yield self.url_result(
+ +                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
+ +
+ +    def _real_extract(self, url):
+ +        playlist_id = self._match_id(url)
+ +        webpage = self._download_webpage(url, playlist_id)
+ +        title = self._og_search_title(webpage, fatal=False)
+ +        return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
+ +
+ +
   class YoutubeIE(YoutubeBaseInfoExtractor):
       IE_DESC = 'YouTube.com'
       _VALID_URL = r"""(?x)^
@@@ -691,7 -674,23 +691,23 @@@
           {
               'url': 'http://vid.plus/FlRa-iH7PGw',
               'only_matching': True,
-         }
+         },
+         {
+             # Title with JS-like syntax "};"
+             'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
+             'info_dict': {
+                 'id': 'lsguqyKfVQg',
+                 'ext': 'mp4',
+                 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
+                 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
+                 'upload_date': '20151119',
+                 'uploader_id': 'IronSoulElf',
+                 'uploader': 'IronSoulElf',
+             },
+             'params': {
+                 'skip_download': True,
+             },
+         },
       ]
   
       def __init__(self, *args, **kwargs):
@@@ -875,16 -874,24 +891,24 @@@
               return {}
           return sub_lang_list
   
+     def _get_ytplayer_config(self, webpage):
+         patterns = [
+             r';ytplayer\.config\s*=\s*({.*?});ytplayer',
+             r';ytplayer\.config\s*=\s*({.*?});',
+         ]
+         config = self._search_regex(patterns, webpage, 'ytconfig.player', default=None)
+         if config is not None:
+             return json.loads(uppercase_escape(config))
+ 
       def _get_automatic_captions(self, video_id, webpage):
           """We need the webpage for getting the captions url, pass it as an
              argument to speed up the process."""
           self.to_screen('%s: Looking for automatic captions' % video_id)
-         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
+         player_config = self._get_ytplayer_config(webpage)
           err_msg = 'Couldn\'t find automatic captions for %s' % video_id
-         if mobj is None:
+         if player_config is None:
               self._downloader.report_warning(err_msg)
               return {}
-         player_config = json.loads(mobj.group(1))
           try:
               args = player_config['args']
               caption_url = args['ttsurl']
@@@ -1091,10 -1098,8 +1115,8 @@@
               age_gate = False
               video_info = None
               # Try looking directly into the video webpage
-             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
-             if mobj:
-                 json_code = uppercase_escape(mobj.group(1))
-                 ytplayer_config = json.loads(json_code)
+             ytplayer_config = self._get_ytplayer_config(video_webpage)
+             if ytplayer_config is not None:
                   args = ytplayer_config['args']
                   if args.get('url_encoded_fmt_stream_map'):
                       # Convert to the same format returned by compat_parse_qs
@@@ -1632,7 -1637,7 +1654,7 @@@ class YoutubePlaylistIE(YoutubeBaseInfo
                   self.report_warning('Youtube gives an alert message: ' + match)
   
           playlist_title = self._html_search_regex(
- -            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
+ +            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
               page, 'title')
   
           return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
@@@ -1759,29 -1764,6 +1781,29 @@@ class YoutubeUserIE(YoutubeChannelIE)
               return super(YoutubeUserIE, cls).suitable(url)
   
   
+ +class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
+ +    IE_DESC = 'YouTube.com user playlists'
+ +    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists'
+ +    IE_NAME = 'youtube:user:playlists'
+ +
+ +    _TESTS = [{
+ +        'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
+ +        'playlist_mincount': 4,
+ +        'info_dict': {
+ +            'id': 'ThirstForScience',
+ +            'title': 'Thirst for Science',
+ +        },
+ +    }, {
+ +        # with "Load more" button
+ +        'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+ +        'playlist_mincount': 70,
+ +        'info_dict': {
+ +            'id': 'igorkle1',
+ +            'title': 'Игорь Клейнер',
+ +        },
+ +    }]
+ +
+ +
   class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
       IE_DESC = 'YouTube.com searches'
       # there doesn't appear to be a real limit, for example if you search for
@@@ -1877,7 -1859,7 +1899,7 @@@ class YoutubeSearchURLIE(InfoExtractor)
           }
   
   
- -class YoutubeShowIE(InfoExtractor):
+ +class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
       IE_DESC = 'YouTube.com (multi-season) shows'
       _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
       IE_NAME = 'youtube:show'
@@@ -1891,9 -1873,26 +1913,9 @@@
       }]
   
       def _real_extract(self, url):
- -        mobj = re.match(self._VALID_URL, url)
- -        playlist_id = mobj.group('id')
- -        webpage = self._download_webpage(
- -            'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage')
- -        # There's one playlist for each season of the show
- -        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
- -        self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
- -        entries = [
- -            self.url_result(
- -                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
- -            for season in m_seasons
- -        ]
- -        title = self._og_search_title(webpage, fatal=False)
- -
- -        return {
- -            '_type': 'playlist',
- -            'id': playlist_id,
- -            'title': title,
- -            'entries': entries,
- -        }
+ +        playlist_id = self._match_id(url)
+ +        return super(YoutubeShowIE, self)._real_extract(
+ +            'https://www.youtube.com/show/%s/playlists' % playlist_id)
   
   
   class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
author	Sergey M <dstftw@gmail.com>
	Mon, 23 Nov 2015 14:52:23 +0000 (20:52 +0600)
committer	Sergey M <dstftw@gmail.com>
	Mon, 23 Nov 2015 14:52:23 +0000 (20:52 +0600)