[youtube:playlist] Fetch all the videos in a mix (fixes #3837)

author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Sun, 17 Apr 2016 15:07:57 +0000 (17:07 +0200)

committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Sun, 17 Apr 2016 15:07:57 +0000 (17:07 +0200)
author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sun, 17 Apr 2016 15:07:57 +0000 (17:07 +0200)
committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sun, 17 Apr 2016 15:07:57 +0000 (17:07 +0200)
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py

index 47df0f348d862e4ab455058d00321636123db807..af1c454217d0bec66a27a1bdc89c02195bb6274f 100644 (file)
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -44,7 +44,7 @@ class TestYoutubeLists(unittest.TestCase):
          ie = YoutubePlaylistIE(dl)
          result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w')
          entries = result['entries']
-        self.assertTrue(len(entries) >= 20)
+        self.assertTrue(len(entries) >= 50)
          original_video = entries[0]
          self.assertEqual(original_video['id'], 'OQpdSVF_k_w')
  
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 44c1191bd2a2fe5b6dbadf4ee9c0ae1f9b4c869e..a4dd628a1114d5aeb156b025db75ee31544da5f7 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1818,20 +1818,32 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
      def _extract_mix(self, playlist_id):
          # The mixes are generated from a single video
          # the id of the playlist is just 'RD' + video_id
-        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
-        webpage = self._download_webpage(
-            url, playlist_id, 'Downloading Youtube mix')
+        ids = []
+        last_id = playlist_id[-11:]
+        for n in itertools.count(1):
+            url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
+            webpage = self._download_webpage(
+                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
+            new_ids = orderedSet(re.findall(
+                r'''(?xs)data-video-username=".*?".*?
+                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
+                webpage))
+            # Fetch new pages until all the videos are repeated; it seems that
+            # there are always 51 unique videos.
+            new_ids = [_id for _id in new_ids if _id not in ids]
+            if not new_ids:
+                break
+            ids.extend(new_ids)
+            last_id = ids[-1]
+
+        url_results = self._ids_to_results(ids)
+
          search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
          title_span = (
              search_title('playlist-title') or
              search_title('title long-title') or
              search_title('title'))
          title = clean_html(title_span)
-        ids = orderedSet(re.findall(
-            r'''(?xs)data-video-username=".*?".*?
-                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
-            webpage))
-        url_results = self._ids_to_results(ids)
  
          return self.playlist_result(url_results, playlist_id, title)
author	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Sun, 17 Apr 2016 15:07:57 +0000 (17:07 +0200)
committer	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Sun, 17 Apr 2016 15:07:57 +0000 (17:07 +0200)
test/test_youtube_lists.py		patch \| blob \| history
youtube_dl/extractor/youtube.py		patch \| blob \| history