release 2014.01.23.3

[youtube-dl.git] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 870b7c4cabb502f870b7f1aeee9b5295156b3e29..f70dca77ce6db3c04aadfb782132c52456eb71d6 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -28,6 +28,7 @@ from ..utils import (
      get_element_by_attribute,
      ExtractorError,
      int_or_none,
+    PagedList,
      RegexNotFoundError,
      unescapeHTML,
      unified_strdate,
@@ -206,6 +207,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
  
          # Dash webm
+        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
          '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
          '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
          '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
@@ -275,16 +281,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
              u"file": u"a9LDPn-MO4I.m4a",
              u"note": u"256k DASH audio (format 141) via DASH manifest",
-            u"params": {
-                u"format": "141"
-            },
              u"info_dict": {
                  u"upload_date": "20121002",
                  u"uploader_id": "8KVIDEO",
                  u"description": "No description available.",
                  u"uploader": "8KVIDEO",
                  u"title": "UHDTV TEST 8K VIDEO.mp4"
-            }
+            },
+            u"params": {
+                u"youtube_include_dash_manifest": True,
+                u"format": "141",
+            },
          },
      ]
  
@@ -1288,7 +1295,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                      'url': video_real_url,
                      'player_url': player_url,
                  }
-                dct.update(self._formats[itag])
+                if itag in self._formats:
+                    dct.update(self._formats[itag])
                  formats.append(dct)
              return formats
  
@@ -1354,7 +1362,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
  
          # Look for the DASH manifest
          dash_manifest_url_lst = video_info.get('dashmpd')
-        if dash_manifest_url_lst and dash_manifest_url_lst[0]:
+        if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
+                self._downloader.params.get('youtube_include_dash_manifest', False)):
              try:
                  dash_doc = self._download_xml(
                      dash_manifest_url_lst[0], video_id,
@@ -1626,44 +1635,35 @@ class YoutubeUserIE(InfoExtractor):
          # page by page until there are no video ids - it means we got
          # all of them.
  
-        url_results = []
-
-        for pagenum in itertools.count(0):
+        def download_page(pagenum):  # generator: yields the url-result dicts of one GData page
              start_index = pagenum * self._GDATA_PAGE_SIZE + 1
  
              gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-            page = self._download_webpage(gdata_url, username,
-                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+            page = self._download_webpage(
+                gdata_url, username,
+                u'Downloading video ids from %d to %d' % (
+                    start_index, start_index + self._GDATA_PAGE_SIZE))
  
              try:
                  response = json.loads(page)
              except ValueError as err:
                  raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
              if 'entry' not in response['feed']:
-                # Number of videos is a multiple of self._MAX_RESULTS
-                break
+                return  # no entries on this page: yield nothing
  
              # Extract video identifiers
              entries = response['feed']['entry']
              for entry in entries:
                  title = entry['title']['$t']
                  video_id = entry['id']['$t'].split('/')[-1]
-                url_results.append({
+                yield {
                      '_type': 'url',
                      'url': video_id,
                      'ie_key': 'Youtube',
-                    'id': 'video_id',
+                    'id': video_id,
                      'title': title,
-                })
-
-            # A little optimization - if current page is not
-            # "full", ie. does not contain PAGE_SIZE video ids then
-            # we can assume that this page is the last one - there
-            # are no more ids on further pages - no need to query
-            # again.
-
-            if len(entries) < self._GDATA_PAGE_SIZE:
-                break
+                }
+        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
  
          return self.playlist_result(url_results, playlist_title=username)
  
@@ -1811,7 +1811,10 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
  class YoutubeTruncatedURLIE(InfoExtractor):
      IE_NAME = 'youtube:truncated_url'
      IE_DESC = False  # Do not list
-    _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
+    _VALID_URL = r'''(?x)
+        (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
+        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
+    '''
  
      def _real_extract(self, url):
          raise ExtractorError(