X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=136e4183ec8b89524a37cdeb617cc28a106a6fca;hb=4b879984eacbb5b6d19e2a9e8627953d51caa8b2;hp=b18a0ba6b47e33058d1fa0ce149649345d955171;hpb=4fcca4bb1897ff111a6332480a569d53bc797838;p=youtube-dl.git

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index b18a0ba6b..136e4183e 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -106,18 +106,18 @@ class InfoExtractor(object):
     def IE_NAME(self):
         return type(self).__name__[:-2]
 
-    def _download_webpage(self, url, video_id, note=None, errnote=None):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
         if note is None:
             note = u'Downloading video webpage'
         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
         try:
-            urlh = compat_urllib_request.urlopen(url)
+            urlh = compat_urllib_request.urlopen(url_or_request)
             webpage_bytes = urlh.read()
             return webpage_bytes.decode('utf-8', 'replace')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             if errnote is None:
                 errnote = u'Unable to download webpage'
-            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
+            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 
 
 class YoutubeIE(InfoExtractor):
@@ -412,7 +412,7 @@ class YoutubeIE(InfoExtractor):
 
         # uploader_id
         video_uploader_id = None
-        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
+        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
         if mobj is not None:
             video_uploader_id = mobj.group(1)
         else:
@@ -675,10 +675,6 @@ class DailymotionIE(InfoExtractor):
     def __init__(self, downloader=None):
         InfoExtractor.__init__(self, downloader)
 
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
-
     def report_extraction(self, video_id):
         """Report information extraction."""
         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
@@ -697,13 +693,7 @@ class DailymotionIE(InfoExtractor):
         # Retrieve video webpage to extract further information
         request = compat_urllib_request.Request(url)
         request.add_header('Cookie', 'family_filter=off')
-        try:
-            self.report_download_webpage(video_id)
-            webpage_bytes = compat_urllib_request.urlopen(request).read()
-            webpage = webpage_bytes.decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
-            return
+        webpage = self._download_webpage(request, video_id)
 
         # Extract URL, uploader and title from webpage
         self.report_extraction(video_id)
@@ -2343,7 +2333,6 @@ class ComedyCentralIE(InfoExtractor):
                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                      $"""
-    IE_NAME = u'comedycentral'
 
     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
 
@@ -2371,16 +2360,12 @@ class ComedyCentralIE(InfoExtractor):
     def report_extraction(self, episode_id):
         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
 
-    def report_config_download(self, episode_id):
-        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
+    def report_config_download(self, episode_id, media_id):
+        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
 
     def report_index_download(self, episode_id):
         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
 
-    def report_player_url(self, episode_id):
-        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
-
-
     def _print_formats(self, formats):
         print('Available formats:')
         for x in formats:
@@ -2419,6 +2404,7 @@ class ComedyCentralIE(InfoExtractor):
         try:
             htmlHandle = compat_urllib_request.urlopen(req)
             html = htmlHandle.read()
+            webpage = html.decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
             return
@@ -2433,29 +2419,20 @@ class ComedyCentralIE(InfoExtractor):
                 return
             epTitle = mobj.group('episode')
 
-        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
+        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
 
         if len(mMovieParams) == 0:
             # The Colbert Report embeds the information in a without
             # a URL prefix; so extract the alternate reference
             # and then add the URL prefix manually.
 
-            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
+            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
             if len(altMovieParams) == 0:
                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                 return
             else:
                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
 
-        playerUrl_raw = mMovieParams[0][0]
-        self.report_player_url(epTitle)
-        try:
-            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
-            playerUrl = urlHandle.geturl()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
-            return
-
         uri = mMovieParams[0][1]
         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
         self.report_index_download(epTitle)
@@ -2469,7 +2446,7 @@ class ComedyCentralIE(InfoExtractor):
 
         idoc = xml.etree.ElementTree.fromstring(indexXml)
         itemEls = idoc.findall('.//item')
-        for itemEl in itemEls:
+        for partNum,itemEl in enumerate(itemEls):
             mediaId = itemEl.findall('./guid')[0].text
             shortMediaId = mediaId.split(':')[-1]
             showId = mediaId.split(':')[-2].replace('.com', '')
@@ -2479,7 +2456,7 @@ class ComedyCentralIE(InfoExtractor):
             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
             configReq = compat_urllib_request.Request(configUrl)
-            self.report_config_download(epTitle)
+            self.report_config_download(epTitle, shortMediaId)
             try:
                 configXml = compat_urllib_request.urlopen(configReq).read()
             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -2501,7 +2478,7 @@ class ComedyCentralIE(InfoExtractor):
                 return
 
             # For now, just pick the highest bitrate
-            format,video_url = turls[-1]
+            format,rtmp_video_url = turls[-1]
 
             # Get the format arg from the arg stream
             req_format = self._downloader.params.get('format', None)
@@ -2509,18 +2486,16 @@ class ComedyCentralIE(InfoExtractor):
             # Select format if we can find one
             for f,v in turls:
                 if f == req_format:
-                    format, video_url = f, v
+                    format, rtmp_video_url = f, v
                     break
 
-            # Patch to download from alternative CDN, which does not
-            # break on current RTMPDump builds
-            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
-            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
-
-            if video_url.startswith(broken_cdn):
-                video_url = video_url.replace(broken_cdn, better_cdn)
+            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
+            if not m:
+                raise ExtractorError(u'Cannot transform RTMP url')
+            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
+            video_url = base + m.group('finalid')
 
-            effTitle = showId + u'-' + epTitle
+            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
             info = {
                 'id': shortMediaId,
                 'url': video_url,
@@ -2531,9 +2506,7 @@ class ComedyCentralIE(InfoExtractor):
                 'format': format,
                 'thumbnail': None,
                 'description': officialTitle,
-                'player_url': None #playerUrl
             }
-
             results.append(info)
 
         return results
@@ -2613,7 +2586,6 @@ class EscapistIE(InfoExtractor):
 
         return [info]
 
-
 class CollegeHumorIE(InfoExtractor):
     """Information extractor for collegehumor.com"""
 
@@ -2821,9 +2793,7 @@ class SoundcloudIE(InfoExtractor):
 
 class InfoQIE(InfoExtractor):
     """Information extractor for infoq.com"""
-
     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
-    IE_NAME = u'infoq'
 
     def report_extraction(self, video_id):
         """Report information extraction."""
@@ -3706,8 +3676,8 @@ class SteamIE(InfoExtractor):
         videourl = 'http://store.steampowered.com/video/%s/' % gameID
         webpage = self._download_webpage(videourl, gameID)
         mweb = re.finditer(urlRE, webpage)
-        namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
-        titles = list(re.finditer(namesRE, webpage))
+        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
+        titles = re.finditer(namesRE, webpage)
         videos = []
         for vid,vtitle in zip(mweb,titles):
             video_id = vid.group('videoID')
@@ -3719,7 +3689,7 @@ class SteamIE(InfoExtractor):
                 'id':video_id,
                 'url':video_url,
                 'ext': 'flv',
-                'title': title
+                'title': unescapeHTML(title)
                   }
             videos.append(info)
         return videos