Improve rtmpdump support

[youtube-dl.git] / youtube-dl
diff --git a/youtube-dl b/youtube-dl

index 7262ce94d1f94d6a4cb8fe822b272f99db06b57d..6b56f3f98f6b0e03c7d3b623ad3ff99591c2004c 100755 (executable)
--- a/youtube-dl
+++ b/youtube-dl
@@ -2,6 +2,7 @@
  # -*- coding: utf-8 -*-
  # Author: Ricardo Garcia Gonzalez
  # Author: Danny Colligan
+# Author: Benjamin Johnson
  # License: Public domain code
  import htmlentitydefs
  import httplib
@@ -13,11 +14,18 @@ import os.path
  import re
  import socket
  import string
+import subprocess
  import sys
  import time
  import urllib
  import urllib2
  
+# parse_qs was copied from the cgi module into urlparse in Python 2.6; fall back to cgi on older versions.
+try:
+       from urlparse import parse_qs
+except ImportError:
+       from cgi import parse_qs
+
  std_headers = {
         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
@@ -308,10 +316,12 @@ class FileDownloader(object):
                 """Process a single dictionary returned by an InfoExtractor."""
                 # Do nothing else if in simulate mode
                 if self.params.get('simulate', False):
-                       try:
-                               info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
-                       except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
-                               raise UnavailableFormatError
+                       # Verify URL if it's an HTTP one
+                       if info_dict['url'].startswith('http'):
+                               try:
+                                       info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
+                               except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
+                                       raise UnavailableFormatError
  
                         # Forced printings
                         if self.params.get('forcetitle', False):
@@ -390,7 +400,37 @@ class FileDownloader(object):
                         if info is None:
                                 break
         
+       def _download_with_rtmpdump(self, filename, url):
+               """Download an RTMP stream to filename using the external rtmpdump program.
+
+               Returns True if rtmpdump finally exits with code 0, and False otherwise
+               (including when rtmpdump cannot be run at all).
+               """
+               self.report_destination(filename)
+
+               # Check for rtmpdump first
+               try:
+                       subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
+               except (OSError, IOError):
+                       self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
+                       return False
+
+               # Download using rtmpdump. rtmpdump returns exit code 2 when
+               # the connection was interrupted and resuming appears to be
+               # possible. This is part of rtmpdump's normal usage, AFAIK.
+               basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
+               retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
+               while retval == 2 or retval == 1:
+                       prevsize = os.path.getsize(filename)
+                       self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
+                       time.sleep(2.0) # This seems to be needed
+                       retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
+                       # A retry that fails (code 1) without adding any data will
+                       # never make progress; stop instead of looping forever.
+                       if retval == 1 and os.path.getsize(filename) == prevsize:
+                               break
+               if retval == 0:
+                       self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
+                       return True
+               else:
+                       self.trouble('ERROR: rtmpdump exited with code %d' % retval)
+                       return False
+
         def _do_download(self, filename, url):
+               # Attempt to download using rtmpdump
+               if url.startswith('rtmp'):
+                       return self._download_with_rtmpdump(filename, url)
+
                 stream = None
                 open_mode = 'wb'
                 basic_request = urllib2.Request(url, None, std_headers)
@@ -596,6 +636,10 @@ class YoutubeIE(InfoExtractor):
                 """Report extracted video URL."""
                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
         
+       def report_rtmp_download(self):
+               """Report that the video will be downloaded using the RTMP protocol."""
+               self._downloader.to_stdout(u'[youtube] RTMP download detected')
+       
         def _real_initialize(self):
                 if self._downloader is None:
                         return
@@ -694,43 +738,45 @@ class YoutubeIE(InfoExtractor):
                         try:
                                 self.report_video_info_webpage_download(video_id)
                                 video_info_webpage = urllib2.urlopen(request).read()
+                               video_info = parse_qs(video_info_webpage)
                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                                 return
                         self.report_information_extraction(video_id)
  
                         # "t" param
-                       mobj = re.search(r'(?m)&token=([^&]+)(?:&|$)', video_info_webpage)
-                       if mobj is None:
+                       if 'token' not in video_info:
                                 # Attempt to see if YouTube has issued an error message
-                               mobj = re.search(r'(?m)&reason=([^&]+)(?:&|$)', video_info_webpage)
-                               if mobj is None:
+                               if 'reason' not in video_info:
                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
                                         stream.write(video_info_webpage)
                                         stream.close()
                                 else:
-                                       reason = urllib.unquote_plus(mobj.group(1))
+                                       reason = urllib.unquote_plus(video_info['reason'][0])
                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
                                 return
-                       token = urllib.unquote(mobj.group(1))
+                       token = urllib.unquote_plus(video_info['token'][0])
                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
                         if format_param is not None:
                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
  
+                       # Check possible RTMP download
+                       if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
+                               self.report_rtmp_download()
+                               video_real_url = video_info['conn'][0]
+
                         # uploader
-                       mobj = re.search(r'(?m)&author=([^&]+)(?:&|$)', video_info_webpage)
-                       if mobj is None:
+                       if 'author' not in video_info:
                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                                 return
-                       video_uploader = urllib.unquote(mobj.group(1))
+                       video_uploader = urllib.unquote_plus(video_info['author'][0])
  
                         # title
-                       mobj = re.search(r'(?m)&title=([^&]*)(?:&|$)', video_info_webpage)
-                       if mobj is None:
+                       if 'title' not in video_info:
                                 self._downloader.trouble(u'ERROR: unable to extract video title')
                                 return
-                       video_title = urllib.unquote_plus(mobj.group(1))
+                       video_title = urllib.unquote_plus(video_info['title'][0])
                         video_title = video_title.decode('utf-8')
                         video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
                         video_title = video_title.replace(os.sep, u'%')
@@ -894,6 +940,159 @@ class MetacafeIE(InfoExtractor):
                         self._downloader.trouble(u'ERROR: format not available for video')
  
  
+class GoogleIE(InfoExtractor):
+       """Information extractor for video.google.com."""
+
+       _VALID_URL = r'(?:http://)?video\.google\.com/videoplay\?docid=([^\&]+).*'
+
+       def __init__(self, downloader=None):
+               InfoExtractor.__init__(self, downloader)
+
+       @staticmethod
+       def suitable(url):
+               """Return True if url matches the Google Video URL pattern."""
+               return (re.match(GoogleIE._VALID_URL, url) is not None)
+
+       def report_download_webpage(self, video_id):
+               """Report webpage download."""
+               self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
+
+       def report_extraction(self, video_id):
+               """Report information extraction."""
+               self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
+
+       def _real_initialize(self):
+               """Nothing needs to be set up before extraction."""
+               return
+
+       def _real_extract(self, url):
+               """Download the video page for url, extract the media URL and title, and pass them to the downloader."""
+               # Extract id from URL
+               mobj = re.match(self._VALID_URL, url)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+                       return
+
+               video_id = mobj.group(1)
+
+               video_extension = 'mp4'
+
+               # Retrieve video webpage to extract further information
+               request = urllib2.Request('http://video.google.com/videoplay?docid=%s' % video_id)
+               try:
+                       self.report_download_webpage(video_id)
+                       webpage = urllib2.urlopen(request).read()
+               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                       self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+                       return
+
+               # Extract URL, uploader, and title from webpage
+               self.report_extraction(video_id)
+               # Stop at the first closing quote so later quoted values on the
+               # same line are not swallowed into the URL.
+               mobj = re.search(r"download_url:'([^']+)'", webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract media URL')
+                       return
+               mediaURL = urllib.unquote(mobj.group(1))
+               # The page escapes '=' and '&' as literal \x3d and \x26 sequences
+               mediaURL = mediaURL.replace('\\x3d', '\x3d')
+               mediaURL = mediaURL.replace('\\x26', '\x26')
+
+               video_url = mediaURL
+
+               mobj = re.search(r'<title>(.*)</title>', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract title')
+                       return
+               video_title = mobj.group(1).decode('utf-8')
+
+               # Google Video doesn't show uploader nicknames?
+               video_uploader = 'uploader'
+
+               try:
+                       # Process video information. video_title is already unicode;
+                       # decoding it again would raise UnicodeEncodeError on
+                       # non-ASCII titles.
+                       self._downloader.process_info({
+                               'id':           video_id.decode('utf-8'),
+                               'url':          video_url.decode('utf-8'),
+                               'uploader':     video_uploader.decode('utf-8'),
+                               'title':        video_title,
+                               'stitle':       video_title,
+                               'ext':          video_extension.decode('utf-8'),
+                       })
+               except UnavailableFormatError:
+                       self._downloader.trouble(u'ERROR: format not available for video')
+
+
+class PhotobucketIE(InfoExtractor):
+       """Information extractor for photobucket.com."""
+
+       _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
+
+       def __init__(self, downloader=None):
+               InfoExtractor.__init__(self, downloader)
+
+       @staticmethod
+       def suitable(url):
+               """Return True if url matches the Photobucket video URL pattern."""
+               return (re.match(PhotobucketIE._VALID_URL, url) is not None)
+
+       def report_download_webpage(self, video_id):
+               """Report webpage download."""
+               self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
+
+       def report_extraction(self, video_id):
+               """Report information extraction."""
+               self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
+
+       def _real_initialize(self):
+               """Nothing needs to be set up before extraction."""
+               return
+
+       def _real_extract(self, url):
+               """Download the page at url, extract the media URL, title and uploader, and pass them to the downloader."""
+               # Extract id from URL
+               mobj = re.match(self._VALID_URL, url)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
+                       return
+
+               video_id = mobj.group(1)
+
+               video_extension = 'flv'
+
+               # Retrieve video webpage to extract further information
+               request = urllib2.Request(url)
+               try:
+                       self.report_download_webpage(video_id)
+                       webpage = urllib2.urlopen(request).read()
+               except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                       self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
+                       return
+
+               # Extract URL, uploader, and title from webpage
+               self.report_extraction(video_id)
+               mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract media URL')
+                       return
+               mediaURL = urllib.unquote(mobj.group(1))
+
+               video_url = mediaURL
+
+               mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
+               if mobj is None:
+                       self._downloader.trouble(u'ERROR: unable to extract title')
+                       return
+               video_title = mobj.group(1).decode('utf-8')
+
+               video_uploader = mobj.group(2).decode('utf-8')
+
+               try:
+                       # Process video information. video_title and video_uploader
+                       # are already unicode; decoding them again would raise
+                       # UnicodeEncodeError on non-ASCII text.
+                       self._downloader.process_info({
+                               'id':           video_id.decode('utf-8'),
+                               'url':          video_url.decode('utf-8'),
+                               'uploader':     video_uploader,
+                               'title':        video_title,
+                               'stitle':       video_title,
+                               'ext':          video_extension.decode('utf-8'),
+                       })
+               except UnavailableFormatError:
+                       self._downloader.trouble(u'ERROR: format not available for video')
+
+
  class YoutubeSearchIE(InfoExtractor):
         """Information Extractor for YouTube search queries."""
         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
@@ -1048,7 +1247,7 @@ class YoutubeUserIE(InfoExtractor):
  
         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
-       _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)'
+       _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
         _youtube_ie = None
  
         def __init__(self, youtube_ie, downloader=None):
@@ -1061,7 +1260,7 @@ class YoutubeUserIE(InfoExtractor):
  
         def report_download_page(self, username):
                 """Report attempt to download user page."""
-               self._downloader.to_stdout(u'[youtube] USR %s: Downloading page ' % (username))
+               self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
  
         def _real_initialize(self):
                 self._youtube_ie.initialize()
@@ -1090,7 +1289,6 @@ class YoutubeUserIE(InfoExtractor):
                 ids_in_page = []
  
                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
-                       print mobj.group(1)
                         if mobj.group(1) not in ids_in_page:
                                 ids_in_page.append(mobj.group(1))
                 video_ids.extend(ids_in_page)
@@ -1176,7 +1374,7 @@ if __name__ == '__main__':
                 # Parse command line
                 parser = optparse.OptionParser(
                         usage='Usage: %prog [options] url...',
-                       version='2009.12.26',
+                       version='2010.01.06',
                         conflict_handler='resolve',
                 )
  
@@ -1273,6 +1471,8 @@ if __name__ == '__main__':
                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                 youtube_user_ie = YoutubeUserIE(youtube_ie)
                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
+               google_ie = GoogleIE()
+               photobucket_ie = PhotobucketIE()
  
                 # File downloader
                 fd = FileDownloader({
@@ -1298,6 +1498,8 @@ if __name__ == '__main__':
                 fd.add_info_extractor(youtube_user_ie)
                 fd.add_info_extractor(metacafe_ie)
                 fd.add_info_extractor(youtube_ie)
+               fd.add_info_extractor(google_ie)
+               fd.add_info_extractor(photobucket_ie)
  
                 # Update version
                 if opts.update_self: