youtube_dl/extractor/googleplus.py

   1 import datetime
   2 import re
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     ExtractorError,
   7 )
   8
   9
  10 class GooglePlusIE(InfoExtractor):
  11     """Information extractor for plus.google.com."""
  12
  13     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
  14     IE_NAME = u'plus.google'
  15
  16     def _real_extract(self, url):
  17         # Extract id from URL
  18         mobj = re.match(self._VALID_URL, url)
  19         if mobj is None:
  20             raise ExtractorError(u'Invalid URL: %s' % url)
  21
  22         post_url = mobj.group(0)
  23         video_id = mobj.group(1)
  24
  25         video_extension = 'flv'
  26
  27         # Step 1, Retrieve post webpage to extract further information
  28         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
  29
  30         self.report_extraction(video_id)
  31
  32         # Extract update date
  33         upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
  34             webpage, u'upload date', fatal=False)
  35         if upload_date:
  36             # Convert timestring to a format suitable for filename
  37             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
  38             upload_date = upload_date.strftime('%Y%m%d')
  39
  40         # Extract uploader
  41         uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
  42             webpage, u'uploader', fatal=False)
  43
  44         # Extract title
  45         # Get the first line for title
  46         video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
  47             webpage, 'title', default=u'NA')
  48
  49         # Step 2, Simulate clicking the image box to launch video
  50         DOMAIN = 'https://plus.google.com'
  51         video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN),
  52             webpage, u'video page URL')
  53         if not video_page.startswith(DOMAIN):
  54             video_page = DOMAIN + video_page
  55
  56         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
  57
  58         # Extract video links on video page
  59         """Extract video links of all sizes"""
  60         pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
  61         mobj = re.findall(pattern, webpage)
  62         if len(mobj) == 0:
  63             raise ExtractorError(u'Unable to extract video links')
  64
  65         # Sort in resolution
  66         links = sorted(mobj)
  67
  68         # Choose the lowest of the sort, i.e. highest resolution
  69         video_url = links[-1]
  70         # Only get the url. The resolution part in the tuple has no use anymore
  71         video_url = video_url[-1]
  72         # Treat escaped \u0026 style hex
  73         try:
  74             video_url = video_url.decode("unicode_escape")
  75         except AttributeError: # Python 3
  76             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
  77
  78
  79         return [{
  80             'id':       video_id,
  81             'url':      video_url,
  82             'uploader': uploader,
  83             'upload_date':  upload_date,
  84             'title':    video_title,
  85             'ext':      video_extension,
  86         }]