youtube_dl/extractor/googledrive.py

   1 from .common import InfoExtractor
   2 from ..utils import RegexNotFoundError
   3
   4 class GoogleDriveIE(InfoExtractor):
   5     _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)'
   6     _TEST = {
   7         'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1',
   8         'info_dict': {
   9             'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U',
  10             'ext': 'mp4',
  11             'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4',
  12         }
  13     }
  14     _formats = {
  15         '5': {'ext': 'flv'},
  16         '6': {'ext': 'flv'},
  17         '13': {'ext': '3gp'},
  18         '17': {'ext': '3gp'},
  19         '18': {'ext': 'mp4'},
  20         '22': {'ext': 'mp4'},
  21         '34': {'ext': 'flv'},
  22         '35': {'ext': 'flv'},
  23         '36': {'ext': '3gp'},
  24         '37': {'ext': 'mp4'},
  25         '38': {'ext': 'mp4'},
  26         '43': {'ext': 'webm'},
  27         '44': {'ext': 'webm'},
  28         '45': {'ext': 'webm'},
  29         '46': {'ext': 'webm'},
  30         '59': {'ext': 'mp4'}
  31     }
  32
  33     def _real_extract(self, url):
  34         video_id = self._match_id(url)
  35         webpage = self._download_webpage(
  36             'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape'
  37         )
  38         try:
  39             title = self._html_search_regex(
  40                 r'"title","(?P<title>.*?)"',
  41                 webpage,
  42                 'title',
  43                 group='title'
  44             )
  45             fmt_stream_map = self._html_search_regex(
  46                 r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"',
  47                 webpage,
  48                 'fmt_stream_map',
  49                 group='fmt_stream_map'
  50             )
  51             fmt_list = self._html_search_regex(
  52                 r'"fmt_list","(?P<fmt_list>.*?)"',
  53                 webpage,
  54                 'fmt_list',
  55                 group='fmt_list'
  56             )
  57 #                       timestamp = self._html_search_regex(
  58 #                               r'"timestamp","(?P<timestamp>.*?)"',
  59 #                               webpage,
  60 #                               'timestamp',
  61 #                               group='timestamp'
  62 #                       )
  63             length_seconds = self._html_search_regex(
  64                 r'"length_seconds","(?P<length_seconds>.*?)"',
  65                 webpage,
  66                 'length_seconds',
  67                 group='length_seconds'
  68             )
  69         except RegexNotFoundError:
  70             try:
  71                 reason = self._html_search_regex(
  72                     r'"reason","(?P<reason>.*?)"',
  73                     webpage,
  74                     'reason',
  75                     group='reason'
  76                 )
  77                 self.report_warning(reason)
  78                 return
  79             except RegexNotFoundError:
  80                 self.report_warning('not a video')
  81                 return
  82
  83         fmt_stream_map = fmt_stream_map.split(',')
  84         fmt_list = fmt_list.split(',')
  85         formats = []
  86         for i in range(len(fmt_stream_map)):
  87             fmt_id, fmt_url = fmt_stream_map[i].split('|')
  88             resolution = fmt_list[i].split('/')[1]
  89             width, height = resolution.split('x')
  90             formats.append({
  91                 'url': fmt_url,
  92                 'format_id': fmt_id,
  93                 'resolution': resolution,
  94                 'width': int(width),
  95                 'height': int(height),
  96                 'ext': self._formats[fmt_id]['ext']
  97             })
  98         self._sort_formats(formats)
  99
 100         return {
 101             'id': video_id,
 102             'title': title,
 103 #           'timestamp': int(timestamp),
 104             'duration': int(length_seconds),
 105             'formats': formats
 106         }