extractor/gorillavid.py

   1 # -*- coding: utf-8 -*-
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..utils import (
   8     ExtractorError,
   9     determine_ext,
  10     compat_urllib_parse,
  11     compat_urllib_request,
  12     int_or_none,
  13 )
  14
  15
  16 class GorillaVidIE(InfoExtractor):
  17     IE_DESC = 'GorillaVid.in, daclips.in, movpod.in and fastvideo.in'
  18     _VALID_URL = r'''(?x)
  19         https?://(?P<host>(?:www\.)?
  20             (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in))/
  21         (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
  22     '''
  23
  24     _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<'
  25
  26     _TESTS = [{
  27         'url': 'http://gorillavid.in/06y9juieqpmi',
  28         'md5': '5ae4a3580620380619678ee4875893ba',
  29         'info_dict': {
  30             'id': '06y9juieqpmi',
  31             'ext': 'flv',
  32             'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ',
  33             'thumbnail': 're:http://.*\.jpg',
  34         },
  35     }, {
  36         'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html',
  37         'md5': 'c9e293ca74d46cad638e199c3f3fe604',
  38         'info_dict': {
  39             'id': 'z08zf8le23c6',
  40             'ext': 'mp4',
  41             'title': 'Say something nice',
  42             'thumbnail': 're:http://.*\.jpg',
  43         },
  44     }, {
  45         'url': 'http://daclips.in/3rso4kdn6f9m',
  46         'md5': '1ad8fd39bb976eeb66004d3a4895f106',
  47         'info_dict': {
  48             'id': '3rso4kdn6f9m',
  49             'ext': 'mp4',
  50             'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc',
  51             'thumbnail': 're:http://.*\.jpg',
  52         }
  53     }, {
  54         # video with countdown timeout
  55         'url': 'http://fastvideo.in/1qmdn1lmsmbw',
  56         'md5': '8b87ec3f6564a3108a0e8e66594842ba',
  57         'info_dict': {
  58             'id': '1qmdn1lmsmbw',
  59             'ext': 'mp4',
  60             'title': 'Man of Steel - Trailer',
  61             'thumbnail': 're:http://.*\.jpg',
  62         },
  63     }, {
  64         'url': 'http://movpod.in/0wguyyxi1yca',
  65         'only_matching': True,
  66     }]
  67
  68     def _real_extract(self, url):
  69         mobj = re.match(self._VALID_URL, url)
  70         video_id = mobj.group('id')
  71
  72         webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id)
  73
  74         if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
  75             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
  76
  77         fields = dict(re.findall(r'''(?x)<input\s+
  78             type="hidden"\s+
  79             name="([^"]+)"\s+
  80             (?:id="[^"]+"\s+)?
  81             value="([^"]*)"
  82             ''', webpage))
  83
  84         if fields['op'] == 'download1':
  85             countdown = int_or_none(self._search_regex(
  86                 r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>',
  87                 webpage, 'countdown', default=None))
  88             if countdown:
  89                 self._sleep(countdown, video_id)
  90
  91             post = compat_urllib_parse.urlencode(fields)
  92
  93             req = compat_urllib_request.Request(url, post)
  94             req.add_header('Content-type', 'application/x-www-form-urlencoded')
  95
  96             webpage = self._download_webpage(req, video_id, 'Downloading video page')
  97
  98         title = self._search_regex(
  99             r'style="z-index: [0-9]+;">([^<]+)</span>',
 100             webpage, 'title', default=None) or self._og_search_title(webpage)
 101         video_url = self._search_regex(
 102             r'file\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'file url')
 103         thumbnail = self._search_regex(
 104             r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', fatal=False)
 105
 106         formats = [{
 107             'format_id': 'sd',
 108             'url': video_url,
 109             'ext': determine_ext(video_url),
 110             'quality': 1,
 111         }]
 112
 113         return {
 114             'id': video_id,
 115             'title': title,
 116             'thumbnail': thumbnail,
 117             'formats': formats,
 118         }