youtube_dl/extractor/photobucket.py

   1 import datetime
   2 import json
   3 import re
   4
   5 from .common import InfoExtractor
   6
   7 from ..utils import (
   8     ExtractorError,
   9 )
  10
  11 class PhotobucketIE(InfoExtractor):
  12     """Information extractor for photobucket.com."""
  13
  14     # TODO: the original _VALID_URL was:
  15     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
  16     # Check if it's necessary to keep the old extracion process
  17     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
  18     IE_NAME = u'photobucket'
  19
  20     def _real_extract(self, url):
  21         # Extract id from URL
  22         mobj = re.match(self._VALID_URL, url)
  23         if mobj is None:
  24             raise ExtractorError(u'Invalid URL: %s' % url)
  25
  26         video_id = mobj.group('id')
  27
  28         video_extension = mobj.group('ext')
  29
  30         # Retrieve video webpage to extract further information
  31         webpage = self._download_webpage(url, video_id)
  32
  33         # Extract URL, uploader, and title from webpage
  34         self.report_extraction(video_id)
  35         # We try first by looking the javascript code:
  36         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
  37         if mobj is not None:
  38             info = json.loads(mobj.group('json'))
  39             return [{
  40                 'id':       video_id,
  41                 'url':      info[u'downloadUrl'],
  42                 'uploader': info[u'username'],
  43                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
  44                 'title':    info[u'title'],
  45                 'ext':      video_extension,
  46                 'thumbnail': info[u'thumbUrl'],
  47             }]
  48
  49         # We try looking in other parts of the webpage
  50         video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
  51             webpage, u'video URL')
  52
  53         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
  54         if mobj is None:
  55             raise ExtractorError(u'Unable to extract title')
  56         video_title = mobj.group(1).decode('utf-8')
  57         video_uploader = mobj.group(2).decode('utf-8')
  58
  59         return [{
  60             'id':       video_id.decode('utf-8'),
  61             'url':      video_url.decode('utf-8'),
  62             'uploader': video_uploader,
  63             'upload_date':  None,
  64             'title':    video_title,
  65             'ext':      video_extension.decode('utf-8'),
  66         }]