youtube_dl/extractor/chirbit.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import compat_b64decode
   8 from ..utils import parse_duration
   9
  10
  11 class ChirbitIE(InfoExtractor):
  12     IE_NAME = 'chirbit'
  13     _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
  14     _TESTS = [{
  15         'url': 'http://chirb.it/be2abG',
  16         'info_dict': {
  17             'id': 'be2abG',
  18             'ext': 'mp3',
  19             'title': 'md5:f542ea253f5255240be4da375c6a5d7e',
  20             'description': 'md5:f24a4e22a71763e32da5fed59e47c770',
  21             'duration': 306,
  22             'uploader': 'Gerryaudio',
  23         },
  24         'params': {
  25             'skip_download': True,
  26         }
  27     }, {
  28         'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
  29         'only_matching': True,
  30     }, {
  31         'url': 'https://chirb.it/wp/MN58c2',
  32         'only_matching': True,
  33     }]
  34
  35     def _real_extract(self, url):
  36         audio_id = self._match_id(url)
  37
  38         webpage = self._download_webpage(
  39             'http://chirb.it/%s' % audio_id, audio_id)
  40
  41         data_fd = self._search_regex(
  42             r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1',
  43             webpage, 'data fd', group='url')
  44
  45         # Reverse engineered from https://chirb.it/js/chirbit.player.js (look
  46         # for soundURL)
  47         audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8')
  48
  49         title = self._search_regex(
  50             r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title')
  51         description = self._search_regex(
  52             r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>',
  53             webpage, 'description', default=None)
  54         duration = parse_duration(self._search_regex(
  55             r'class=["\']c-length["\'][^>]*>([^<]+)',
  56             webpage, 'duration', fatal=False))
  57         uploader = self._search_regex(
  58             r'id=["\']chirbit-username["\'][^>]*>([^<]+)',
  59             webpage, 'uploader', fatal=False)
  60
  61         return {
  62             'id': audio_id,
  63             'url': audio_url,
  64             'title': title,
  65             'description': description,
  66             'duration': duration,
  67             'uploader': uploader,
  68         }
  69
  70
  71 class ChirbitProfileIE(InfoExtractor):
  72     IE_NAME = 'chirbit:profile'
  73     _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P<id>[^/]+)'
  74     _TEST = {
  75         'url': 'http://chirbit.com/ScarletBeauty',
  76         'info_dict': {
  77             'id': 'ScarletBeauty',
  78         },
  79         'playlist_mincount': 3,
  80     }
  81
  82     def _real_extract(self, url):
  83         profile_id = self._match_id(url)
  84
  85         webpage = self._download_webpage(url, profile_id)
  86
  87         entries = [
  88             self.url_result(self._proto_relative_url('//chirb.it/' + video_id))
  89             for _, video_id in re.findall(r'<input[^>]+id=([\'"])copy-btn-(?P<id>[0-9a-zA-Z]+)\1', webpage)]
  90
  91         return self.playlist_result(entries, profile_id)