youtube_dl/extractor/chirbit.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..utils import clean_html
   8
   9
  10 class ChirbitIE(InfoExtractor):
  11     _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P<id>[^/]+)'
  12     _TEST = {
  13         'url': 'http://chirb.it/PrIPv5',
  14         'md5': '9847b0dad6ac3e074568bf2cfb197de8',
  15         'info_dict': {
  16             'id': 'PrIPv5',
  17             'display_id': 'kukushtv_1423231243',
  18             'ext': 'mp3',
  19             'title': 'Фасадстрой',
  20             'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3'
  21         }
  22     }
  23
  24     def _real_extract(self, url):
  25         audio_linkid = self._match_id(url)
  26         webpage = self._download_webpage(url, audio_linkid)
  27
  28         audio_title = self._html_search_regex(r'<h2\s+itemprop="name">(.*?)</h2>', webpage, 'title')
  29         audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID')
  30         audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3';
  31
  32         return {
  33             'id': audio_linkid,
  34             'display_id': audio_id,
  35             'title': audio_title,
  36             'url': audio_url
  37         }
  38
  39 class ChirbitProfileIE(InfoExtractor):
  40     _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P<id>[^/]+)'
  41     _TEST = {
  42         'url': 'http://chirbit.com/ScarletBeauty',
  43         'playlist_count': 3,
  44         'info_dict': {
  45             '_type': 'playlist',
  46             'title': 'ScarletBeauty',
  47             'id': 'ScarletBeauty'
  48         }
  49     }
  50
  51     def _real_extract(self, url):
  52         profile_id = self._match_id(url)
  53
  54         # Chirbit has a pretty weird "Last Page" navigation behavior.
  55         # We grab the profile's oldest entry to determine when to
  56         # stop fetching entries.
  57         oldestpage = self._download_webpage(url + '/24599', profile_id)
  58         oldest_page_entries = re.findall(
  59             r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''',
  60             oldestpage);
  61         oldestentry = clean_html(oldest_page_entries[-1]);
  62
  63         ids = []
  64         titles = []
  65         n = 0
  66         while True:
  67             page = self._download_webpage(url + '/' + str(n), profile_id)
  68             page_ids = re.findall(
  69                 r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''',
  70                 page);
  71             page_titles = re.findall(
  72                 r'''<div\s+class="chirbit_title"\s*>(.*?)</div>''',
  73                 page);
  74             ids += page_ids
  75             titles += page_titles
  76             if oldestentry in page_ids:
  77                 break
  78             n += 1
  79
  80         entries = []
  81         i = 0
  82         for id in ids:
  83             entries.append({
  84                 'id': id,
  85                 'title': titles[i],
  86                 'url': 'http://audio.chirbit.com/' + id + '.mp3'
  87             });
  88             i += 1
  89
  90         info_dict = {
  91             '_type': 'playlist',
  92             'id': profile_id,
  93             'title': profile_id,
  94             'entries': entries
  95         }
  96
  97         return info_dict;