youtube_dl/extractor/bandcamp.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import random
   5 import re
   6 import time
   7
   8 from .common import InfoExtractor
   9 from ..compat import (
  10     compat_str,
  11     compat_urlparse,
  12 )
  13 from ..utils import (
  14     ExtractorError,
  15     float_or_none,
  16     int_or_none,
  17     KNOWN_EXTENSIONS,
  18     parse_filesize,
  19     unescapeHTML,
  20     update_url_query,
  21     unified_strdate,
  22 )
  23
  24
  25 class BandcampIE(InfoExtractor):
  26     _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
  27     _TESTS = [{
  28         'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
  29         'md5': 'c557841d5e50261777a6585648adf439',
  30         'info_dict': {
  31             'id': '1812978515',
  32             'ext': 'mp3',
  33             'title': "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
  34             'duration': 9.8485,
  35         },
  36         '_skip': 'There is a limit of 200 free downloads / month for the test song'
  37     }, {
  38         'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
  39         'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
  40         'info_dict': {
  41             'id': '2650410135',
  42             'ext': 'aiff',
  43             'title': 'Ben Prunty - Lanius (Battle)',
  44             'uploader': 'Ben Prunty',
  45         },
  46     }]
  47
  48     def _real_extract(self, url):
  49         mobj = re.match(self._VALID_URL, url)
  50         title = mobj.group('title')
  51         webpage = self._download_webpage(url, title)
  52         thumbnail = self._html_search_meta('og:image', webpage, default=None)
  53         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
  54         if not m_download:
  55             m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
  56             if m_trackinfo:
  57                 json_code = m_trackinfo.group(1)
  58                 data = json.loads(json_code)[0]
  59                 track_id = compat_str(data['id'])
  60
  61                 if not data.get('file'):
  62                     raise ExtractorError('Not streamable', video_id=track_id, expected=True)
  63
  64                 formats = []
  65                 for format_id, format_url in data['file'].items():
  66                     ext, abr_str = format_id.split('-', 1)
  67                     formats.append({
  68                         'format_id': format_id,
  69                         'url': self._proto_relative_url(format_url, 'http:'),
  70                         'ext': ext,
  71                         'vcodec': 'none',
  72                         'acodec': ext,
  73                         'abr': int_or_none(abr_str),
  74                     })
  75
  76                 self._sort_formats(formats)
  77
  78                 return {
  79                     'id': track_id,
  80                     'title': data['title'],
  81                     'thumbnail': thumbnail,
  82                     'formats': formats,
  83                     'duration': float_or_none(data.get('duration')),
  84                 }
  85             else:
  86                 raise ExtractorError('No free songs found')
  87
  88         download_link = m_download.group(1)
  89         video_id = self._search_regex(
  90             r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
  91             webpage, 'video id')
  92
  93         download_webpage = self._download_webpage(
  94             download_link, video_id, 'Downloading free downloads page')
  95
  96         blob = self._parse_json(
  97             self._search_regex(
  98                 r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
  99                 'blob', group='blob'),
 100             video_id, transform_source=unescapeHTML)
 101
 102         info = blob['digital_items'][0]
 103
 104         downloads = info['downloads']
 105         track = info['title']
 106
 107         artist = info.get('artist')
 108         title = '%s - %s' % (artist, track) if artist else track
 109
 110         download_formats = {}
 111         for f in blob['download_formats']:
 112             name, ext = f.get('name'), f.get('file_extension')
 113             if all(isinstance(x, compat_str) for x in (name, ext)):
 114                 download_formats[name] = ext.strip('.')
 115
 116         formats = []
 117         for format_id, f in downloads.items():
 118             format_url = f.get('url')
 119             if not format_url:
 120                 continue
 121             # Stat URL generation algorithm is reverse engineered from
 122             # download_*_bundle_*.js
 123             stat_url = update_url_query(
 124                 format_url.replace('/download/', '/statdownload/'), {
 125                     '.rand': int(time.time() * 1000 * random.random()),
 126                 })
 127             format_id = f.get('encoding_name') or format_id
 128             stat = self._download_json(
 129                 stat_url, video_id, 'Downloading %s JSON' % format_id,
 130                 transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
 131                 fatal=False)
 132             if not stat:
 133                 continue
 134             retry_url = stat.get('retry_url')
 135             if not isinstance(retry_url, compat_str):
 136                 continue
 137             formats.append({
 138                 'url': self._proto_relative_url(retry_url, 'http:'),
 139                 'ext': download_formats.get(format_id),
 140                 'format_id': format_id,
 141                 'format_note': f.get('description'),
 142                 'filesize': parse_filesize(f.get('size_mb')),
 143                 'vcodec': 'none',
 144             })
 145         self._sort_formats(formats)
 146
 147         return {
 148             'id': video_id,
 149             'title': title,
 150             'thumbnail': info.get('thumb_url') or thumbnail,
 151             'uploader': info.get('artist'),
 152             'artist': artist,
 153             'track': track,
 154             'formats': formats,
 155         }
 156
 157
 158 class BandcampAlbumIE(InfoExtractor):
 159     IE_NAME = 'Bandcamp:album'
 160     _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
 161
 162     _TESTS = [{
 163         'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
 164         'playlist': [
 165             {
 166                 'md5': '39bc1eded3476e927c724321ddf116cf',
 167                 'info_dict': {
 168                     'id': '1353101989',
 169                     'ext': 'mp3',
 170                     'title': 'Intro',
 171                 }
 172             },
 173             {
 174                 'md5': '1a2c32e2691474643e912cc6cd4bffaa',
 175                 'info_dict': {
 176                     'id': '38097443',
 177                     'ext': 'mp3',
 178                     'title': 'Kero One - Keep It Alive (Blazo remix)',
 179                 }
 180             },
 181         ],
 182         'info_dict': {
 183             'title': 'Jazz Format Mixtape vol.1',
 184             'id': 'jazz-format-mixtape-vol-1',
 185             'uploader_id': 'blazo',
 186         },
 187         'params': {
 188             'playlistend': 2
 189         },
 190         'skip': 'Bandcamp imposes download limits.'
 191     }, {
 192         'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
 193         'info_dict': {
 194             'title': 'Hierophany of the Open Grave',
 195             'uploader_id': 'nightbringer',
 196             'id': 'hierophany-of-the-open-grave',
 197         },
 198         'playlist_mincount': 9,
 199     }, {
 200         'url': 'http://dotscale.bandcamp.com',
 201         'info_dict': {
 202             'title': 'Loom',
 203             'id': 'dotscale',
 204             'uploader_id': 'dotscale',
 205         },
 206         'playlist_mincount': 7,
 207     }, {
 208         # with escaped quote in title
 209         'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
 210         'info_dict': {
 211             'title': '"Entropy" EP',
 212             'uploader_id': 'jstrecords',
 213             'id': 'entropy-ep',
 214         },
 215         'playlist_mincount': 3,
 216     }, {
 217         # not all tracks have songs
 218         'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
 219         'info_dict': {
 220             'id': 'we-are-the-plague',
 221             'title': 'WE ARE THE PLAGUE',
 222             'uploader_id': 'insulters',
 223         },
 224         'playlist_count': 2,
 225     }]
 226
 227     @classmethod
 228     def suitable(cls, url):
 229         return (False
 230                 if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url)
 231                 else super(BandcampAlbumIE, cls).suitable(url))
 232
 233     def _real_extract(self, url):
 234         mobj = re.match(self._VALID_URL, url)
 235         uploader_id = mobj.group('subdomain')
 236         album_id = mobj.group('album_id')
 237         playlist_id = album_id or uploader_id
 238         webpage = self._download_webpage(url, playlist_id)
 239         track_elements = re.findall(
 240             r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
 241         if not track_elements:
 242             raise ExtractorError('The page doesn\'t contain any tracks')
 243         # Only tracks with duration info have songs
 244         entries = [
 245             self.url_result(
 246                 compat_urlparse.urljoin(url, t_path),
 247                 ie=BandcampIE.ie_key(),
 248                 video_title=self._search_regex(
 249                     r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
 250                     elem_content, 'track title', fatal=False))
 251             for elem_content, t_path in track_elements
 252             if self._html_search_meta('duration', elem_content, default=None)]
 253
 254         title = self._html_search_regex(
 255             r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
 256             webpage, 'title', fatal=False)
 257         if title:
 258             title = title.replace(r'\"', '"')
 259         return {
 260             '_type': 'playlist',
 261             'uploader_id': uploader_id,
 262             'id': playlist_id,
 263             'title': title,
 264             'entries': entries,
 265         }
 266
 267
 268 class BandcampWeeklyIE(InfoExtractor):
 269     IE_NAME = 'Bandcamp:weekly'
 270     _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
 271     _TESTS = [{
 272         'url': 'https://bandcamp.com/?show=224',
 273         'md5': 'b00df799c733cf7e0c567ed187dea0fd',
 274         'info_dict': {
 275             'id': '224',
 276             'ext': 'opus',
 277             'title': 'BC Weekly April 4th 2017 - Magic Moments',
 278             'description': 'md5:5d48150916e8e02d030623a48512c874',
 279             'duration': 5829.77,
 280             'release_date': '20170404',
 281             'series': 'Bandcamp Weekly',
 282             'episode': 'Magic Moments',
 283             'episode_number': 208,
 284             'episode_id': '224',
 285         }
 286     }, {
 287         'url': 'https://bandcamp.com/?blah/blah@&show=228',
 288         'only_matching': True
 289     }]
 290
 291     def _real_extract(self, url):
 292         video_id = self._match_id(url)
 293         webpage = self._download_webpage(url, video_id)
 294
 295         blob = self._parse_json(
 296             self._search_regex(
 297                 r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
 298                 'blob', group='blob'),
 299             video_id, transform_source=unescapeHTML)
 300
 301         show = blob['bcw_show']
 302
 303         # This is desired because any invalid show id redirects to `bandcamp.com`
 304         # which happens to expose the latest Bandcamp Weekly episode.
 305         show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
 306
 307         formats = []
 308         for format_id, format_url in show['audio_stream'].items():
 309             if not isinstance(format_url, compat_str):
 310                 continue
 311             for known_ext in KNOWN_EXTENSIONS:
 312                 if known_ext in format_id:
 313                     ext = known_ext
 314                     break
 315             else:
 316                 ext = None
 317             formats.append({
 318                 'format_id': format_id,
 319                 'url': format_url,
 320                 'ext': ext,
 321                 'vcodec': 'none',
 322             })
 323         self._sort_formats(formats)
 324
 325         title = show.get('audio_title') or 'Bandcamp Weekly'
 326         subtitle = show.get('subtitle')
 327         if subtitle:
 328             title += ' - %s' % subtitle
 329
 330         episode_number = None
 331         seq = blob.get('bcw_seq')
 332
 333         if seq and isinstance(seq, list):
 334             try:
 335                 episode_number = next(
 336                     int_or_none(e.get('episode_number'))
 337                     for e in seq
 338                     if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
 339             except StopIteration:
 340                 pass
 341
 342         return {
 343             'id': video_id,
 344             'title': title,
 345             'description': show.get('desc') or show.get('short_desc'),
 346             'duration': float_or_none(show.get('audio_duration')),
 347             'is_live': False,
 348             'release_date': unified_strdate(show.get('published_date')),
 349             'series': 'Bandcamp Weekly',
 350             'episode': show.get('subtitle'),
 351             'episode_number': episode_number,
 352             'episode_id': compat_str(video_id),
 353             'formats': formats
 354         }