youtube_dl/extractor/bandcamp.py

   1 import json
   2 import re
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     compat_str,
   7     compat_urlparse,
   8     ExtractorError,
   9 )
  10
  11
  12 class BandcampIE(InfoExtractor):
  13     IE_NAME = u'Bandcamp'
  14     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
  15     _TESTS = [{
  16         u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
  17         u'file': u'1812978515.mp3',
  18         u'md5': u'cdeb30cdae1921719a3cbcab696ef53c',
  19         u'info_dict': {
  20             u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad"
  21         },
  22         u'skip': u'There is a limit of 200 free downloads / month for the test song'
  23     }]
  24
  25     def _real_extract(self, url):
  26         mobj = re.match(self._VALID_URL, url)
  27         title = mobj.group('title')
  28         webpage = self._download_webpage(url, title)
  29         # We get the link to the free download page
  30         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
  31         if m_download is None:
  32             m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
  33         if m_trackinfo:
  34             json_code = m_trackinfo.group(1)
  35             data = json.loads(json_code)
  36
  37             for d in data:
  38                 formats = [{
  39                     'format_id': 'format_id',
  40                     'url': format_url,
  41                     'ext': format_id.partition('-')[0]
  42                 } for format_id, format_url in sorted(d['file'].items())]
  43                 return {
  44                     'id': compat_str(d['id']),
  45                     'title': d['title'],
  46                     'formats': formats,
  47                 }
  48         else:
  49             raise ExtractorError(u'No free songs found')
  50
  51         download_link = m_download.group(1)
  52         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
  53                        webpage, re.MULTILINE|re.DOTALL).group('id')
  54
  55         download_webpage = self._download_webpage(download_link, id,
  56                                                   'Downloading free downloads page')
  57         # We get the dictionary of the track from some javascrip code
  58         info = re.search(r'items: (.*?),$',
  59                          download_webpage, re.MULTILINE).group(1)
  60         info = json.loads(info)[0]
  61         # We pick mp3-320 for now, until format selection can be easily implemented.
  62         mp3_info = info[u'downloads'][u'mp3-320']
  63         # If we try to use this url it says the link has expired
  64         initial_url = mp3_info[u'url']
  65         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
  66         m_url = re.match(re_url, initial_url)
  67         #We build the url we will use to get the final track url
  68         # This url is build in Bandcamp in the script download_bunde_*.js
  69         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
  70         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
  71         # If we could correctly generate the .rand field the url would be
  72         #in the "download_url" key
  73         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
  74
  75         track_info = {'id':id,
  76                       'title' : info[u'title'],
  77                       'ext' :   'mp3',
  78                       'url' :   final_url,
  79                       'thumbnail' : info[u'thumb_url'],
  80                       'uploader' :  info[u'artist']
  81                       }
  82
  83         return [track_info]
  84
  85
  86 class BandcampAlbumIE(InfoExtractor):
  87     IE_NAME = u'Bandcamp:album'
  88     _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)'
  89
  90     _TEST = {
  91         u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
  92         u'playlist': [
  93             {
  94                 u'file': u'1353101989.mp3',
  95                 u'md5': u'39bc1eded3476e927c724321ddf116cf',
  96                 u'info_dict': {
  97                     u'title': u'Intro',
  98                 }
  99             },
 100             {
 101                 u'file': u'38097443.mp3',
 102                 u'md5': u'1a2c32e2691474643e912cc6cd4bffaa',
 103                 u'info_dict': {
 104                     u'title': u'Kero One - Keep It Alive (Blazo remix)',
 105                 }
 106             },
 107         ],
 108         u'params': {
 109             u'playlistend': 2
 110         },
 111         u'skip': u'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
 112     }
 113
 114     def _real_extract(self, url):
 115         mobj = re.match(self._VALID_URL, url)
 116         title = mobj.group('title')
 117         webpage = self._download_webpage(url, title)
 118         tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
 119         if not tracks_paths:
 120             raise ExtractorError(u'The page doesn\'t contain any track')
 121         entries = [
 122             self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
 123             for t_path in tracks_paths]
 124         title = self._search_regex(r'album_title : "(.*?)"', webpage, u'title')
 125         return {
 126             '_type': 'playlist',
 127             'title': title,
 128             'entries': entries,
 129         }