youtube_dl/extractor/iconosquare.py

   1 from __future__ import unicode_literals
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     int_or_none,
   6     get_element_by_id,
   7     remove_end,
   8 )
   9
  10
  11 class IconosquareIE(InfoExtractor):
  12     _VALID_URL = r'https?://(?:www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
  13     _TEST = {
  14         'url': 'http://statigr.am/p/522207370455279102_24101272',
  15         'md5': '6eb93b882a3ded7c378ee1d6884b1814',
  16         'info_dict': {
  17             'id': '522207370455279102_24101272',
  18             'ext': 'mp4',
  19             'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',
  20             'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',
  21             'timestamp': 1376471991,
  22             'upload_date': '20130814',
  23             'uploader': 'aguynamedpatrick',
  24             'uploader_id': '24101272',
  25             'comment_count': int,
  26             'like_count': int,
  27         },
  28     }
  29
  30     def _real_extract(self, url):
  31         video_id = self._match_id(url)
  32
  33         webpage = self._download_webpage(url, video_id)
  34
  35         media = self._parse_json(
  36             get_element_by_id('mediaJson', webpage),
  37             video_id)
  38
  39         formats = [{
  40             'url': f['url'],
  41             'format_id': format_id,
  42             'width': int_or_none(f.get('width')),
  43             'height': int_or_none(f.get('height'))
  44         } for format_id, f in media['videos'].items()]
  45         self._sort_formats(formats)
  46
  47         title = remove_end(self._og_search_title(webpage), ' - via Iconosquare')
  48
  49         timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time'))
  50         description = media.get('caption', {}).get('text')
  51
  52         uploader = media.get('user', {}).get('username')
  53         uploader_id = media.get('user', {}).get('id')
  54
  55         comment_count = int_or_none(media.get('comments', {}).get('count'))
  56         like_count = int_or_none(media.get('likes', {}).get('count'))
  57
  58         thumbnails = [{
  59             'url': t['url'],
  60             'id': thumbnail_id,
  61             'width': int_or_none(t.get('width')),
  62             'height': int_or_none(t.get('height'))
  63         } for thumbnail_id, t in media.get('images', {}).items()]
  64
  65         comments = [{
  66             'id': comment.get('id'),
  67             'text': comment['text'],
  68             'timestamp': int_or_none(comment.get('created_time')),
  69             'author': comment.get('from', {}).get('full_name'),
  70             'author_id': comment.get('from', {}).get('username'),
  71         } for comment in media.get('comments', {}).get('data', []) if 'text' in comment]
  72
  73         return {
  74             'id': video_id,
  75             'title': title,
  76             'description': description,
  77             'thumbnails': thumbnails,
  78             'timestamp': timestamp,
  79             'uploader': uploader,
  80             'uploader_id': uploader_id,
  81             'comment_count': comment_count,
  82             'like_count': like_count,
  83             'formats': formats,
  84             'comments': comments,
  85         }