youtube_dl/extractor/esri.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import compat_urlparse
   8 from ..utils import (
   9     int_or_none,
  10     parse_filesize,
  11     unified_strdate,
  12 )
  13
  14
  15 class EsriVideoIE(InfoExtractor):
  16     _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)'
  17     _TEST = {
  18         'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications',
  19         'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc',
  20         'info_dict': {
  21             'id': '1124',
  22             'ext': 'mp4',
  23             'title': 'ArcGIS Online - Developing Applications',
  24             'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.',
  25             'thumbnail': r're:^https?://.*\.jpg$',
  26             'duration': 185,
  27             'upload_date': '20120419',
  28         }
  29     }
  30
  31     def _real_extract(self, url):
  32         video_id = self._match_id(url)
  33
  34         webpage = self._download_webpage(url, video_id)
  35
  36         formats = []
  37         for width, height, content in re.findall(
  38                 r'(?s)<li><strong>(\d+)x(\d+):</strong>(.+?)</li>', webpage):
  39             for video_url, ext, filesize in re.findall(
  40                     r'<a[^>]+href="([^"]+)">([^<]+)&nbsp;\(([^<]+)\)</a>', content):
  41                 formats.append({
  42                     'url': compat_urlparse.urljoin(url, video_url),
  43                     'ext': ext.lower(),
  44                     'format_id': '%s-%s' % (ext.lower(), height),
  45                     'width': int(width),
  46                     'height': int(height),
  47                     'filesize_approx': parse_filesize(filesize),
  48                 })
  49         self._sort_formats(formats)
  50
  51         title = self._html_search_meta('title', webpage, 'title')
  52         description = self._html_search_meta(
  53             'description', webpage, 'description', fatal=False)
  54
  55         thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False)
  56         if thumbnail:
  57             thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail)
  58
  59         duration = int_or_none(self._search_regex(
  60             [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"],
  61             webpage, 'duration', fatal=False))
  62
  63         upload_date = unified_strdate(self._html_search_meta(
  64             'last-modified', webpage, 'upload date', fatal=False))
  65
  66         return {
  67             'id': video_id,
  68             'title': title,
  69             'description': description,
  70             'thumbnail': thumbnail,
  71             'duration': duration,
  72             'upload_date': upload_date,
  73             'formats': formats
  74         }