gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube_dl/extractor/cbslocal.py
[condenast] Fix extraction and style (closes #12526)
[youtube-dl.git] / youtube_dl / extractor / cbslocal.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 from .anvato import AnvatoIE
5 from .sendtonews import SendtoNewsIE
6 from ..compat import compat_urlparse
7 from ..utils import (
8     parse_iso8601,
9     unified_timestamp,
10 )
11
12
class CBSLocalIE(AnvatoIE):
    """Extractor for article and video pages on ``*.cbslocal.com``.

    A page either embeds a SendtoNews player, in which case extraction is
    delegated to ``SendtoNewsIE``, or it is handled through the Anvato
    helpers inherited from ``AnvatoIE``.
    """

    # Matches dated article URLs (``/YYYY/MM/DD/<slug>``) as well as
    # ``/video/<slug>`` URLs; the slug is captured as the ``id`` group.
    _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P<id>[0-9a-z-]+)'

    _TESTS = [{
        # Anvato backend
        'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis',
        'md5': 'f0ee3081e3843f575fccef901199b212',
        'info_dict': {
            'id': '3401037',
            'ext': 'mp4',
            'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
            'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.',
            'thumbnail': 're:^https?://.*',
            'timestamp': 1463440500,
            'upload_date': '20160516',
            'uploader': 'CBS',
            'subtitles': {
                'en': 'mincount:5',
            },
            'categories': [
                'Stations\\Spoken Word\\KCBSTV',
                'Syndication\\MSN',
                'Syndication\\NDN',
                'Syndication\\AOL',
                'Syndication\\Yahoo',
                'Syndication\\Tribune',
                'Syndication\\Curb.tv',
                'Content\\News'
            ],
            'tags': ['CBS 2 News Evening'],
        },
    }, {
        # SendtoNews embed
        'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/',
        'info_dict': {
            'id': 'GxfCe0Zo7D-175909-5588',
        },
        'playlist_count': 9,
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        # /video/ style URL
        'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
        'info_dict': {
            'id': '3580809',
            'ext': 'mp4',
            'title': 'A Very Blue Anniversary',
            'description': 'CBS2’s Cindy Hsu has more.',
            'thumbnail': 're:^https?://.*',
            'timestamp': 1479962220,
            'upload_date': '20161124',
            'uploader': 'CBS',
            'subtitles': {
                'en': 'mincount:5',
            },
            'categories': [
                'Stations\\Spoken Word\\WCBSTV',
                'Syndication\\AOL',
                'Syndication\\MSN',
                'Syndication\\NDN',
                'Syndication\\Yahoo',
                'Content\\News',
                'Content\\News\\Local News',
            ],
            'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
        },
    }]

    def _real_extract(self, url):
        """Extract media information for a cbslocal.com page.

        Returns a ``url_result`` handing off to ``SendtoNewsIE`` when the
        page contains a SendtoNews embed; otherwise returns the result of
        ``_extract_anvato_videos`` updated with ``display_id`` and
        ``timestamp``.
        """
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)

        # A SendtoNews embed takes precedence over the Anvato player.
        embed_url = SendtoNewsIE._extract_url(page)
        if embed_url:
            # Resolve the embed URL against the page URL before delegating.
            target_url = compat_urlparse.urljoin(url, embed_url)
            return self.url_result(target_url, ie=SendtoNewsIE.ie_key())

        result = self._extract_anvato_videos(page, slug)

        # Prefer the visible entry date; fall back to the uploadDate meta tag
        # when the page has no such element.
        entry_date = self._html_search_regex(
            r'class="entry-date">([^<]+)<', page, 'released date', default=None)
        if not entry_date:
            upload_date = self._html_search_meta('uploadDate', page)
            released_at = parse_iso8601(upload_date)
        else:
            released_at = unified_timestamp(entry_date)

        extra_fields = {
            'display_id': slug,
            'timestamp': released_at,
        }
        result.update(extra_fields)
        return result