]> gitweb @ CieloNegro.org - youtube-dl.git/commitdiff
Merge branch 'soompi' of https://github.com/ping/youtube-dl into ping-soompi
authorSergey M․ <dstftw@gmail.com>
Thu, 28 May 2015 16:22:29 +0000 (22:22 +0600)
committerSergey M․ <dstftw@gmail.com>
Thu, 28 May 2015 16:22:29 +0000 (22:22 +0600)
45 files changed:
AUTHORS
README.md
docs/supportedsites.md
test/test_subtitles.py
youtube_dl/YoutubeDL.py
youtube_dl/aes.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/bilibili.py
youtube_dl/extractor/chilloutzone.py
youtube_dl/extractor/cinemassacre.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/drtv.py
youtube_dl/extractor/empflix.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/firedrive.py [deleted file]
youtube_dl/extractor/karrierevideos.py [new file with mode: 0644]
youtube_dl/extractor/mitele.py
youtube_dl/extractor/naver.py
youtube_dl/extractor/nba.py
youtube_dl/extractor/nextmedia.py
youtube_dl/extractor/nowtv.py [new file with mode: 0644]
youtube_dl/extractor/odnoklassniki.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/rtbf.py
youtube_dl/extractor/rtlnow.py [deleted file]
youtube_dl/extractor/rtve.py
youtube_dl/extractor/rutv.py
youtube_dl/extractor/shared.py
youtube_dl/extractor/sockshare.py [deleted file]
youtube_dl/extractor/spankwire.py
youtube_dl/extractor/sportbox.py
youtube_dl/extractor/teamcoco.py
youtube_dl/extractor/telecinco.py
youtube_dl/extractor/tenplay.py
youtube_dl/extractor/tf1.py
youtube_dl/extractor/tnaflix.py
youtube_dl/extractor/tutv.py
youtube_dl/extractor/videott.py
youtube_dl/extractor/viki.py
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index 267b8da1e6ffbda7853d53996de4020111074834..3410e1fb9cbec88822c9681d80826985cdc62efb 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -124,3 +124,5 @@ Mohammad Teimori Pabandi
 Roman Le Négrate
 Matthias Küch
 Julian Richen
+Ping O.
+Mister Hat
index 3d9436456c4bd29d11bf160efa27769b05edde2d..e51bb534341e389a26a466f1fb4c3ef721731016 100644 (file)
--- a/README.md
+++ b/README.md
@@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms
 To install it right away for all UNIX users (Linux, OS X, etc.), type:
 
     sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 If you do not have curl, you can alternatively use a recent wget:
 
     sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
 
index 43fbe8b1d75bf260dd98d29ee7cb029b4c373365..a4879bd9a1a4b5221e824927e0ef0ec4d6c9f734 100644 (file)
  - **Eporner**
  - **EroProfile**
  - **Escapist**
+ - **ESPN** (Currently broken)
  - **EveryonesMixtape**
  - **exfm**: ex.fm
  - **ExpoTV**
  - **OktoberfestTV**
  - **on.aol.com**
  - **Ooyala**
+ - **OoyalaExternal**
  - **OpenFilm**
  - **orf:fm4**: radio FM4
  - **orf:iptv**: iptv.ORF.at
  - **Spike**
  - **Sport5**
  - **SportBox**
+ - **SportBoxEmbed**
  - **SportDeutschland**
  - **Srf**
  - **SRMediathek**: Saarländischer Rundfunk
  - **Turbo**
  - **Tutv**
  - **tv.dfb.de**
+ - **TV2**
+ - **TV2Article**
  - **TV4**: tv4.se and tv4play.se
  - **tvigle**: Интернет-телевидение Tvigle.ru
  - **tvp.pl**
index 891ee620b1f2627dd6991e0cccfbc58b59fb6a95..c4e3adb67b7d1034b36cdd3c45969fe321351c64 100644 (file)
@@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles):
         self.DL.params['allsubtitles'] = True
         subtitles = self.getSubtitles()
         self.assertEqual(set(subtitles.keys()), set(['no']))
-        self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a')
+        self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2')
 
 
 class TestRaiSubtitles(BaseTestSubtitles):
index 58b34e087b421474112719ffb3389b252e560313..d1953c18f39b438740aec88a1aadf4d529a8e0b4 100755 (executable)
@@ -1527,6 +1527,7 @@ class YoutubeDL(object):
             pps_chain.extend(ie_info['__postprocessors'])
         pps_chain.extend(self._pps)
         for pp in pps_chain:
+            files_to_delete = []
             try:
                 files_to_delete, info = pp.run(info)
             except PostProcessingError as e:
index 07224d5084158184ac30c927b1b79802d398c336..7817adcfdd546f70cfb76e0634b8df8ddbcaf8e0 100644 (file)
@@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes):
     """
     NONCE_LENGTH_BYTES = 8
 
-    data = bytes_to_intlist(base64.b64decode(data))
+    data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
     password = bytes_to_intlist(password.encode('utf-8'))
 
     key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password))
index 2a5cf95470c99886eadbc5306fb31f516d7d8b80..f73bf646b510b8e9c93bbfc1fc5c5508da97f782 100644 (file)
@@ -149,7 +149,6 @@ from .extremetube import ExtremeTubeIE
 from .facebook import FacebookIE
 from .faz import FazIE
 from .fc2 import FC2IE
-from .firedrive import FiredriveIE
 from .firstpost import FirstpostIE
 from .firsttv import FirstTVIE
 from .fivemin import FiveMinIE
@@ -244,6 +243,7 @@ from .kaltura import KalturaIE
 from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
 from .keezmovies import KeezMoviesIE
 from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
@@ -338,8 +338,7 @@ from .newstube import NewstubeIE
 from .nextmedia import (
     NextMediaIE,
     NextMediaActionNewsIE,
-    AppleDailyRealtimeNewsIE,
-    AppleDailyAnimationNewsIE
+    AppleDailyIE,
 )
 from .nfb import NFBIE
 from .nfl import NFLIE
@@ -355,6 +354,7 @@ from .normalboots import NormalbootsIE
 from .nosvideo import NosVideoIE
 from .novamov import NovaMovIE
 from .nowness import NownessIE
+from .nowtv import NowTVIE
 from .nowvideo import NowVideoIE
 from .npo import (
     NPOIE,
@@ -438,7 +438,6 @@ from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
 from .rte import RteIE
 from .rtlnl import RtlNlIE
-from .rtlnow import RTLnowIE
 from .rtl2 import RTL2IE
 from .rtp import RTPIE
 from .rts import RTSIE
@@ -480,7 +479,6 @@ from .smotri import (
     SmotriBroadcastIE,
 )
 from .snotr import SnotrIE
-from .sockshare import SockshareIE
 from .sohu import SohuIE
 from .soompi import (
     SoompiIE,
@@ -651,7 +649,10 @@ from .vine import (
     VineIE,
     VineUserIE,
 )
-from .viki import VikiIE
+from .viki import (
+    VikiIE,
+    VikiChannelIE,
+)
 from .vk import (
     VKIE,
     VKUserVideosIE,
index 8273bd6c9ae3cdff82052c8f63efc68be97561b3..76de244774369dd53c510961ecf6b6a7641c7027 100644 (file)
@@ -7,7 +7,6 @@ from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
     unified_strdate,
-    get_element_by_id,
     get_element_by_attribute,
     int_or_none,
     qualities,
@@ -195,7 +194,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
     def _real_extract(self, url):
         anchor_id, lang = self._extract_url_info(url)
         webpage = self._download_webpage(url, anchor_id)
-        row = get_element_by_id(anchor_id, webpage)
+        row = self._search_regex(
+            r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id,
+            webpage, 'row')
         return self._extract_from_webpage(row, anchor_id, lang)
 
 
index 7ca835e31f3477f8e46c74804aea36ecfe789686..2103ed73aad860738bba5108ee27a86fd921d29c 100644 (file)
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
 
 import re
 import itertools
+import json
+import xml.etree.ElementTree as ET
 
 from .common import InfoExtractor
 from ..utils import (
@@ -67,11 +69,19 @@ class BiliBiliIE(InfoExtractor):
 
         entries = []
 
-        lq_doc = self._download_xml(
+        lq_page = self._download_webpage(
             'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
             video_id,
             note='Downloading LQ video info'
         )
+        try:
+            err_info = json.loads(lq_page)
+            raise ExtractorError(
+                'BiliBili said: ' + err_info['error_text'], expected=True)
+        except ValueError:
+            pass
+
+        lq_doc = ET.fromstring(lq_page)
         lq_durls = lq_doc.findall('./durl')
 
         hq_doc = self._download_xml(
@@ -80,9 +90,11 @@ class BiliBiliIE(InfoExtractor):
             note='Downloading HQ video info',
             fatal=False,
         )
-        hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None)
-
-        assert len(lq_durls) == len(hq_durls)
+        if hq_doc is not False:
+            hq_durls = hq_doc.findall('./durl')
+            assert len(lq_durls) == len(hq_durls)
+        else:
+            hq_durls = itertools.repeat(None)
 
         i = 1
         for lq_durl, hq_durl in zip(lq_durls, hq_durls):
index c922f695905d70e4052ddfa5c8f336c01221413b..0206d96db4670fb29a40353839dae15911b9c6d3 100644 (file)
@@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor):
 
         base64_video_info = self._html_search_regex(
             r'var cozVidData = "(.+?)";', webpage, 'video data')
-        decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
+        decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8')
         video_info_dict = json.loads(decoded_video_info)
 
         # get video information from dict
index cf0a7551b7f3df672d20aad7a00d7ced43bf2945..c949a481477c187d9433f39e6e851b87c97edf79 100644 (file)
@@ -60,6 +60,17 @@ class CinemassacreIE(InfoExtractor):
                 'uploader_id': 'Cinemassacre',
                 'title': 'AVGN: McKids',
             }
+        },
+        {
+            'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
+            'md5': '1376908e49572389e7b06251a53cdd08',
+            'info_dict': {
+                'id': 'Cinemassacre-555779690c440',
+                'ext': 'mp4',
+                'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
+                'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
+                'upload_date': '20150525',
+            }
         }
     ]
 
@@ -72,7 +83,7 @@ class CinemassacreIE(InfoExtractor):
 
         playerdata_url = self._search_regex(
             [
-                r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+                r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
                 r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
             ],
             webpage, 'player data URL', default=None)
index 5efc5f4fe556a4424542b441a83f2d6dbd5bc8e7..3b1bd4033fd1c01986c83ab44cc1cebaa1b19e5b 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 class CNNIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
-        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))'''
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
 
     _TESTS = [{
         'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
index db10b8d00b7482b157bd9dd0ecc9ef9b8191ce88..70aa4333c773c39c4e3be451f8b632045ac85b51 100644 (file)
@@ -225,7 +225,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
 
 class DailymotionUserIE(DailymotionPlaylistIE):
     IE_NAME = 'dailymotion:user'
-    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P<user>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$'
     _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
     _TESTS = [{
         'url': 'https://www.dailymotion.com/user/nqtv',
@@ -239,7 +239,8 @@ class DailymotionUserIE(DailymotionPlaylistIE):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         user = mobj.group('user')
-        webpage = self._download_webpage(url, user)
+        webpage = self._download_webpage(
+            'https://www.dailymotion.com/user/%s' % user, user)
         full_user = unescapeHTML(self._html_search_regex(
             r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
             webpage, 'user'))
index f25ab319e66d4d5b151cd9a9d4509807b6a88617..baa24c6d13abe016cceb83bb927db15d7d300509 100644 (file)
@@ -1,8 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor, ExtractorError
-from ..utils import parse_iso8601
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    parse_iso8601,
+)
 
 
 class DRTVIE(InfoExtractor):
@@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor):
                 restricted_to_denmark = asset['RestrictedToDenmark']
                 spoken_subtitles = asset['Target'] == 'SpokenSubtitles'
                 for link in asset['Links']:
-                    target = link['Target']
                     uri = link['Uri']
+                    target = link['Target']
                     format_id = target
-                    preference = -1 if target == 'HDS' else -2
+                    preference = None
                     if spoken_subtitles:
-                        preference -= 2
+                        preference = -1
                         format_id += '-spoken-subtitles'
-                    formats.append({
-                        'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri,
-                        'format_id': format_id,
-                        'ext': link['FileFormat'],
-                        'preference': preference,
-                    })
+                    if target == 'HDS':
+                        formats.extend(self._extract_f4m_formats(
+                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
+                            video_id, preference, f4m_id=format_id))
+                    elif target == 'HLS':
+                        formats.extend(self._extract_m3u8_formats(
+                            uri, video_id, 'mp4', preference=preference,
+                            m3u8_id=format_id))
+                    else:
+                        bitrate = link.get('Bitrate')
+                        if bitrate:
+                            format_id += '-%s' % bitrate
+                        formats.append({
+                            'url': uri,
+                            'format_id': format_id,
+                            'tbr': bitrate,
+                            'ext': link.get('FileFormat'),
+                        })
                 subtitles_list = asset.get('SubtitlesList')
                 if isinstance(subtitles_list, list):
                     LANGS = {
index 70f8efe27578c4d43b27378a4a2c80d495a7488c..9a5a8f4bb44039e6c52968801033a3d12a73d835 100644 (file)
@@ -4,22 +4,28 @@ from .tnaflix import TNAFlixIE
 
 
 class EMPFlixIE(TNAFlixIE):
-    _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'
 
     _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
     _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
     _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
 
-    _TEST = {
-        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
-        'md5': 'b1bc15b6412d33902d6e5952035fcabc',
-        'info_dict': {
-            'id': '33051',
-            'display_id': 'Amateur-Finger-Fuck',
-            'ext': 'mp4',
-            'title': 'Amateur Finger Fuck',
-            'description': 'Amateur solo finger fucking.',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'age_limit': 18,
+    _TESTS = [
+        {
+            'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+            'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+            'info_dict': {
+                'id': '33051',
+                'display_id': 'Amateur-Finger-Fuck',
+                'ext': 'mp4',
+                'title': 'Amateur Finger Fuck',
+                'description': 'Amateur solo finger fucking.',
+                'thumbnail': 're:https?://.*\.jpg$',
+                'age_limit': 18,
+            }
+        },
+        {
+            'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
+            'matching_only': True,
         }
-    }
+    ]
index 937b28fcccf3bd58929adcca1bda9d05966460e5..82dc27bc6ff3ed2edd3b318f3ed3d14e360ef22d 100644 (file)
@@ -50,7 +50,10 @@ class FacebookIE(InfoExtractor):
             'id': '274175099429670',
             'ext': 'mp4',
             'title': 'Facebook video #274175099429670',
-        }
+        },
+        'expected_warnings': [
+            'title'
+        ]
     }, {
         'url': 'https://www.facebook.com/video.php?v=10204634152394104',
         'only_matching': True,
@@ -149,12 +152,12 @@ class FacebookIE(InfoExtractor):
             raise ExtractorError('Cannot find video formats')
 
         video_title = self._html_search_regex(
-            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
-            fatal=False)
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
+            default=None)
         if not video_title:
             video_title = self._html_search_regex(
                 r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
-                webpage, 'alternative title', default=None)
+                webpage, 'alternative title', fatal=False)
             video_title = limit_length(video_title, 80)
         if not video_title:
             video_title = 'Facebook video #%s' % video_id
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
deleted file mode 100644 (file)
index 3191116..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    ExtractorError,
-)
-
-
-class FiredriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \
-                 '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)'
-    _FILE_DELETED_REGEX = r'<div class="removed_file_image">'
-
-    _TESTS = [{
-        'url': 'https://www.firedrive.com/file/FEB892FA160EBD01',
-        'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970',
-        'info_dict': {
-            'id': 'FEB892FA160EBD01',
-            'ext': 'flv',
-            'title': 'bbb_theora_486kbit.flv',
-            'thumbnail': 're:^http://.*\.jpg$',
-        },
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        url = 'http://firedrive.com/file/%s' % video_id
-        webpage = self._download_webpage(url, video_id)
-
-        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
-            raise ExtractorError('Video %s does not exist' % video_id,
-                                 expected=True)
-
-        fields = dict(re.findall(r'''(?x)<input\s+
-            type="hidden"\s+
-            name="([^"]+)"\s+
-            value="([^"]*)"
-            ''', webpage))
-
-        post = compat_urllib_parse.urlencode(fields)
-        req = compat_urllib_request.Request(url, post)
-        req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
-        # Apparently, this header is required for confirmation to work.
-        req.add_header('Host', 'www.firedrive.com')
-
-        webpage = self._download_webpage(req, video_id,
-                                         'Downloading video page')
-
-        title = self._search_regex(r'class="external_title_left">(.+)</div>',
-                                   webpage, 'title')
-        thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage,
-                                       'thumbnail', fatal=False)
-        if thumbnail is not None:
-            thumbnail = 'http:' + thumbnail
-
-        ext = self._search_regex(r'type:\s?\'([^\']+)\',',
-                                 webpage, 'extension', fatal=False)
-        video_url = self._search_regex(
-            r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url')
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-            'ext': ext,
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
new file mode 100644 (file)
index 0000000..bed94bc
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    fix_xml_ampersands,
+    float_or_none,
+    xpath_with_ns,
+    xpath_text,
+)
+
+
+class KarriereVideosIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
+        'info_dict': {
+            'id': '32c91',
+            'ext': 'flv',
+            'title': 'AltenpflegerIn',
+            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # broken ampersands
+        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
+        'info_dict': {
+            'id': '5sniu',
+            'ext': 'flv',
+            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
+            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = (self._html_search_meta('title', webpage, default=None) or
+                 self._search_regex(r'<h1 class="title">([^<]+)</h1>'))
+
+        video_id = self._search_regex(
+            r'/config/video/(.+?)\.xml', webpage, 'video id')
+        playlist = self._download_xml(
+            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
+            video_id, transform_source=fix_xml_ampersands)
+
+        NS_MAP = {
+            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
+        }
+
+        def ns(path):
+            return xpath_with_ns(path, NS_MAP)
+
+        item = playlist.find('./tracklist/item')
+        video_file = xpath_text(
+            item, ns('./jwplayer:file'), 'video url', fatal=True)
+        streamer = xpath_text(
+            item, ns('./jwplayer:streamer'), 'streamer', fatal=True)
+
+        uploader = xpath_text(
+            item, ns('./jwplayer:author'), 'uploader')
+        duration = float_or_none(
+            xpath_text(item, ns('./jwplayer:duration'), 'duration'))
+
+        description = self._html_search_regex(
+            r'(?s)<div class="leadtext">(.+?)</div>',
+            webpage, 'description')
+
+        thumbnail = self._html_search_meta(
+            'thumbnail', webpage, 'thumbnail')
+        if thumbnail:
+            thumbnail = compat_urlparse.urljoin(url, thumbnail)
+
+        return {
+            'id': video_id,
+            'url': streamer.replace('rtmpt', 'rtmp'),
+            'play_path': 'mp4:%s' % video_file,
+            'ext': 'flv',
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+        }
index d8897eb90d526b7b7d2e5a5ace5bec84ebb40031..7091f3335e8223ea0a089ba7cff127a983b12d7d 100644 (file)
@@ -20,7 +20,6 @@ class MiTeleIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
-        'md5': '6a75fe9d0d3275bead0cb683c616fddb',
         'info_dict': {
             'id': '0fce117d',
             'ext': 'mp4',
@@ -29,6 +28,10 @@ class MiTeleIE(InfoExtractor):
             'display_id': 'programa-144',
             'duration': 2913,
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -56,12 +59,14 @@ class MiTeleIE(InfoExtractor):
             episode,
             transform_source=strip_jsonp
         )
+        formats = self._extract_m3u8_formats(
+            token_info['tokenizedUrl'], episode, ext='mp4')
 
         return {
             'id': embed_data['videoId'],
             'display_id': episode,
             'title': info_el.find('title').text,
-            'url': token_info['tokenizedUrl'],
+            'formats': formats,
             'description': get_element_by_attribute('class', 'text', webpage),
             'thumbnail': info_el.find('thumb').text,
             'duration': parse_duration(info_el.find('duration').text),
index c10405f04d3cc1b3e89004029b7502112e9baa29..925967753bd12816005b5ed8929f0438e9ec0214 100644 (file)
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
+    compat_urlparse,
 )
 from ..utils import (
     ExtractorError,
@@ -16,7 +17,7 @@ from ..utils import (
 class NaverIE(InfoExtractor):
     _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://tvcast.naver.com/v/81652',
         'info_dict': {
             'id': '81652',
@@ -25,7 +26,18 @@ class NaverIE(InfoExtractor):
             'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
             'upload_date': '20130903',
         },
-    }
+    }, {
+        'url': 'http://tvcast.naver.com/v/395837',
+        'md5': '638ed4c12012c458fefcddfd01f173cd',
+        'info_dict': {
+            'id': '395837',
+            'ext': 'mp4',
+            'title': '9년이 지나도 아픈 기억, 전효성의 아버지',
+            'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7',
+            'upload_date': '20150519',
+        },
+        'skip': 'Georestricted',
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -35,7 +47,7 @@ class NaverIE(InfoExtractor):
                          webpage)
         if m_id is None:
             m_error = re.search(
-                r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+                r'(?s)<div class="(?:nation_error|nation_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
                 webpage)
             if m_error:
                 raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
@@ -58,14 +70,18 @@ class NaverIE(InfoExtractor):
         formats = []
         for format_el in urls.findall('EncodingOptions/EncodingOption'):
             domain = format_el.find('Domain').text
+            uri = format_el.find('uri').text
             f = {
-                'url': domain + format_el.find('uri').text,
+                'url': compat_urlparse.urljoin(domain, uri),
                 'ext': 'mp4',
                 'width': int(format_el.find('width').text),
                 'height': int(format_el.find('height').text),
             }
             if domain.startswith('rtmp'):
+                # urlparse does not support custom schemes
+                # https://bugs.python.org/issue18828
                 f.update({
+                    'url': domain + uri,
                     'ext': 'flv',
                     'rtmp_protocol': '1',  # rtmpt
                 })
index 862b706bf96719aa071f1f89c73f2a4ef45a20b1..944096e1ca15de964fcdf896adf988c9aa2264bd 100644 (file)
@@ -22,6 +22,18 @@ class NBAIE(InfoExtractor):
     }, {
         'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
         'only_matching': True,
+    }, {
+        'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+        'info_dict': {
+            'id': '0041400301-cle-atl-recap.nba',
+            'ext': 'mp4',
+            'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1',
+            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
+            'duration': 228,
+        },
+        'params': {
+            'skip_download': True,
+        }
     }]
 
     def _real_extract(self, url):
@@ -35,8 +47,12 @@ class NBAIE(InfoExtractor):
             self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')
 
         description = self._og_search_description(webpage)
-        duration = parse_duration(
-            self._html_search_meta('duration', webpage, 'duration'))
+        duration_str = self._html_search_meta(
+            'duration', webpage, 'duration', default=None)
+        if not duration_str:
+            duration_str = self._html_search_regex(
+                r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False)
+        duration = parse_duration(duration_str)
 
         return {
             'id': shortened_video_id,
index 02dba4ef639e64deff790f94bd5cd0cd6da2a4cb..d1b7cff4cfbf30c76c52ae98ad247e8907be6abd 100644 (file)
@@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE):
         return self._extract_from_nextmedia_page(news_id, url, article_page)
 
 
-class AppleDailyRealtimeNewsIE(NextMediaIE):
-    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+class AppleDailyIE(NextMediaIE):
+    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
     _TESTS = [{
         'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
         'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
@@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:b23787119933404ce515c6356a8c355c',
+            'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
             'upload_date': '20150128',
         }
     }, {
@@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '不滿被踩腳 山東兩大媽一路打下車',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
+            'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
             'upload_date': '20150128',
         }
-    }]
-
-    _URL_PATTERN = r'\{url: \'(.+)\'\}'
-
-    def _fetch_title(self, page):
-        return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')
-
-    def _fetch_thumbnail(self, page):
-        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
-
-    def _fetch_timestamp(self, page):
-        return None
-
-
-class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
-    _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
-    _TESTS = [{
+    }, {
         'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
         'md5': '03df296d95dedc2d5886debbb80cb43f',
         'info_dict': {
@@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
         'expected_warnings': [
             'video thumbnail',
         ]
+    }, {
+        'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
+        'only_matching': True,
     }]
 
+    _URL_PATTERN = r'\{url: \'(.+)\'\}'
+
     def _fetch_title(self, page):
-        return self._html_search_meta('description', page, 'news title')
+        return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or
+                self._html_search_meta('description', page, 'news title'))
+
+    def _fetch_thumbnail(self, page):
+        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
+
+    def _fetch_timestamp(self, page):
+        return None
 
     def _fetch_description(self, page):
         return self._html_search_meta('description', page, 'news description')
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644 (file)
index 0000000..173e46c
--- /dev/null
@@ -0,0 +1,192 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    parse_duration,
+    remove_start,
+)
+
+
+class NowTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+
+    _TESTS = [{
+        # rtl
+        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+        'info_dict': {
+            'id': '203519',
+            'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+            'ext': 'mp4',
+            'title': 'Die neuen Bauern und eine Hochzeit',
+            'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432580700,
+            'upload_date': '20150525',
+            'duration': 2786,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtl2
+        'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+        'info_dict': {
+            'id': '203481',
+            'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+            'ext': 'mp4',
+            'title': 'Berlin - Tag & Nacht (Folge 934)',
+            'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432666800,
+            'upload_date': '20150526',
+            'duration': 2641,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtlnitro
+        'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+        'info_dict': {
+            'id': '165780',
+            'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+            'ext': 'mp4',
+            'title': 'Hals- und Beinbruch',
+            'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432415400,
+            'upload_date': '20150523',
+            'duration': 2742,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # superrtl
+        'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+        'info_dict': {
+            'id': '99205',
+            'display_id': 'medicopter-117/angst',
+            'ext': 'mp4',
+            'title': 'Angst!',
+            'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1222632900,
+            'upload_date': '20080928',
+            'duration': 3025,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # ntv
+        'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+        'info_dict': {
+            'id': '203521',
+            'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+            'ext': 'mp4',
+            'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+            'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432751700,
+            'upload_date': '20150527',
+            'duration': 1083,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # vox
+        'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+        'info_dict': {
+            'id': '128953',
+            'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+            'ext': 'mp4',
+            'title': "Büro-Fall / Chihuahua 'Joel'",
+            'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432408200,
+            'upload_date': '20150523',
+            'duration': 3092,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        station = mobj.group('station')
+
+        info = self._download_json(
+            'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id,
+            display_id)
+
+        video_id = compat_str(info['id'])
+
+        files = info['files']
+        if not files:
+            if info.get('geoblocked', False):
+                raise ExtractorError(
+                    'Video %s is not available from your location due to geo restriction' % video_id,
+                    expected=True)
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+
+        f = info.get('format', {})
+        station = f.get('station') or station
+
+        STATIONS = {
+            'rtl': 'rtlnow',
+            'rtl2': 'rtl2now',
+            'vox': 'voxnow',
+            'nitro': 'rtlnitronow',
+            'ntv': 'n-tvnow',
+            'superrtl': 'superrtlnow'
+        }
+
+        formats = []
+        for item in files['items']:
+            item_path = remove_start(item['path'], '/')
+            tbr = int_or_none(item['bitrate'])
+            m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
+            m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+            formats.append({
+                'url': m3u8_url,
+                'format_id': '%s-%sk' % (item['id'], tbr),
+                'ext': 'mp4',
+                'tbr': tbr,
+            })
+        self._sort_formats(formats)
+
+        title = info['title']
+        description = info.get('articleLong') or info.get('articleShort')
+        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+        duration = parse_duration(info.get('duration'))
+        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
index fbc521d1aae02077ae62c5cd0a6c5f9cdcff014a..6c7149fe3859732978d34356f8c18aca14e74f63 100644 (file)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse
 from ..utils import (
     unified_strdate,
     int_or_none,
@@ -11,8 +12,9 @@ from ..utils import (
 
 
 class OdnoklassnikiIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)'
     _TESTS = [{
+        # metadata in JSON
         'url': 'http://ok.ru/video/20079905452',
         'md5': '8e24ad2da6f387948e7a7d44eb8668fe',
         'info_dict': {
@@ -20,11 +22,22 @@ class OdnoklassnikiIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Культура меняет нас (прекрасный ролик!))',
             'duration': 100,
-            'upload_date': '20141207',
             'uploader_id': '330537914540',
             'uploader': 'Виталий Добровольский',
             'like_count': int,
-            'age_limit': 0,
+        },
+    }, {
+        # metadataUrl
+        'url': 'http://ok.ru/video/63567059965189-0',
+        'md5': '9676cf86eff5391d35dea675d224e131',
+        'info_dict': {
+            'id': '63567059965189-0',
+            'ext': 'mp4',
+            'title': 'Девушка без комплексов ...',
+            'duration': 191,
+            'uploader_id': '534380003155',
+            'uploader': 'Андрей Мещанинов',
+            'like_count': int,
         },
     }, {
         'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
@@ -34,14 +47,23 @@ class OdnoklassnikiIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            'http://ok.ru/video/%s' % video_id, video_id)
 
         player = self._parse_json(
             unescapeHTML(self._search_regex(
                 r'data-attributes="([^"]+)"', webpage, 'player')),
             video_id)
 
-        metadata = self._parse_json(player['flashvars']['metadata'], video_id)
+        flashvars = player['flashvars']
+
+        metadata = flashvars.get('metadata')
+        if metadata:
+            metadata = self._parse_json(metadata, video_id)
+        else:
+            metadata = self._download_json(
+                compat_urllib_parse.unquote(flashvars['metadataUrl']),
+                video_id, 'Downloading metadata JSON')
 
         movie = metadata['movie']
         title = movie['title']
@@ -53,11 +75,11 @@ class OdnoklassnikiIE(InfoExtractor):
         uploader = author.get('name')
 
         upload_date = unified_strdate(self._html_search_meta(
-            'ya:ovs:upload_date', webpage, 'upload date'))
+            'ya:ovs:upload_date', webpage, 'upload date', default=None))
 
         age_limit = None
         adult = self._html_search_meta(
-            'ya:ovs:adult', webpage, 'age limit')
+            'ya:ovs:adult', webpage, 'age limit', default=None)
         if adult:
             age_limit = 18 if adult == 'true' else 0
 
index 0c8b731cf47267568e43ccd09ff21f1683b4d992..daa284ea28be63f9677471f16a0b8102f32b560e 100644 (file)
@@ -71,7 +71,8 @@ class PornHubIE(InfoExtractor):
 
         video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
         if webpage.find('"encrypted":true') != -1:
-            password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
+            password = compat_urllib_parse.unquote_plus(
+                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
             video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
 
         formats = []
index 7cc7996642cae1de1ca2a585391d167025b92162..255d4abc131519ec470ccdc2b1a64b7d38d9f44b 100644 (file)
@@ -17,7 +17,7 @@ from ..utils import (
 class ProSiebenSat1IE(InfoExtractor):
     IE_NAME = 'prosiebensat1'
     IE_DESC = 'ProSiebenSat.1 Digital'
-    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'
 
     _TESTS = [
         {
index dce64e1517003015722db1097ac83b106cc91136..5a381d9ced41516db44d7e17120b29948a1957cb 100644 (file)
@@ -1,10 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-import json
-
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unescapeHTML,
+)
 
 
 class RTBFIE(InfoExtractor):
@@ -16,25 +17,24 @@ class RTBFIE(InfoExtractor):
             'id': '1921274',
             'ext': 'mp4',
             'title': 'Les Diables au coeur (épisode 2)',
-            'description': 'Football - Diables Rouges',
             'duration': 3099,
-            'timestamp': 1398456336,
-            'upload_date': '20140425',
         }
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
-        page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+        webpage = self._download_webpage(
+            'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
 
-        data = json.loads(self._html_search_regex(
-            r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data']
+        data = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-video="([^"]+)"', webpage, 'data video')),
+            video_id)
 
         video_url = data.get('downloadUrl') or data.get('url')
 
-        if data['provider'].lower() == 'youtube':
+        if data.get('provider').lower() == 'youtube':
             return self.url_result(video_url, 'Youtube')
 
         return {
@@ -42,8 +42,8 @@ class RTBFIE(InfoExtractor):
             'url': video_url,
             'title': data['title'],
             'description': data.get('description') or data.get('subtitle'),
-            'thumbnail': data['thumbnail']['large'],
+            'thumbnail': data.get('thumbnail'),
             'duration': data.get('duration') or data.get('realDuration'),
-            'timestamp': data['created'],
-            'view_count': data['viewCount'],
+            'timestamp': int_or_none(data.get('created')),
+            'view_count': int_or_none(data.get('viewCount')),
         }
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644 (file)
index 785a804..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    clean_html,
-    unified_strdate,
-    int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
-    """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
-    _VALID_URL = r'''(?x)
-                        (?:https?://)?
-                        (?P<url>
-                            (?P<domain>
-                                rtl-now\.rtl\.de|
-                                rtl2now\.rtl2\.de|
-                                (?:www\.)?voxnow\.de|
-                                (?:www\.)?rtlnitronow\.de|
-                                (?:www\.)?superrtlnow\.de|
-                                (?:www\.)?n-tvnow\.de)
-                            /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
-                            (?:container_id|film_id)=(?P<video_id>[0-9]+)&
-                            player=1(?:&season=[0-9]+)?(?:&.*)?
-                        )'''
-
-    _TESTS = [
-        {
-            'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
-            'info_dict': {
-                'id': '90419',
-                'ext': 'flv',
-                'title': 'Ahornallee - Folge 1 - Der Einzug',
-                'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
-                'upload_date': '20070416',
-                'duration': 1685,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
-            'info_dict': {
-                'id': '69756',
-                'ext': 'flv',
-                'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
-                'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
-                'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
-                'upload_date': '20120519',
-                'duration': 1245,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
-            'info_dict': {
-                'id': '13883',
-                'ext': 'flv',
-                'title': 'Voxtours - Südafrika-Reporter II',
-                'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
-                'upload_date': '20090627',
-                'duration': 1800,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
-            'info_dict': {
-                'id': '99205',
-                'ext': 'flv',
-                'title': 'Medicopter 117 - Angst!',
-                'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
-                'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
-                'upload_date': '20080928',
-                'duration': 2691,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
-            'info_dict': {
-                'id': '188729',
-                'ext': 'flv',
-                'upload_date': '20150204',
-                'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
-                'title': 'Der Bachelor - Folge 4',
-            }
-        }, {
-            'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
-            'only_matching': True,
-        },
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_page_url = 'http://%s/' % mobj.group('domain')
-        video_id = mobj.group('video_id')
-
-        webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
-        mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
-        if mobj:
-            raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
-        mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
-        duration = int(mobj.group('seconds')) if mobj else None
-
-        playerdata_url = self._html_search_regex(
-            r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
-        playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
-        videoinfo = playerdata.find('./playlist/videoinfo')
-
-        formats = []
-        for filename in videoinfo.findall('filename'):
-            mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
-            if mobj:
-                fmt = {
-                    'url': mobj.group('url'),
-                    'play_path': 'mp4:' + mobj.group('play_path'),
-                    'page_url': video_page_url,
-                    'player_url': video_page_url + 'includes/vodplayer.swf',
-                }
-            else:
-                mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
-                if mobj:
-                    fmt = {
-                        'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),
-                        'play_path': 'mp4:' + mobj.group('play_path'),
-                        'page_url': url,
-                        'player_url': video_page_url + 'includes/vodplayer.swf',
-                    }
-                else:
-                    fmt = {
-                        'url': filename.text,
-                    }
-            fmt.update({
-                'width': int_or_none(filename.get('width')),
-                'height': int_or_none(filename.get('height')),
-                'vbr': int_or_none(filename.get('bitrate')),
-                'ext': 'flv',
-            })
-            formats.append(fmt)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'duration': duration,
-            'formats': formats,
-        }
index 849300140ecbf598874d22b090262eabec1e7ea5..82cd98ac742bf436b24fbbc77cac9a6fb8a44ff6 100644 (file)
@@ -17,7 +17,7 @@ from ..utils import (
 
 
 def _decrypt_url(png):
-    encrypted_data = base64.b64decode(png)
+    encrypted_data = base64.b64decode(png.encode('utf-8'))
     text_index = encrypted_data.find(b'tEXt')
     text_chunk = encrypted_data[text_index - 4:]
     length = struct_unpack('!I', text_chunk[:4])[0]
index 55604637dca22533cd765529dcb2abfb759fd9c1..d9df0686133a6772deb1e58260069857620afc58 100644 (file)
@@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor):
     @classmethod
     def _extract_url(cls, webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
 
index 26ced716e8a875f1c4c5c9527b856475dce83f9e..9f3e944e73532a92c5213ce95e7633fe6bc4c212 100644 (file)
@@ -47,7 +47,7 @@ class SharedIE(InfoExtractor):
         video_url = self._html_search_regex(
             r'data-url="([^"]+)"', video_page, 'video URL')
         title = base64.b64decode(self._html_search_meta(
-            'full:title', webpage, 'title')).decode('utf-8')
+            'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')
         filesize = int_or_none(self._html_search_meta(
             'full:size', webpage, 'file size', fatal=False))
         thumbnail = self._html_search_regex(
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
deleted file mode 100644 (file)
index b5fa6f1..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    determine_ext,
-    ExtractorError,
-)
-
-from .common import InfoExtractor
-
-
-class SockshareIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)'
-    _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>'
-    _TEST = {
-        'url': 'http://www.sockshare.com/file/437BE28B89D799D7',
-        'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd',
-        'info_dict': {
-            'id': '437BE28B89D799D7',
-            'title': 'big_buck_bunny_720p_surround.avi',
-            'ext': 'avi',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        url = 'http://sockshare.com/file/%s' % video_id
-        webpage = self._download_webpage(url, video_id)
-
-        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
-            raise ExtractorError('Video %s does not exist' % video_id,
-                                 expected=True)
-
-        confirm_hash = self._html_search_regex(r'''(?x)<input\s+
-            type="hidden"\s+
-            value="([^"]*)"\s+
-            name="hash"
-            ''', webpage, 'hash')
-
-        fields = {
-            "hash": confirm_hash.encode('utf-8'),
-            "confirm": "Continue as Free User"
-        }
-
-        post = compat_urllib_parse.urlencode(fields)
-        req = compat_urllib_request.Request(url, post)
-        # Apparently, this header is required for confirmation to work.
-        req.add_header('Host', 'www.sockshare.com')
-        req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
-        webpage = self._download_webpage(
-            req, video_id, 'Downloading video page')
-
-        video_url = self._html_search_regex(
-            r'<a href="([^"]*)".+class="download_file_link"',
-            webpage, 'file url')
-        video_url = "http://www.sockshare.com" + video_url
-        title = self._html_search_regex((
-            r'<h1>(.+)<strong>',
-            r'var name = "([^"]+)";'),
-            webpage, 'title', default=None)
-        thumbnail = self._html_search_regex(
-            r'<img\s+src="([^"]*)".+?name="bg"',
-            webpage, 'thumbnail', default=None)
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-            'ext': determine_ext(title),
-        }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-        }
index b936202f6f3005fe9ae085724566d709c6a484cc..06d6e6640637f2ff3507c3fa1c68339fbb8d98b4 100644 (file)
@@ -71,7 +71,7 @@ class SpankwireIE(InfoExtractor):
             compat_urllib_parse.unquote,
             re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
         if webpage.find('flashvars\.encrypted = "true"') != -1:
-            password = self._html_search_regex(
+            password = self._search_regex(
                 r'flashvars\.video_title = "([^"]+)',
                 webpage, 'password').replace('+', ' ')
             video_urls = list(map(
index 8686f9d11fa3178eefe3eb71fe5a1413e729beeb..86d509ae5351a3cc15be66dda9f485d63ec166ba 100644 (file)
@@ -6,8 +6,7 @@ import re
 from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
-    parse_duration,
-    parse_iso8601,
+    unified_strdate,
 )
 
 
@@ -20,11 +19,9 @@ class SportBoxIE(InfoExtractor):
             'id': '80822',
             'ext': 'mp4',
             'title': 'Гонка 2  заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
-            'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed',
+            'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'timestamp': 1411896237,
             'upload_date': '20140928',
-            'duration': 4846,
         },
         'params': {
             # m3u8 download
@@ -48,17 +45,13 @@ class SportBoxIE(InfoExtractor):
             r'src="/?(vdl/player/[^"]+)"', webpage, 'player')
 
         title = self._html_search_regex(
-            r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
-        description = self._html_search_regex(
-            r'(?s)<div itemprop="description">(.+?)</div>',
-            webpage, 'description', fatal=False)
+            [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'],
+            webpage, 'title')
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage, 'description')
         thumbnail = self._og_search_thumbnail(webpage)
-        timestamp = parse_iso8601(self._search_regex(
-            r'<span itemprop="uploadDate">([^<]+)</span>',
-            webpage, 'timestamp', fatal=False))
-        duration = parse_duration(self._html_search_regex(
-            r'<meta itemprop="duration" content="PT([^"]+)">',
-            webpage, 'duration', fatal=False))
+        upload_date = unified_strdate(self._html_search_meta(
+            'dateCreated', webpage, 'upload date'))
 
         return {
             '_type': 'url_transparent',
@@ -67,8 +60,7 @@ class SportBoxIE(InfoExtractor):
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'duration': duration,
+            'upload_date': upload_date,
         }
 
 
index 56be526383b590d9b00759400a269d5164beef2f..b2a4b1fc05430558ad9b33a9aa3ce834107dc6e3 100644 (file)
@@ -10,6 +10,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     qualities,
+    determine_ext,
 )
 from ..compat import compat_ord
 
@@ -108,7 +109,7 @@ class TeamcocoIE(InfoExtractor):
         formats = []
         get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
         for filed in data['files']:
-            if filed['type'] == 'hls':
+            if determine_ext(filed['url']) == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
                     filed['url'], video_id, ext='mp4'))
             else:
index 251a686804b6f26915c3fa25d9f6b2cc1f98ed4b..a0c744fd16b633b08e7bbb632b77cf40e8410710 100644 (file)
@@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE):
             'title': 'Con Martín Berasategui, hacer un bacalao al ...',
             'duration': 662,
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
         'only_matching': True,
index 466155ef800fbc540292eb343bc9092ab9da6416..f6694149b8e3509b4446458300824a3f3d5fc5de 100644 (file)
@@ -2,6 +2,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+)
 
 
 class TenPlayIE(InfoExtractor):
@@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor):
             if protocol == 'rtmp':
                 url = url.replace('&mp4:', '')
 
+                tbr = int_or_none(rendition.get('encodingRate'), 1000)
+
             formats.append({
-                'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]),
-                'width': rendition['frameWidth'],
-                'height': rendition['frameHeight'],
-                'tbr': rendition['encodingRate'] / 1024,
-                'filesize': rendition['size'],
+                'format_id': '_'.join(
+                    ['rtmp', rendition['videoContainer'].lower(),
+                     rendition['videoCodec'].lower(), '%sk' % tbr]),
+                'width': int_or_none(rendition['frameWidth']),
+                'height': int_or_none(rendition['frameHeight']),
+                'tbr': tbr,
+                'filesize': int_or_none(rendition['size']),
                 'protocol': protocol,
                 'ext': ext,
                 'vcodec': rendition['videoCodec'].lower(),
                 'container': rendition['videoContainer'].lower(),
                 'url': url,
             })
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
@@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor):
                 'url': json['thumbnailURL']
             }],
             'thumbnail': json['videoStillURL'],
-            'duration': json['length'] / 1000,
-            'timestamp': float(json['creationDate']) / 1000,
-            'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay',
-            'view_count': json['playsTotal']
+            'duration': float_or_none(json.get('length'), 1000),
+            'timestamp': float_or_none(json.get('creationDate'), 1000),
+            'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay',
+            'view_count': int_or_none(json.get('playsTotal')),
         }
index 025d0877cb928bb433aff9f6eff19a29d253e006..656410528172fed2a171bd2935afa247fb2e6841 100644 (file)
@@ -6,8 +6,8 @@ from .common import InfoExtractor
 
 class TF1IE(InfoExtractor):
     """TF1 uses the wat.tv player."""
-    _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
-    _TESTS = {
+    _VALID_URL = r'http://(?:videos\.tf1|www\.tfou|www\.tf1)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+    _TESTS = [{
         'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
         'info_dict': {
             'id': '10635995',
@@ -32,7 +32,10 @@ class TF1IE(InfoExtractor):
             # Sometimes wat serves the whole file with the --test option
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
index d48cbbf140054e639f7191acfa0909972ef3ab76..59af9aba06399cefcc6c2049c958dfb3819bb20a 100644 (file)
@@ -10,26 +10,32 @@ from ..utils import (
 
 
 class TNAFlixIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
 
     _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
     _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
     _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
 
-    _TEST = {
-        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
-        'md5': 'ecf3498417d09216374fc5907f9c6ec0',
-        'info_dict': {
-            'id': '553878',
-            'display_id': 'Carmella-Decesare-striptease',
-            'ext': 'mp4',
-            'title': 'Carmella Decesare - striptease',
-            'description': '',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'duration': 91,
-            'age_limit': 18,
+    _TESTS = [
+        {
+            'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+            'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+            'info_dict': {
+                'id': '553878',
+                'display_id': 'Carmella-Decesare-striptease',
+                'ext': 'mp4',
+                'title': 'Carmella Decesare - striptease',
+                'description': '',
+                'thumbnail': 're:https?://.*\.jpg$',
+                'duration': 91,
+                'age_limit': 18,
+            }
+        },
+        {
+            'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+            'matching_only': True,
         }
-    }
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index 4de0aac523313eced334aab38a9a20c7bf08dfc7..fad720b681e125ac495b26ba3870dbd65340e3ce 100644 (file)
@@ -26,7 +26,7 @@ class TutvIE(InfoExtractor):
 
         data_content = self._download_webpage(
             'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info')
-        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')
+        video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8')
 
         return {
             'id': internal_id,
index ececc7ee0118932716ca9bdb06779cf94e6dc0ec..591024eaded0cdddbb0779bd942c0fa8f63d86a6 100644 (file)
@@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor):
 
         formats = [
             {
-                'url': base64.b64decode(res['u']).decode('utf-8'),
+                'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'),
                 'ext': 'flv',
                 'format_id': res['l'],
             } for res in settings['res'] if res['u']
index cf6af1e5cdb6315d325d2bd355d384cc283a3e0c..7f2fb1ca8896e29e48a41a9efddaded987ba1e96 100644 (file)
@@ -1,29 +1,65 @@
 from __future__ import unicode_literals
 
-import re
+import time
+import hmac
+import hashlib
+import itertools
 
-from ..compat import (
-    compat_urlparse,
-    compat_urllib_request,
-)
 from ..utils import (
     ExtractorError,
-    unescapeHTML,
-    unified_strdate,
-    US_RATINGS,
-    determine_ext,
-    mimetype2ext,
+    int_or_none,
+    parse_age_limit,
+    parse_iso8601,
 )
 from .common import InfoExtractor
 
 
-class VikiIE(InfoExtractor):
-    IE_NAME = 'viki'
+class VikiBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
+    _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com'
+    _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s'
+
+    _APP = '65535a'
+    _APP_VERSION = '2.2.5.1428709186'
+    _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'
+
+    def _prepare_call(self, path, timestamp=None):
+        path += '?' if '?' not in path else '&'
+        if not timestamp:
+            timestamp = int(time.time())
+        query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+        sig = hmac.new(
+            self._APP_SECRET.encode('ascii'),
+            query.encode('ascii'),
+            hashlib.sha1
+        ).hexdigest()
+        return self._API_URL_TEMPLATE % (query, sig)
+
+    def _call_api(self, path, video_id, note, timestamp=None):
+        resp = self._download_json(
+            self._prepare_call(path, timestamp), video_id, note)
+
+        error = resp.get('error')
+        if error:
+            if error == 'invalid timestamp':
+                resp = self._download_json(
+                    self._prepare_call(path, int(resp['current_timestamp'])),
+                    video_id, '%s (retry)' % note)
+                error = resp.get('error')
+            if error:
+                self._raise_error(resp['error'])
+
+        return resp
 
-    # iPad2
-    _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5'
+    def _raise_error(self, error):
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, error),
+            expected=True)
 
-    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+
+class VikiIE(VikiBaseIE):
+    IE_NAME = 'viki'
+    _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
     _TESTS = [{
         'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
         'info_dict': {
@@ -37,111 +73,218 @@ class VikiIE(InfoExtractor):
         },
         'skip': 'Blocked in the US',
     }, {
+        # clip
         'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
-        'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c',
+        'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
         'info_dict': {
             'id': '1067139v',
             'ext': 'mp4',
+            'title': "'The Avengers: Age of Ultron' Press Conference",
             'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
+            'duration': 352,
+            'timestamp': 1430380829,
             'upload_date': '20150430',
-            'title': '\'The Avengers: Age of Ultron\' Press Conference',
+            'uploader': 'Arirang TV',
+            'like_count': int,
+            'age_limit': 0,
         }
     }, {
         'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
         'info_dict': {
             'id': '1048879v',
             'ext': 'mp4',
-            'upload_date': '20140820',
-            'description': 'md5:54ff56d51bdfc7a30441ec967394e91c',
             'title': 'Ankhon Dekhi',
+            'duration': 6512,
+            'timestamp': 1408532356,
+            'upload_date': '20140820',
+            'uploader': 'Spuul',
+            'like_count': int,
+            'age_limit': 13,
         },
         'params': {
-            # requires ffmpeg
+            # m3u8 download
             'skip_download': True,
         }
+    }, {
+        # episode
+        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
+        'md5': '190f3ef426005ba3a080a63325955bc3',
+        'info_dict': {
+            'id': '44699v',
+            'ext': 'mp4',
+            'title': 'Boys Over Flowers - Episode 1',
+            'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2',
+            'duration': 4155,
+            'timestamp': 1270496524,
+            'upload_date': '20100405',
+            'uploader': 'group8',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        # youtube external
+        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
+        'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+        'info_dict': {
+            'id': '50562v',
+            'ext': 'mp4',
+            'title': 'Poor Nastya [COMPLETE] - Episode 1',
+            'description': '',
+            'duration': 607,
+            'timestamp': 1274949505,
+            'upload_date': '20101213',
+            'uploader': 'ad14065n',
+            'uploader_id': 'ad14065n',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        'url': 'http://www.viki.com/player/44699v',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
-
-        uploader_m = re.search(
-            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
-        if uploader_m is None:
-            uploader = None
-        else:
-            uploader = uploader_m.group(1).strip()
-
-        rating_str = self._html_search_regex(
-            r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
-            'rating information', default='').strip()
-        age_limit = US_RATINGS.get(rating_str)
-
-        req = compat_urllib_request.Request(
-            'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id)
-        req.add_header('User-Agent', self._USER_AGENT)
-        info_webpage = self._download_webpage(
-            req, video_id, note='Downloading info page')
-        err_msg = self._html_search_regex(r'<div[^>]+class="video-error[^>]+>(.+)</div>', info_webpage, 'error message', default=None)
-        if err_msg:
-            if 'not available in your region' in err_msg:
-                raise ExtractorError(
-                    'Video %s is blocked from your location.' % video_id,
-                    expected=True)
-            else:
-                raise ExtractorError('Viki said: ' + err_msg)
-        mobj = re.search(
-            r'<source[^>]+type="(?P<mime_type>[^"]+)"[^>]+src="(?P<url>[^"]+)"', info_webpage)
-        if not mobj:
-            raise ExtractorError('Unable to find video URL')
-        video_url = unescapeHTML(mobj.group('url'))
-        video_ext = mimetype2ext(mobj.group('mime_type'))
-
-        if determine_ext(video_url) == 'm3u8':
-            formats = self._extract_m3u8_formats(
-                video_url, video_id, ext=video_ext)
-        else:
-            formats = [{
-                'url': video_url,
-                'ext': video_ext,
-            }]
-
-        upload_date_str = self._html_search_regex(
-            r'"created_at":"([^"]+)"', info_webpage, 'upload date')
-        upload_date = (
-            unified_strdate(upload_date_str)
-            if upload_date_str is not None
-            else None
-        )
-
-        # subtitles
-        video_subtitles = self.extract_subtitles(video_id, info_webpage)
-
-        return {
+        video = self._call_api(
+            'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+
+        title = None
+        titles = video.get('titles')
+        if titles:
+            title = titles.get('en') or titles[titles.keys()[0]]
+        if not title:
+            title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
+            container_titles = video.get('container', {}).get('titles')
+            if container_titles:
+                container_title = container_titles.get('en') or container_titles[container_titles.keys()[0]]
+                title = '%s - %s' % (container_title, title)
+
+        descriptions = video.get('descriptions')
+        description = descriptions.get('en') or descriptions[titles.keys()[0]] if descriptions else None
+
+        duration = int_or_none(video.get('duration'))
+        timestamp = parse_iso8601(video.get('created_at'))
+        uploader = video.get('author')
+        like_count = int_or_none(video.get('likes', {}).get('count'))
+        age_limit = parse_age_limit(video.get('rating'))
+
+        thumbnails = []
+        for thumbnail_id, thumbnail in video.get('images', {}).items():
+            thumbnails.append({
+                'id': thumbnail_id,
+                'url': thumbnail.get('url'),
+            })
+
+        subtitles = {}
+        for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+            subtitles[subtitle_lang] = [{
+                'ext': subtitles_format,
+                'url': self._prepare_call(
+                    'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+            } for subtitles_format in ('srt', 'vtt')]
+
+        result = {
             'id': video_id,
             'title': title,
-            'formats': formats,
             'description': description,
-            'thumbnail': thumbnail,
-            'age_limit': age_limit,
+            'duration': duration,
+            'timestamp': timestamp,
             'uploader': uploader,
-            'subtitles': video_subtitles,
-            'upload_date': upload_date,
+            'like_count': like_count,
+            'age_limit': age_limit,
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
         }
 
-    def _get_subtitles(self, video_id, info_webpage):
-        res = {}
-        for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
-            sturl = unescapeHTML(sturl_html)
-            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
-            if not m:
-                continue
-            res[m.group('lang')] = [{
-                'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
-                'ext': 'vtt',
-            }]
-        return res
+        streams = self._call_api(
+            'videos/%s/streams.json' % video_id, video_id,
+            'Downloading video streams JSON')
+
+        if 'external' in streams:
+            result.update({
+                '_type': 'url_transparent',
+                'url': streams['external']['url'],
+            })
+            return result
+
+        formats = []
+        for format_id, stream_dict in streams.items():
+            height = self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None)
+            for protocol, format_dict in stream_dict.items():
+                if format_id == 'm3u8':
+                    formats = self._extract_m3u8_formats(
+                        format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol)
+                else:
+                    formats.append({
+                        'url': format_dict['url'],
+                        'format_id': '%s-%s' % (format_id, protocol),
+                        'height': height,
+                    })
+        self._sort_formats(formats)
+
+        result['formats'] = formats
+        return result
+
+
+class VikiChannelIE(VikiBaseIE):
+    IE_NAME = 'viki:channel'
+    _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
+        'info_dict': {
+            'id': '50c',
+            'title': 'Boys Over Flowers',
+            'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+        },
+        'playlist_count': 70,
+    }, {
+        'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
+        'info_dict': {
+            'id': '1354c',
+            'title': 'Poor Nastya [COMPLETE]',
+            'description': 'md5:05bf5471385aa8b21c18ad450e350525',
+        },
+        'playlist_count': 127,
+    }, {
+        'url': 'http://www.viki.com/news/24569c-showbiz-korea',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/artists/2141c-shinee',
+        'only_matching': True,
+    }]
+
+    _PER_PAGE = 25
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        channel = self._call_api(
+            'containers/%s.json' % channel_id, channel_id,
+            'Downloading channel JSON')
+
+        titles = channel['titles']
+        title = titles.get('en') or titles[titles.keys()[0]]
+
+        descriptions = channel['descriptions']
+        description = descriptions.get('en') or descriptions[descriptions.keys()[0]]
+
+        entries = []
+        for video_type in ('episodes', 'clips', 'movies'):
+            for page_num in itertools.count(1):
+                page = self._call_api(
+                    'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
+                    % (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
+                    'Downloading %s JSON page #%d' % (video_type, page_num))
+                for video in page['response']:
+                    video_id = video['id']
+                    entries.append(self.url_result(
+                        'http://www.viki.com/videos/%s' % video_id, 'Viki'))
+                if not page['pagination']['next']:
+                    break
+
+        return self.playlist_result(entries, channel_id, title, description)
index 1f9940cf5c1e4c8698a8a0bed9a874d5a79f18b0..0301682b8dd228cab336bc1e68eaf868660fd5c7 100644 (file)
@@ -1126,12 +1126,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     self.report_warning(
                         'Skipping DASH manifest: %r' % e, video_id)
                 else:
-                    # Hide the formats we found through non-DASH
+                    # Remove the formats we found through non-DASH, they
+                    # contain less info and it can be wrong, because we use
+                    # fixed values (for example the resolution). See
+                    # https://github.com/rg3/youtube-dl/issues/5774 for an
+                    # example.
                     dash_keys = set(df['format_id'] for df in dash_formats)
-                    for f in formats:
-                        if f['format_id'] in dash_keys:
-                            f['format_id'] = 'nondash-%s' % f['format_id']
-                            f['preference'] = f.get('preference', 0) - 10000
+                    formats = [f for f in formats if f['format_id'] not in dash_keys]
                     formats.extend(dash_formats)
 
         # Check for malformed aspect ratio
index 22dbc3aec7866ad3f5d048c35737486a8fdac8fc..5a2315bd96ce0c6abfdf4a8bea65aa68e6fa370b 100644 (file)
@@ -537,7 +537,7 @@ def parseOpts(overrideArguments=None):
     verbosity.add_option(
         '--dump-pages', '--dump-intermediate-pages',
         action='store_true', dest='dump_intermediate_pages', default=False,
-        help='Print downloaded pages to debug problems (very verbose)')
+        help='Print downloaded pages encoded using base64 to debug problems (very verbose)')
     verbosity.add_option(
         '--write-pages',
         action='store_true', dest='write_pages', default=False,
@@ -713,7 +713,7 @@ def parseOpts(overrideArguments=None):
         help='Parse additional metadata like song title / artist from the video title. '
              'The format syntax is the same as --output, '
              'the parsed parameters replace existing values. '
-             'Additional templates: %(album), %(artist). '
+             'Additional templates: %(album)s, %(artist)s. '
              'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
              '"Coldplay - Paradise"')
     postproc.add_option(
index 38f00bc9bc2ef476ddb813d7bcd26e5d13f4947d..b333851534e9edd9c75ff70ee4350874530ea8f7 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.05.15'
+__version__ = '2015.05.20'