From: Philipp Hagemeister Date: Wed, 7 Aug 2013 18:11:04 +0000 (+0200) Subject: Merge commit '7a4c6cc92f9ffec9135652a49153caffa5520c29' X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=commitdiff_plain;h=b513a251f8ba1123617ece96bff30e3f2863c6c2;hp=7a4c6cc92f9ffec9135652a49153caffa5520c29;p=youtube-dl.git Merge commit '7a4c6cc92f9ffec9135652a49153caffa5520c29' --- diff --git a/.travis.yml b/.travis.yml index 7f1fa8a3c..45b71f11b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ notifications: - filippo.valsorda@gmail.com - phihag@phihag.de - jaime.marquinez.ferrandiz+travis@gmail.com + - yasoob.khld@gmail.com # irc: # channels: # - "irc.freenode.org#youtube-dl" diff --git a/README.md b/README.md index b246d3c53..560bcdca1 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,9 @@ which means you can modify it, redistribute it or use it however you like. # OPTIONS -h, --help print this help text and exit --version print program version and exit - -U, --update update this program to latest version + -U, --update update this program to latest version. Make sure + that you have sufficient permissions (run with + sudo if needed) -i, --ignore-errors continue on download errors --dump-user-agent display the current browser identification --user-agent UA specify a custom user agent diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 1fce316dd..31d6ec952 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -5,30 +5,45 @@ import sys tests = [ + # 92 - vflQw-fB4 2013/07/17 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`~\"", + "mrtyuioplkjhgfdsazxcvbnq1234567890QWERTY}IOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]\"|:;"), + # 90 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`", + "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), # 88 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), - # 87 + # 87 - vflART1Nf 2013/07/24 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", - "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"), - # 86 - vfl_ymO4Z 2013/06/27 + "tyuioplkjhgfdsazxcv"), + # 86 - vflm_D8eE 2013/07/31 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", - "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), - # 85 + ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK.<", - "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"), + "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"), # 84 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"), - # 83 - vflcaqGO8 2013/07/11 + # 83 - vflTWC9KW 2013/08/01 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", - "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"), + "qwertyuioplkjhg>dsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/f"), # 82 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"), - # 81 + # 81 - vflLC8JvQ 2013/07/25 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", - "urty8ioplkjhgfdsazxcvbqm1234567e90QWERTYUIOPLKHGFDSnZXCVBNM!@#$%^&*(-+={[};?/>."), + "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), + # 79 - vflLC8JvQ 2013/07/25 (sporadic) + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", + "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), +] + +tests_age_gate = [ + # 86 - vflqinMWD + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", + "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), ] def find_matching(wrong, right): @@ -81,6 +96,8 @@ def genall(tests): def main(): print(genall(tests)) + print(u' Age gate:') + print(genall(tests_age_gate)) if __name__ == '__main__': main() diff --git a/test/test_playlists.py b/test/test_playlists.py new file mode 100644 index 000000000..65de3a55c --- /dev/null +++ b/test/test_playlists.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +import sys +import unittest +import json + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE +from youtube_dl.utils import * + +from helper import FakeYDL + +class TestPlaylists(unittest.TestCase): + def assertIsPlaylist(self, info): + """Make sure the info has '_type' set to 'playlist'""" + self.assertEqual(info['_type'], 'playlist') + + def test_dailymotion_playlist(self): + dl = FakeYDL() + ie = DailymotionPlaylistIE(dl) + result = ie.extract('http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'SPORT') + self.assertTrue(len(result['entries']) > 20) + + def test_vimeo_channel(self): + dl = FakeYDL() + ie = VimeoChannelIE(dl) + result = ie.extract('http://vimeo.com/channels/tributes') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Vimeo Tributes') + self.assertTrue(len(result['entries']) > 24) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py index b8e51f479..d645c08fb 100644 --- a/test/test_youtube_sig.py +++ b/test/test_youtube_sig.py @@ -10,12 +10,19 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.extractor.youtube import YoutubeIE from helper import FakeYDL -sig = YoutubeIE(FakeYDL())._decrypt_signature +ie = YoutubeIE(FakeYDL()) +sig = ie._decrypt_signature +sig_age_gate = ie._decrypt_signature_age_gate class TestYoutubeSig(unittest.TestCase): - def test_43_43(self): - wrong = '5AEEAE0EC39677BC65FD9021CCD115F1F2DBD5A59E4.C0B243A3E2DED6769199AF3461781E75122AE135135' - right = '931EA22157E1871643FA9519676DED253A342B0C.4E95A5DBD2F1F511DCC1209DF56CB77693CE0EAE' + def test_92(self): + wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8" + right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7" + self.assertEqual(sig(wrong), right) + + def test_90(self): + wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`" + right = "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|" self.assertEqual(sig(wrong), right) def test_88(self): @@ -25,17 +32,17 @@ class TestYoutubeSig(unittest.TestCase): def test_87(self): wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<" - right = "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr" + right = "tyuioplkjhgfdsazxcv" self.assertEqual(sig(wrong), right) def test_86(self): wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<" - right = "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@" + right = ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK 0 and not re.search(r'^[#/;]', x)] + if opts.verbose: + sys.stderr.write(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n') except IOError: sys.exit(u'ERROR: batch file could not be read') all_urls = batchurls + args diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 79868aa06..84c02c2ed 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -12,13 +12,14 @@ from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE -from .dailymotion import DailymotionIE +from .dailymotion import DailymotionIE, DailymotionPlaylistIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE +from .exfm import ExfmIE from .facebook import FacebookIE from .flickr import FlickrIE from .freesound import FreesoundIE @@ -37,20 +38,25 @@ from .infoq import InfoQIE from .instagram import InstagramIE from .jukebox import JukeboxIE from .justintv import JustinTVIE +from .kankan import KankanIE from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE from .metacafe import MetacafeIE from .mixcloud import MixcloudIE from .mtv import MTVIE +from .muzu import MuzuTVIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .nba import NBAIE +from .ooyala import OoyalaIE from .photobucket import PhotobucketIE from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE +from .roxwel import RoxwelIE +from .sina import SinaIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE @@ -68,9 +74,12 @@ from .ustream import UstreamIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE -from .vimeo import VimeoIE +from .videofyme import VideofyMeIE +from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .c56 import C56IE from .wat import WatIE +from .weibo import WeiboIE from .wimp import WimpIE from .worldstarhiphop import WorldStarHipHopIE from .xhamster import XHamsterIE @@ -88,6 +97,9 @@ from .youtube import ( YoutubeChannelIE, YoutubeShowIE, YoutubeSubscriptionsIE, + YoutubeRecommendedIE, + YoutubeWatchLaterIE, + YoutubeFavouritesIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 993e30f7a..69b3b0ad7 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -17,13 +17,14 @@ class ArteTvIE(InfoExtractor): """ _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?Pfr|de)/.*-(?P.*?).html' + _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?Pfr|de)/(?P.+?)/(?P.+)' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' @classmethod def suitable(cls, url): - return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL)) + return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL)) # TODO implement Live Stream # from ..utils import compat_urllib_parse @@ -68,6 +69,12 @@ class ArteTvIE(InfoExtractor): lang = mobj.group('lang') return self._extract_video(url, id, lang) + mobj = re.match(self._LIVEWEB_URL, url) + if mobj is not None: + name = mobj.group('name') + lang = mobj.group('lang') + return self._extract_liveweb(url, name, lang) + if re.search(self._LIVE_URL, video_id) is not None: raise ExtractorError(u'Arte live streams are not yet supported, sorry') # self.extractLiveStream(url) @@ -85,7 +92,7 @@ class ArteTvIE(InfoExtractor): info_dict = {'id': player_info['VID'], 'title': player_info['VTI'], - 'description': player_info['VDE'], + 'description': player_info.get('VDE'), 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), 'thumbnail': player_info['programImage'], 'ext': 'flv', @@ -98,12 +105,14 @@ class ArteTvIE(InfoExtractor): l = 'F' elif lang == 'de': l = 'A' - regexes = [r'VO?%s' % l, r'V%s-ST.' % l] + regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] return any(re.match(r, f['versionCode']) for r in regexes) # Some formats may not be in the same language as the url formats = filter(_match_lang, formats) # We order the formats by quality formats = sorted(formats, key=lambda f: int(f['height'])) + # Prefer videos without subtitles in the same language + formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None) # Pick the best quality format_info = formats[-1] if format_info['mediaType'] == u'rtmp': @@ -144,3 +153,22 @@ class ArteTvIE(InfoExtractor): 'url': video_url, 'ext': 'flv', } + + def _extract_liveweb(self, url, name, lang): + """Extract form http://liveweb.arte.tv/""" + webpage = self._download_webpage(url, name) + video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id') + config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, + video_id, u'Downloading information') + config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + event_doc = config_doc.find('event') + url_node = event_doc.find('video').find('urlHd') + if url_node is None: + url_node = video_doc.find('urlSd') + + return {'id': video_id, + 'title': event_doc.find('name%s' % lang.capitalize()).text, + 'url': url_node.text.replace('MP4', 'mp4'), + 'ext': 'flv', + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 34f555e89..53a898de3 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -1,6 +1,8 @@ import re +import json from .common import InfoExtractor +from ..utils import determine_ext class BreakIE(InfoExtractor): @@ -17,17 +19,20 @@ class BreakIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1).split("-")[-1] - webpage = self._download_webpage(url, video_id) - video_url = re.search(r"videoPath: '(.+?)',",webpage).group(1) - key = re.search(r"icon: '(.+?)',",webpage).group(1) - final_url = str(video_url)+"?"+str(key) - thumbnail_url = re.search(r"thumbnailURL: '(.+?)'",webpage).group(1) - title = re.search(r"sVidTitle: '(.+)',",webpage).group(1) - ext = video_url.split('.')[-1] + embed_url = 'http://www.break.com/embed/%s' % video_id + webpage = self._download_webpage(embed_url, video_id) + info_json = self._search_regex(r'var embedVars = ({.*?});', webpage, + u'info json', flags=re.DOTALL) + info = json.loads(info_json) + video_url = info['videoUri'] + m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) + if m_youtube is not None: + return self.url_result(m_youtube.group(1), 'Youtube') + final_url = video_url + '?' + info['AuthToken'] return [{ 'id': video_id, 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, + 'ext': determine_ext(final_url), + 'title': info['contentName'], + 'thumbnail': info['thumbUri'], }] diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py new file mode 100644 index 000000000..4c8a8af09 --- /dev/null +++ b/youtube_dl/extractor/c56.py @@ -0,0 +1,36 @@ +# coding: utf-8 + +import re +import json + +from .common import InfoExtractor +from ..utils import determine_ext + +class C56IE(InfoExtractor): + _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P.+?)\.(html|swf)' + IE_NAME = u'56.com' + + _TEST ={ + u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html', + u'file': u'93440716.mp4', + u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e', + u'info_dict': { + u'title': u'网事知多少 第32期:车怒', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + text_id = mobj.group('textid') + info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id, + text_id, u'Downloading video info') + info = json.loads(info_page)['info'] + best_format = sorted(info['rfiles'], key=lambda f: int(f['filesize']))[-1] + video_url = best_format['url'] + + return {'id': info['vid'], + 'title': info['Subject'], + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': info.get('bimg') or info.get('img'), + } diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 7ae0972e5..30b9c7549 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,26 +1,26 @@ import re -import socket import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_str, - compat_urllib_error, compat_urllib_parse_urlparse, - compat_urllib_request, ExtractorError, ) class CollegeHumorIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P[0-9]+)/(?P.*)$' + _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P[0-9]+)/?(?P.*)$' - def report_manifest(self, video_id): - """Report information extraction.""" - self.to_screen(u'%s: Downloading XML manifest' % video_id) + _TEST = { + u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', + u'file': u'6902724.mp4', + u'md5': u'1264c12ad95dca142a9f0bf7968105a0', + u'info_dict': { + u'title': u'Comic-Con Cosplay Catastrophe', + u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.', + }, + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -36,14 +36,16 @@ class CollegeHumorIE(InfoExtractor): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - try: - metaXml = compat_urllib_request.urlopen(xmlUrl).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) + metaXml = self._download_webpage(xmlUrl, video_id, + u'Downloading info XML', + u'Unable to download video info XML') mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] + youtubeIdNode = videoNode.find('./youtubeID') + if youtubeIdNode is not None: + return self.url_result(youtubeIdNode.text, 'Youtube') info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text @@ -52,11 +54,9 @@ class CollegeHumorIE(InfoExtractor): raise ExtractorError(u'Invalid metadata XML file') manifest_url += '?hdcore=2.10.3' - self.report_manifest(video_id) - try: - manifestXml = compat_urllib_request.urlopen(manifest_url).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) + manifestXml = self._download_webpage(manifest_url, video_id, + u'Downloading XML manifest', + u'Unable to download video info XML') adoc = xml.etree.ElementTree.fromstring(manifestXml) try: @@ -66,9 +66,8 @@ class CollegeHumorIE(InfoExtractor): except IndexError as err: raise ExtractorError(u'Invalid manifest file') - url_pr = compat_urllib_parse_urlparse(manifest_url) - url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1' + url_pr = compat_urllib_parse_urlparse(info['thumbnail']) - info['url'] = url - info['ext'] = 'f4f' + info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') + info['ext'] = 'mp4' return [info] diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 93d9e3d5e..bf8d711ee 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -24,7 +24,9 @@ class ComedyCentralIE(InfoExtractor): (full-episodes/(?P.*)| (?P (the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) - |(watch/(?P[^/]*)/(?P.*))))) + |(watch/(?P[^/]*)/(?P.*)))| + (?P + extended-interviews/(?P[0-9]+)/playlist_tds_extended_(?P.*?)/.*?))) $""" _TEST = { u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', @@ -87,6 +89,9 @@ class ComedyCentralIE(InfoExtractor): else: epTitle = mobj.group('cntitle') dlNewest = False + elif mobj.group('interview'): + epTitle = mobj.group('interview_title') + dlNewest = False else: dlNewest = not mobj.group('episode') if dlNewest: diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 9bf7a28ca..fa8c630d0 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,9 +1,12 @@ import re import json +import itertools from .common import InfoExtractor from ..utils import ( compat_urllib_request, + get_element_by_attribute, + get_element_by_id, ExtractorError, ) @@ -77,3 +80,31 @@ class DailymotionIE(InfoExtractor): 'ext': video_extension, 'thumbnail': info['thumbnail_url'] }] + + +class DailymotionPlaylistIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' + _MORE_PAGES_INDICATOR = r'' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + video_ids = [] + + for pagenum in itertools.count(1): + webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum), + playlist_id, u'Downloading page %s' % pagenum) + + playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) + video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el)) + + if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: + break + + entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + for video_id in video_ids] + return {'_type': 'playlist', + 'id': playlist_id, + 'title': get_element_by_id(u'playlist_name', webpage), + 'entries': entries, + } diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py new file mode 100644 index 000000000..3443f19c5 --- /dev/null +++ b/youtube_dl/extractor/exfm.py @@ -0,0 +1,54 @@ +import re +import json + +from .common import InfoExtractor + + +class ExfmIE(InfoExtractor): + IE_NAME = u'exfm' + IE_DESC = u'ex.fm' + _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)' + _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream' + _TESTS = [ + { + u'url': u'http://ex.fm/song/1bgtzg', + u'file': u'95223130.mp3', + u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf', + u'info_dict': { + u"title": u"We Can't Stop - Miley Cyrus", + u"uploader": u"Miley Cyrus", + u'upload_date': u'20130603', + u'description': u'Download "We Can\'t Stop" \r\niTunes: http://smarturl.it/WeCantStop?IQid=SC\r\nAmazon: http://smarturl.it/WeCantStopAMZ?IQid=SC', + }, + u'note': u'Soundcloud song', + }, + { + u'url': u'http://ex.fm/song/wddt8', + u'file': u'wddt8.mp3', + u'md5': u'966bd70741ac5b8570d8e45bfaed3643', + u'info_dict': { + u'title': u'Safe and Sound', + u'uploader': u'Capital Cities', + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + song_id = mobj.group(1) + info_url = "http://ex.fm/api/v3/song/%s" %(song_id) + webpage = self._download_webpage(info_url, song_id) + info = json.loads(webpage) + song_url = info['song']['url'] + if re.match(self._SOUNDCLOUD_URL, song_url) is not None: + self.to_screen('Soundcloud song detected') + return self.url_result(song_url.replace('/stream',''), 'Soundcloud') + return [{ + 'id': song_id, + 'url': song_url, + 'ext': 'mp3', + 'title': info['song']['title'], + 'thumbnail': info['song']['image']['large'], + 'uploader': info['song']['artist'], + 'view_count': info['song']['loved_count'], + }] diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 962c59214..652f19b7b 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class InaIE(InfoExtractor): """Information Extractor for Ina.fr""" - _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?PI[0-9]+)/.*' + _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?PI?[A-F0-9]+)/.*' _TEST = { u'url': u'www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', u'file': u'I12055569.mp4', diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index f9ac8d5b4..ddc42882a 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -10,7 +10,8 @@ class InstagramIE(InfoExtractor): u'md5': u'0d2da106a9d2631273e192b372806516', u'info_dict': { u"uploader_id": u"naomipq", - u"title": u"Video by naomipq" + u"title": u"Video by naomipq", + u'description': u'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', } } @@ -18,20 +19,17 @@ class InstagramIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - html_title = self._html_search_regex( - r'(.+?)', - webpage, u'title', flags=re.DOTALL) - title = re.sub(u'(?: *\(Videos?\))? \u2022 Instagram$', '', html_title).strip() - uploader_id = self._html_search_regex( - r'
.*?

([^<]*)

', - webpage, u'uploader id', fatal=False, flags=re.DOTALL) - ext = 'mp4' + uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"', + webpage, u'uploader id', fatal=False) + desc = self._search_regex(r'"caption":"(.*?)"', webpage, u'description', + fatal=False) return [{ 'id': video_id, 'url': self._og_search_video_url(webpage), - 'ext': ext, - 'title': title, + 'ext': 'mp4', + 'title': u'Video by %s' % uploader_id, 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader_id' : uploader_id + 'uploader_id' : uploader_id, + 'description': desc, }] diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py new file mode 100644 index 000000000..8537ba584 --- /dev/null +++ b/youtube_dl/extractor/kankan.py @@ -0,0 +1,37 @@ +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class KankanIE(InfoExtractor): + _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P\d+)\.shtml' + + _TEST = { + u'url': u'http://yinyue.kankan.com/vod/48/48863.shtml', + u'file': u'48863.flv', + u'md5': u'29aca1e47ae68fc28804aca89f29507e', + u'info_dict': { + u'title': u'Ready To Go', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') + gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') + + video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid, + video_id, u'Downloading video url info') + ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip') + path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path') + video_url = 'http://%s%s' % (ip, path) + + return {'id': video_id, + 'title': title, + 'url': video_url, + 'ext': determine_ext(video_url), + } diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index dda78743d..a7b88d2d9 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -4,10 +4,10 @@ from .common import InfoExtractor class KeekIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' IE_NAME = u'keek' _TEST = { - u'url': u'http://www.keek.com/ytdl/keeks/NODfbab', + u'url': u'https://www.keek.com/ytdl/keeks/NODfbab', u'file': u'NODfbab.mp4', u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83', u'info_dict': { diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py new file mode 100644 index 000000000..03e31ea1c --- /dev/null +++ b/youtube_dl/extractor/muzu.py @@ -0,0 +1,64 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + determine_ext, +) + + +class MuzuTVIE(InfoExtractor): + _VALID_URL = r'https?://www.muzu.tv/(.+?)/(.+?)/(?P\d+)' + IE_NAME = u'muzu.tv' + + _TEST = { + u'url': u'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/', + u'file': u'1981454.mp4', + u'md5': u'98f8b2c7bc50578d6a0364fff2bfb000', + u'info_dict': { + u'title': u'Cat Walk (Original Mix)', + u'description': u'md5:90e868994de201b2570e4e5854e19420', + u'uploader': u'MarcAshken featuring SOS', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + info_data = compat_urllib_parse.urlencode({'format': 'json', + 'url': url, + }) + video_info_page = self._download_webpage('http://www.muzu.tv/api/oembed/?%s' % info_data, + video_id, u'Downloading video info') + info = json.loads(video_info_page) + + player_info_page = self._download_webpage('http://player.muzu.tv/player/playerInit?ai=%s' % video_id, + video_id, u'Downloading player info') + video_info = json.loads(player_info_page)['videos'][0] + for quality in ['1080' , '720', '480', '360']: + if video_info.get('v%s' % quality): + break + + data = compat_urllib_parse.urlencode({'ai': video_id, + # Even if each time you watch a video the hash changes, + # it seems to work for different videos, and it will work + # even if you use any non empty string as a hash + 'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k', + 'device': 'web', + 'qv': quality, + }) + video_url_page = self._download_webpage('http://player.muzu.tv/player/requestVideo?%s' % data, + video_id, u'Downloading video url') + video_url_info = json.loads(video_url_page) + video_url = video_url_info['url'] + + return {'id': video_id, + 'title': info['title'], + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': info['thumbnail_url'], + 'description': info['description'], + 'uploader': info['author_name'], + } diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index b2a7b1df0..0404e6e43 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -2,11 +2,13 @@ import binascii import base64 import hashlib import re +import json from .common import InfoExtractor from ..utils import ( compat_ord, compat_urllib_parse, + compat_urllib_request, ExtractorError, ) @@ -16,7 +18,7 @@ from ..utils import ( class MyVideoIE(InfoExtractor): """Information Extractor for myvideo.de.""" - _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*' + _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/([0-9]+)/([^?/]+).*' IE_NAME = u'myvideo' _TEST = { u'url': u'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', @@ -85,6 +87,20 @@ class MyVideoIE(InfoExtractor): 'ext': video_ext, }] + mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage) + if mobj is not None: + request = compat_urllib_request.Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '') + response = self._download_webpage(request, video_id, + u'Downloading video info') + info = json.loads(base64.b64decode(response).decode('utf-8')) + return {'id': video_id, + 'title': info['title'], + 'url': info['streaming_url'].replace('rtmpe', 'rtmpt'), + 'play_path': info['filename'], + 'ext': 'flv', + 'thumbnail': info['thumbnail'][0]['url'], + } + # try encxml mobj = re.search('var flashvars={(.+?)}', webpage) if mobj is None: diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py new file mode 100644 index 000000000..b734722d0 --- /dev/null +++ b/youtube_dl/extractor/ooyala.py @@ -0,0 +1,52 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import unescapeHTML + +class OoyalaIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ooyala\.com/.*?embedCode=(?P.+?)(&|$)' + + _TEST = { + # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video + u'url': u'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', + u'file': u'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8.mp4', + u'md5': u'3f5cceb3a7bf461d6c29dc466cf8033c', + u'info_dict': { + u'title': u'Explaining Data Recovery from Hard Drives and SSDs', + u'description': u'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', + }, + } + + def _extract_result(self, info, more_info): + return {'id': info['embedCode'], + 'ext': 'mp4', + 'title': unescapeHTML(info['title']), + 'url': info['url'], + 'description': unescapeHTML(more_info['description']), + 'thumbnail': more_info['promo'], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + embedCode = mobj.group('id') + player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode + player = self._download_webpage(player_url, embedCode) + mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', + player, u'mobile player url') + mobile_player = self._download_webpage(mobile_url, embedCode) + videos_info = self._search_regex(r'eval\("\((\[{.*?stream_redirect.*?}\])\)"\);', mobile_player, u'info').replace('\\"','"') + videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"') + videos_info = json.loads(videos_info) + videos_more_info =json.loads(videos_more_info) + + if videos_more_info.get('lineup'): + videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] + return {'_type': 'playlist', + 'id': embedCode, + 'title': unescapeHTML(videos_more_info['title']), + 'entries': videos, + } + else: + return self._extract_result(videos_info[0], videos_more_info) + diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py new file mode 100644 index 000000000..d339e6cb5 --- /dev/null +++ b/youtube_dl/extractor/roxwel.py @@ -0,0 +1,49 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import unified_strdate, determine_ext + + +class RoxwelIE(InfoExtractor): + _VALID_URL = r'https?://www\.roxwel\.com/player/(?P.+?)(\.|\?|$)' + + _TEST = { + u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html', + u'file': u'passionpittakeawalklive.flv', + u'md5': u'd9dea8360a1e7d485d2206db7fe13035', + u'info_dict': { + u'title': u'Take A Walk (live)', + u'uploader': u'Passion Pit', + u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', + }, + u'skip': u'Requires rtmpdump', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + filename = mobj.group('filename') + info_url = 'http://www.roxwel.com/api/videos/%s' % filename + info_page = self._download_webpage(info_url, filename, + u'Downloading video info') + + self.report_extraction(filename) + info = json.loads(info_page) + rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) + best_rate = rtmp_rates[-1] + url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) + rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url') + ext = determine_ext(rtmp_url) + if ext == 'f4v': + rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) + + return {'id': filename, + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': info['description'], + 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), + 'uploader': info['artist'], + 'uploader_id': info['artistname'], + 'upload_date': unified_strdate(info['dbdate']), + } diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py new file mode 100644 index 000000000..14b1c656c --- /dev/null +++ b/youtube_dl/extractor/sina.py @@ -0,0 +1,67 @@ +# coding: utf-8 + +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + compat_urllib_parse, +) + + +class SinaIE(InfoExtractor): + _VALID_URL = r'''https?://(.*?\.)?video\.sina\.com\.cn/ + ( + (.+?/(((?P\d+).html)|(.*?(\#|(vid=))(?P\d+?)($|&)))) + | + # This is used by external sites like Weibo + (api/sinawebApi/outplay.php/(?P.+?)\.swf) + ) + ''' + + _TEST = { + u'url': u'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', + u'file': u'110028898.flv', + u'md5': u'd65dd22ddcf44e38ce2bf58a10c3e71f', + u'info_dict': { + u'title': u'《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', + } + } + + @classmethod + def suitable(cls, url): + return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None + + def _extract_video(self, video_id): + data = compat_urllib_parse.urlencode({'vid': video_id}) + url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data, + video_id, u'Downloading video url') + image_page = self._download_webpage( + 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, + video_id, u'Downloading thumbnail info') + url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8')) + + return {'id': video_id, + 'url': url_doc.find('./durl/url').text, + 'ext': 'flv', + 'title': url_doc.find('./vname').text, + 'thumbnail': image_page.split('=')[1], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + video_id = mobj.group('id') + if mobj.group('token') is not None: + # The video id is in the redirected url + self.to_screen(u'Getting video id') + request = compat_urllib_request.Request(url) + request.get_method = lambda: 'HEAD' + (_, urlh) = self._download_webpage_handle(request, 'NA', False) + return self._real_extract(urlh.geturl()) + elif video_id is None: + pseudo_id = mobj.group('pseudo_id') + webpage = self._download_webpage(url, pseudo_id) + video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, u'video id') + + return self._extract_video(video_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index d47c49c03..7c9f1c6b6 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -19,7 +19,11 @@ class SoundcloudIE(InfoExtractor): of the stream token and uid """ - _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)(?:[?].*)?$' + _VALID_URL = r'''^(?:https?://)? + (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$) + |(?:api\.soundcloud\.com/tracks/(?P\d+)) + ) + ''' IE_NAME = u'soundcloud' _TEST = { u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', @@ -33,59 +37,65 @@ class SoundcloudIE(InfoExtractor): } } + _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' + + @classmethod + def suitable(cls, url): + return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None + def report_resolve(self, video_id): """Report information extraction.""" self.to_screen(u'%s: Resolving id' % video_id) - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - # extract uploader (which is in the url) - uploader = mobj.group(1) - # extract simple title (uploader + slug of song title) - slug_title = mobj.group(2) - full_title = '%s/%s' % (uploader, slug_title) - - self.report_resolve(full_title) - - url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) - resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' - info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON') + @classmethod + def _resolv_url(cls, url): + return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID - info = json.loads(info_json) + def _extract_info_dict(self, info, full_title=None): video_id = info['id'] - self.report_extraction(full_title) - - streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' - stream_json = self._download_webpage(streams_url, full_title, - u'Downloading stream definitions', - u'unable to download stream definitions') - - streams = json.loads(stream_json) - mediaURL = streams['http_mp3_128_url'] - upload_date = unified_strdate(info['created_at']) + name = full_title or video_id + self.report_extraction(name) - return [{ + thumbnail = info['artwork_url'] + if thumbnail is not None: + thumbnail = thumbnail.replace('-large', '-t500x500') + return { 'id': info['id'], - 'url': mediaURL, + 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, 'uploader': info['user']['username'], - 'upload_date': upload_date, + 'upload_date': unified_strdate(info['created_at']), 'title': info['title'], 'ext': u'mp3', 'description': info['description'], - }] + 'thumbnail': thumbnail, + } -class SoundcloudSetIE(InfoExtractor): - """Information extractor for soundcloud.com sets - To access the media, the uid of the song and a stream token - must be extracted from the page source and the script must make - a request to media.soundcloud.com/crossdomain.xml. Then - the media can be grabbed by requesting from an url composed - of the stream token and uid - """ + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + track_id = mobj.group('track_id') + if track_id is not None: + info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID + full_title = track_id + else: + # extract uploader (which is in the url) + uploader = mobj.group(1) + # extract simple title (uploader + slug of song title) + slug_title = mobj.group(2) + full_title = '%s/%s' % (uploader, slug_title) + + self.report_resolve(full_title) + + url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) + info_json_url = self._resolv_url(url) + info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON') + info = json.loads(info_json) + return self._extract_info_dict(info, full_title) + +class SoundcloudSetIE(SoundcloudIE): _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$' IE_NAME = u'soundcloud:set' _TEST = { @@ -153,10 +163,6 @@ class SoundcloudSetIE(InfoExtractor): ] } - def report_resolve(self, video_id): - """Report information extraction.""" - self.to_screen(u'%s: Resolving id' % video_id) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -171,7 +177,7 @@ class SoundcloudSetIE(InfoExtractor): self.report_resolve(full_title) url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) - resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' + resolv_url = self._resolv_url(url) info_json = self._download_webpage(resolv_url, full_title) videos = [] @@ -182,23 +188,8 @@ class SoundcloudSetIE(InfoExtractor): return self.report_extraction(full_title) - for track in info['tracks']: - video_id = track['id'] - - streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' - stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON') - - self.report_extraction(video_id) - streams = json.loads(stream_json) - mediaURL = streams['http_mp3_128_url'] - - videos.append({ - 'id': video_id, - 'url': mediaURL, - 'uploader': track['user']['username'], - 'upload_date': unified_strdate(track['created_at']), - 'title': track['title'], - 'ext': u'mp3', - 'description': track['description'], - }) - return videos + return {'_type': 'playlist', + 'entries': [self._extract_info_dict(track) for track in info['tracks']], + 'id': info['id'], + 'title': info['title'], + } diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index ec92e589a..c910110ca 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -33,7 +33,7 @@ class TeamcocoIE(InfoExtractor): data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id data = self._download_webpage(data_url, video_id, 'Downloading data webpage') - video_url = self._html_search_regex(r'(.*?)', + video_url = self._html_search_regex(r']*type="high".*?>(.*?)', data, u'video URL') return [{ diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 8b73b8340..4c11f7a03 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -67,7 +67,7 @@ class TEDIE(InfoExtractor): webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name) self.report_extraction(video_name) # If the url includes the language we get the title translated - title = self._html_search_regex(r'(?P.*)</span>', + title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>', webpage, 'title') json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>', webpage, 'json data') diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e0ffeced5..772134a12 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,19 +6,17 @@ import re from .common import InfoExtractor class TF1IE(InfoExtractor): - """ - TF1 uses the wat.tv player, currently it can only download videos with the - html5 player enabled, it cannot download HD videos. - """ + """TF1 uses the wat.tv player.""" _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html' _TEST = { u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', u'file': u'10635995.mp4', - u'md5': u'66789d3e91278d332f75e1feb7aea327', + u'md5': u'2e378cc28b9957607d5e88f274e637d8', u'info_dict': { u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle', u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', - } + }, + u'skip': u'Sometimes wat serves the whole file with the --test option', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 324bb6231..35f89e9ee 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -4,11 +4,11 @@ from .common import InfoExtractor class TrailerAddictIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/trailer/([^/]+)/(?:trailer|feature-trailer)' + _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' _TEST = { u'url': u'http://www.traileraddict.com/trailer/prince-avalanche/trailer', u'file': u'76184.mp4', - u'md5': u'41365557f3c8c397d091da510e73ceb4', + u'md5': u'57e39dbcf4142ceb8e1f242ff423fd71', u'info_dict': { u"title": u"Prince Avalanche Trailer", u"description": u"Trailer for Prince Avalanche.Two highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind." @@ -17,24 +17,30 @@ class TrailerAddictIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) - webpage = self._download_webpage(url, video_id) - + name = mobj.group('movie') + '/' + mobj.group('trailer_name') + webpage = self._download_webpage(url, name) + title = self._search_regex(r'<title>(.+?)', webpage, 'video title').replace(' - Trailer Addict','') view_count = self._search_regex(r'Views: (.+?)
', webpage, 'Views Count') video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1] - info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id)) + # Presence of (no)watchplus function indicates HD quality is available + if re.search(r'function (no)?watchplus()', webpage): + fvar = "fvarhd" + else: + fvar = "fvar" + + info_url = "http://www.traileraddict.com/%s.php?tid=%s" % (fvar, str(video_id)) info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage") - + final_url = self._search_regex(r'&fileurl=(.+)', info_webpage, 'Download url').replace('%3F','?') thumbnail_url = self._search_regex(r'&image=(.+?)&', info_webpage, 'thumbnail url') ext = final_url.split('.')[-1].split('?')[0] - + return [{ 'id' : video_id, 'url' : final_url, diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 3b16dcfbc..67537eae5 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -35,12 +35,12 @@ class VevoIE(InfoExtractor): self.report_extraction(video_id) video_info = json.loads(info_json) - m_urls = list(re.finditer(r'