From: Yen Chi Hsuan Date: Fri, 25 Sep 2015 09:02:10 +0000 (+0800) Subject: Merge branch 'fktv' of https://github.com/remitamine/youtube-dl into remitamine-fktv X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=commitdiff_plain;h=dade7245af16c4167575bb3759e7e64011602043;hp=7b4137c351222a94f46f854bf490a299e4124acc;p=youtube-dl.git Merge branch 'fktv' of https://github.com/remitamine/youtube-dl into remitamine-fktv --- diff --git a/AUTHORS b/AUTHORS index d1693224e..901c1b263 100644 --- a/AUTHORS +++ b/AUTHORS @@ -140,3 +140,6 @@ Behrouz Abbasi ngld nyuszika7h Shaun Walbridge +Lee Jenkins +Anssi Hannula +Lukáš Lalinský diff --git a/README.md b/README.md index 24bfe38a2..2ed751791 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ youtube-dl - download videos from youtube.com or other video platforms - [VIDEO SELECTION](#video-selection) - [FAQ](#faq) - [DEVELOPER INSTRUCTIONS](#developer-instructions) +- [EMBEDDING YOUTUBE-DL](#embedding-youtube-dl) - [BUGS](#bugs) - [COPYRIGHT](#copyright) @@ -261,7 +262,7 @@ For example: machine youtube login myaccount@gmail.com password my_youtube_password machine twitch login my_twitch_account_name password my_twitch_password ``` -To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or to place it in [configuration file](#configuration). +To activate authentication with `.netrc` file you should pass `--netrc` to youtube-dl or place it in [configuration file](#configuration). On Windows you may also need to setup `%HOME%` environment variable manually. @@ -277,8 +278,8 @@ The `-o` option allows users to indicate a template for the output file names. T - `ext`: The sequence will be replaced by the appropriate extension (like flv or mp4). - `epoch`: The sequence will be replaced by the Unix epoch when creating the file. - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero. - - `playlist`: The name or the id of the playlist that contains the video. - - `playlist_index`: The index of the video in the playlist, a five-digit number. + - `playlist`: The sequence will be replaced by the name or the id of the playlist that contains the video. + - `playlist_index`: The sequence will be replaced by the index of the video in the playlist padded with leading zeros according to the total length of the playlist. - `format_id`: The sequence will be replaced by the format code specified by `--format`. The current default template is `%(title)s-%(id)s.%(ext)s`. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 04b9959ac..ab153af6b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -122,7 +122,6 @@ - **defense.gouv.fr** - **DHM**: Filmarchiv - Deutsches Historisches Museum - **Discovery** - - **divxstage**: DivxStage - **Dotsub** - **DouyuTV**: 斗鱼 - **dramafever** @@ -195,7 +194,7 @@ - **GodTube** - **GoldenMoustache** - **Golem** - - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in, fastvideo.in and realvid.net + - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net and filehoot.com - **Goshgay** - **Groupon** - **Hark** @@ -286,7 +285,7 @@ - **Minhateca** - **MinistryGrid** - **miomio.tv** - - **mitele.es** + - **MiTele**: mitele.es - **mixcloud** - **MLB** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -309,7 +308,6 @@ - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** - - **MusicVault** - **muzu.tv** - **Mwave** - **MySpace** @@ -318,7 +316,6 @@ - **Myvi** - **myvideo** - **MyVidster** - - **N-JOY** - **n-tv.de** - **NationalGeographic** - **Naver** @@ -327,7 +324,9 @@ - **NBCNews** - **NBCSports** - **NBCSportsVPlayer** - - **ndr**: NDR.de - Mediathek + - **ndr**: NDR.de - Norddeutscher Rundfunk + - **ndr:embed** + - **ndr:embed:base** - **NDTV** - **NerdCubedFeed** - **Nerdist** @@ -350,12 +349,16 @@ - **nhl.com:videocenter**: NHL videocenter category - **niconico**: ニコニコ動画 - **NiconicoPlaylist** + - **njoy**: N-JOY + - **njoy:embed** - **Noco** - **Normalboots** - **NosVideo** - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - **novamov**: NovaMov - - **Nowness** + - **nowness** + - **nowness:playlist** + - **nowness:series** - **NowTV** - **nowvideo**: NowVideo - **npo**: npo.nl and ntr.nl @@ -376,7 +379,6 @@ - **OnionStudios** - **Ooyala** - **OoyalaExternal** - - **OpenFilm** - **orf:fm4**: radio FM4 - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 @@ -531,7 +533,7 @@ - **techtv.mit.edu** - **ted** - **TeleBruxelles** - - **telecinco.es** + - **Telecinco**: telecinco.es, cuatro.com and mediaset.es - **Telegraaf** - **TeleMB** - **TeleTask** @@ -633,6 +635,7 @@ - **vine:user** - **vk**: VK - **vk:uservideos**: VK - User's Videos + - **vlive** - **Vodlocker** - **VoiceRepublic** - **Vporn** diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index e32bef279..1ff42d94b 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -80,6 +80,11 @@ try: except ImportError: import BaseHTTPServer as compat_http_server +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote @@ -100,7 +105,7 @@ except ImportError: # Python 2 # Is it a string-like object? string.split return b'' - if isinstance(string, unicode): + if isinstance(string, compat_str): string = string.encode('utf-8') bits = string.split(b'%') if len(bits) == 1: @@ -150,11 +155,6 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - try: compat_basestring = basestring # Python 2 except NameError: @@ -234,7 +234,7 @@ else: # Working around shlex issue with unicode strings on some python 2 # versions (see http://bugs.python.org/issue1548891) def compat_shlex_split(s, comments=False, posix=True): - if isinstance(s, unicode): + if isinstance(s, compat_str): s = s.encode('utf-8') return shlex.split(s, comments, posix) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 71aafdc73..a62d2047b 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -28,10 +28,19 @@ class HlsFD(FileDownloader): return False ffpp.check_version() - args = [ - encodeArgument(opt) - for opt in (ffpp.executable, '-y', '-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc')] - args.append(encodeFilename(tmpfilename, True)) + args = [ffpp.executable, '-y'] + + if info_dict['http_headers']: + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + args += [ + '-headers', + ''.join('%s: %s\r\n' % (key, val) for key, val in info_dict['http_headers'].items())] + + args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] + + args = [encodeArgument(opt) for opt in args] + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) self._debug_cmd(args) @@ -92,6 +101,7 @@ class NativeHlsFD(FragmentFD): return False down, frag_sanitized = sanitize_open(frag_filename, 'rb') ctx['dest_stream'].write(down.read()) + down.close() frags_filenames.append(frag_sanitized) self._finish_frag_download(ctx) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f8d4c8462..a73a1317e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -138,7 +138,6 @@ from .dump import DumpIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE -from .divxstage import DivxStageIE from .dropbox import DropboxIE from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE @@ -226,7 +225,6 @@ from .historicfilms import HistoricFilmsIE from .history import HistoryIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE -from .hostingbulk import HostingBulkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE @@ -364,6 +362,9 @@ from .nbc import ( from .ndr import ( NDRIE, NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, ) from .ndtv import NDTVIE from .netzkino import NetzkinoIE @@ -399,7 +400,11 @@ from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .nova import NovaIE from .novamov import NovaMovIE -from .nowness import NownessIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) from .nowtv import NowTVIE from .nowvideo import NowVideoIE from .npo import ( @@ -429,7 +434,6 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) -from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, ORFOE1IE, diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 76de24477..2a00da3ee 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( find_xpath_attr, unified_strdate, @@ -77,7 +81,13 @@ class ArteTVPlus7IE(InfoExtractor): def _extract_from_webpage(self, webpage, video_id, lang): json_url = self._html_search_regex( [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], - webpage, 'json vp url') + webpage, 'json vp url', default=None) + if not json_url: + iframe_url = self._html_search_regex( + r']+src=(["\'])(?P.+\bjson_url=.+?)\1', + webpage, 'iframe url', group='url') + json_url = compat_parse_qs( + compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] return self._extract_from_json_url(json_url, video_id, lang) def _extract_from_json_url(self, json_url, video_id, lang): diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index abc5a44a1..42526357a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -21,6 +21,7 @@ class BBCCoUkIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' _MEDIASELECTOR_URLS = [ + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', ] @@ -189,6 +190,12 @@ class BBCCoUkIE(InfoExtractor): # Skip DASH until supported elif transfer_format == 'dash': pass + elif transfer_format == 'hls': + m3u8_formats = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=supplier, fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) # Direct link else: formats.append({ diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 14f215c5c..1dfa7c12e 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -12,9 +12,9 @@ from ..utils import ( class ClubicIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P[0-9]+)\.html' + _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P[0-9]+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', 'md5': '1592b694ba586036efac1776b0b43cd3', 'info_dict': { @@ -24,7 +24,10 @@ class ClubicIE(InfoExtractor): 'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*', 'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$', } - } + }, { + 'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5eeeda08d..1e7db8a9b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -516,6 +516,12 @@ class InfoExtractor(object): '%s. Use --username and --password or --netrc to provide account credentials.' % msg, expected=True) + @staticmethod + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): + raise ExtractorError( + '%s. You might want to use --proxy to workaround.' % msg, + expected=True) + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): @@ -731,8 +737,9 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): + html = re.sub(r'', '', html) hidden_inputs = {} - for input in re.findall(r']+)>', html): + for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue name = re.search(r'name=(["\'])(?P.+?)\1', input) @@ -746,7 +753,7 @@ class InfoExtractor(object): def _form_hidden_inputs(self, form_id, html): form = self._search_regex( - r'(?s)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, + r'(?is)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, html, '%s form' % form_id, group='form') return self._hidden_inputs(form) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 3db4db4e4..d6949ca28 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..compat import ( @@ -24,16 +23,28 @@ class CondeNastIE(InfoExtractor): # The keys are the supported sites and the values are the name to be shown # to the user and in the extractor description. _SITES = { - 'wired': 'WIRED', + 'allure': 'Allure', + 'architecturaldigest': 'Architectural Digest', + 'arstechnica': 'Ars Technica', + 'bonappetit': 'Bon Appétit', + 'brides': 'Brides', + 'cnevids': 'Condé Nast', + 'cntraveler': 'Condé Nast Traveler', + 'details': 'Details', + 'epicurious': 'Epicurious', + 'glamour': 'Glamour', + 'golfdigest': 'Golf Digest', 'gq': 'GQ', + 'newyorker': 'The New Yorker', + 'self': 'SELF', + 'teenvogue': 'Teen Vogue', + 'vanityfair': 'Vanity Fair', 'vogue': 'Vogue', - 'glamour': 'Glamour', + 'wired': 'WIRED', 'wmagazine': 'W Magazine', - 'vanityfair': 'Vanity Fair', - 'cnevids': 'Condé Nast', } - _VALID_URL = r'http://(video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'http://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) EMBED_URL = r'(?:https?:)?//player\.(?P%s)\.com/(?Pembed)/.+?' % '|'.join(_SITES.keys()) @@ -86,8 +97,8 @@ class CondeNastIE(InfoExtractor): info_url = base_info_url + data info_page = self._download_webpage(info_url, video_id, 'Downloading video info') - video_info = self._search_regex(r'var video = ({.+?});', info_page, 'video info') - video_info = json.loads(video_info) + video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info') + video_info = self._parse_json(video_info, video_id) formats = [{ 'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']), diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py deleted file mode 100644 index b88379e06..000000000 --- a/youtube_dl/extractor/divxstage.py +++ /dev/null @@ -1,27 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class DivxStageIE(NovaMovIE): - IE_NAME = 'divxstage' - IE_DESC = 'DivxStage' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag|to)'} - - _HOST = 'www.divxstage.eu' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'
\s*([^<]+)' - _DESCRIPTION_REGEX = r'
\s*[^<]+\s*

([^<]+)

' - - _TEST = { - 'url': 'http://www.divxstage.eu/video/57f238e2e5e01', - 'md5': '63969f6eb26533a1968c4d325be63e72', - 'info_dict': { - 'id': '57f238e2e5e01', - 'ext': 'flv', - 'title': 'youtubedl test video', - 'description': 'This is a test video for youtubedl.', - } - } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec748ed9f..8881a8a23 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import os import re +import sys from .common import InfoExtractor from .youtube import YoutubeIE @@ -230,6 +231,22 @@ class GenericIE(InfoExtractor): 'skip_download': False, } }, + { + # redirect in Refresh HTTP header + 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', + 'info_dict': { + 'id': 'pO8h3EaFRdo', + 'ext': 'mp4', + 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', + 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', + 'upload_date': '20150917', + 'uploader_id': 'brtvofficial', + 'uploader': 'Boiler Room', + }, + 'params': { + 'skip_download': False, + }, + }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', @@ -1808,6 +1825,9 @@ class GenericIE(InfoExtractor): # Look also in Refresh HTTP header refresh_header = head_response.headers.get('Refresh') if refresh_header: + # In python 2 response HTTP headers are bytestrings + if sys.version_info < (3, 0) and isinstance(refresh_header, str): + refresh_header = refresh_header.decode('iso-8859-1') found = re.search(REDIRECT_REGEX, refresh_header) if found: new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py deleted file mode 100644 index a3154cfde..000000000 --- a/youtube_dl/extractor/hostingbulk.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_request, -) -from ..utils import ( - ExtractorError, - int_or_none, - urlencode_postdata, -) - - -class HostingBulkIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://(?:www\.)?hostingbulk\.com/ - (?:embed-)?(?P[A-Za-z0-9]{12})(?:-\d+x\d+)?\.html''' - _FILE_DELETED_REGEX = r'File Not Found' - _TEST = { - 'url': 'http://hostingbulk.com/n0ulw1hv20fm.html', - 'md5': '6c8653c8ecf7ebfa83b76e24b7b2fe3f', - 'info_dict': { - 'id': 'n0ulw1hv20fm', - 'ext': 'mp4', - 'title': 'md5:5afeba33f48ec87219c269e054afd622', - 'filesize': 6816081, - 'thumbnail': 're:^http://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'http://hostingbulk.com/{0:}.html'.format(video_id) - - # Custom request with cookie to set language to English, so our file - # deleted regex would work. - request = compat_urllib_request.Request( - url, headers={'Cookie': 'lang=english'}) - webpage = self._download_webpage(request, video_id) - - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - title = self._html_search_regex(r'

(.*?)

', webpage, 'title') - filesize = int_or_none( - self._search_regex( - r'\((\d+)\sbytes?\)', - webpage, - 'filesize', - fatal=False - ) - ) - thumbnail = self._search_regex( - r'(.+?)(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)', - webpage, 'title') + title = remove_end(self._og_search_title(webpage), ' - via Iconosquare') timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time')) description = media.get('caption', {}).get('text') @@ -61,6 +62,14 @@ class IconosquareIE(InfoExtractor): 'height': int_or_none(t.get('height')) } for thumbnail_id, t in media.get('images', {}).items()] + comments = [{ + 'id': comment.get('id'), + 'text': comment['text'], + 'timestamp': int_or_none(comment.get('created_time')), + 'author': comment.get('from', {}).get('full_name'), + 'author_id': comment.get('from', {}).get('username'), + } for comment in media.get('comments', {}).get('data', []) if 'text' in comment] + return { 'id': video_id, 'title': title, @@ -72,4 +81,5 @@ class IconosquareIE(InfoExtractor): 'comment_count': comment_count, 'like_count': like_count, 'formats': formats, + 'comments': comments, } diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 393e67e35..ce1ab3820 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -95,6 +95,10 @@ class IqiyiIE(InfoExtractor): ('10', 'h1'), ] + @staticmethod + def md5_text(text): + return hashlib.md5(text.encode('utf-8')).hexdigest() + def construct_video_urls(self, data, video_id, _uuid): def do_xor(x, y): a = y % 3 @@ -121,7 +125,7 @@ class IqiyiIE(InfoExtractor): note='Download path key of segment %d for format %s' % (segment_index + 1, format_id) )['t'] t = str(int(math.floor(int(tm) / (600.0)))) - return hashlib.md5((t + mg + x).encode('utf8')).hexdigest() + return self.md5_text(t + mg + x) video_urls_dict = {} for format_item in data['vp']['tkl'][0]['vs']: @@ -179,20 +183,19 @@ class IqiyiIE(InfoExtractor): def get_raw_data(self, tvid, video_id, enc_key, _uuid): tm = str(int(time.time())) + tail = tm + tvid param = { 'key': 'fvip', - 'src': hashlib.md5(b'youtube-dl').hexdigest(), + 'src': self.md5_text('youtube-dl'), 'tvId': tvid, 'vid': video_id, 'vinfo': 1, 'tm': tm, - 'enc': hashlib.md5( - (enc_key + tm + tvid).encode('utf8')).hexdigest(), + 'enc': self.md5_text((enc_key + tail)[1:64:2] + tail), 'qyid': _uuid, 'tn': random.random(), 'um': 0, - 'authkey': hashlib.md5( - (tm + tvid).encode('utf8')).hexdigest() + 'authkey': self.md5_text(self.md5_text('') + tail), } api_url = 'http://cache.video.qiyi.com/vms' + '?' + \ @@ -201,7 +204,8 @@ class IqiyiIE(InfoExtractor): return raw_data def get_enc_key(self, swf_url, video_id): - enc_key = '3601ba290e4f4662848c710e2122007e' # last update at 2015-08-10 for Zombie + # TODO: automatic key extraction + enc_key = 'eac64f22daf001da6ba9aa8da4d501508bbe90a4d4091fea3b0582a85b38c2cc' # last update at 2015-09-23-23 for Zombie::bite return enc_key def _real_extract(self, url): diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 852d72266..54993e2c9 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,74 +1,85 @@ from __future__ import unicode_literals -import json - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_parse_unquote, - compat_urlparse, -) +from ..compat import compat_urllib_parse from ..utils import ( + encode_dict, get_element_by_attribute, - parse_duration, - strip_jsonp, + int_or_none, ) class MiTeleIE(InfoExtractor): - IE_NAME = 'mitele.es' + IE_DESC = 'mitele.es' _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P[^/]+)/' _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', + 'md5': 'ace7635b2a0b286aaa37d3ff192d2a8a', 'info_dict': { - 'id': '0fce117d', - 'ext': 'mp4', - 'title': 'Programa 144 - Tor, la web invisible', - 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', + 'ext': 'flv', + 'title': 'Tor, la web invisible', + 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'thumbnail': 're:(?i)^https?://.*\.jpg$', 'duration': 2913, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }] def _real_extract(self, url): - episode = self._match_id(url) - webpage = self._download_webpage(url, episode) - embed_data_json = self._search_regex( - r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', - ).replace('\'', '"') - embed_data = json.loads(embed_data_json) + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) - domain = embed_data['mediaUrl'] - if not domain.startswith('http'): - # only happens in telecinco.es videos - domain = 'http://' + domain - info_url = compat_urlparse.urljoin( - domain, - compat_urllib_parse_unquote(embed_data['flashvars']['host']) - ) - info_el = self._download_xml(info_url, episode).find('./video/info') + config_url = self._search_regex( + r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') - video_link = info_el.find('videoUrl/link').text - token_query = compat_urllib_parse.urlencode({'id': video_link}) - token_info = self._download_json( - embed_data['flashvars']['ov_tk'] + '?' + token_query, - episode, - transform_source=strip_jsonp - ) - formats = self._extract_m3u8_formats( - token_info['tokenizedUrl'], episode, ext='mp4') + config = self._download_json( + config_url, display_id, 'Downloading config JSON') + + mmc = self._download_json( + config['services']['mmc'], display_id, 'Downloading mmc JSON') + + formats = [] + for location in mmc['locations']: + gat = self._proto_relative_url(location.get('gat'), 'http:') + bas = location.get('bas') + loc = location.get('loc') + ogn = location.get('ogn') + if None in (gat, bas, loc, ogn): + continue + token_data = { + 'bas': bas, + 'icd': loc, + 'ogn': ogn, + 'sta': '0', + } + media = self._download_json( + '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data)).encode('utf-8')), + display_id, 'Downloading %s JSON' % location['loc']) + file_ = media.get('file') + if not file_: + continue + formats.extend(self._extract_f4m_formats( + file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + display_id, f4m_id=loc)) + + title = self._search_regex( + r'class="Destacado-text"[^>]*>\s*([^<]+)', webpage, 'title') + + video_id = self._search_regex( + r'data-media-id\s*=\s*"([^"]+)"', webpage, + 'data media id', default=None) or display_id + thumbnail = config.get('poster', {}).get('imageUrl') + duration = int_or_none(mmc.get('duration')) return { - 'id': embed_data['videoId'], - 'display_id': episode, - 'title': info_el.find('title').text, - 'formats': formats, + 'id': video_id, + 'display_id': display_id, + 'title': title, 'description': get_element_by_attribute('class', 'text', webpage), - 'thumbnail': info_el.find('thumb').text, - 'duration': parse_duration(info_el.find('duration').text), + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 79a13958b..e3cc6fde8 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -1,130 +1,380 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( - ExtractorError, + determine_ext, int_or_none, + parse_iso8601, qualities, - parse_duration, ) class NDRBaseIE(InfoExtractor): def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return self._extract_embed(webpage, display_id) + - page = self._download_webpage(url, video_id, 'Downloading page') +class NDRIE(NDRBaseIE): + IE_NAME = 'ndr' + IE_DESC = 'NDR.de - Norddeutscher Rundfunk' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P[^/?#]+),[\da-z]+\.html' + _TESTS = [{ + # httpVideo, same content id + 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', + 'md5': '6515bc255dc5c5f8c85bbc38e035a659', + 'info_dict': { + 'id': 'hafengeburtstag988', + 'display_id': 'Party-Poette-und-Parade', + 'ext': 'mp4', + 'title': 'Party, Pötte und Parade', + 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', + 'uploader': 'ndrtv', + 'timestamp': 1431108900, + 'upload_date': '20150510', + 'duration': 3498, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpVideo, different content id + 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', + 'md5': '1043ff203eab307f0c51702ec49e9a71', + 'info_dict': { + 'id': 'osna272', + 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch', + 'ext': 'mp4', + 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights', + 'description': 'md5:32e9b800b3d2d4008103752682d5dc01', + 'uploader': 'ndrtv', + 'timestamp': 1442059200, + 'upload_date': '20150912', + 'duration': 510, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpAudio, same content id + 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', + 'info_dict': { + 'id': 'audio51535', + 'display_id': 'La-Valette-entgeht-der-Hinrichtung', + 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'uploader': 'ndrinfo', + 'timestamp': 1290626100, + 'upload_date': '20140729', + 'duration': 884, + }, + 'params': { + 'skip_download': True, + }, + }] - title = self._og_search_title(page).strip() - description = self._og_search_description(page) - if description: - description = description.strip() + def _extract_embed(self, webpage, display_id): + embed_url = self._html_search_meta( + 'embedURL', webpage, 'embed URL', fatal=True) + description = self._search_regex( + r']+itemprop="description">([^<]+)

', + webpage, 'description', fatal=False) + timestamp = parse_iso8601( + self._search_regex( + r'
\s*

([^<]+)

', + webpage, 'description', fatal=False) + return { + '_type': 'url_transparent', + 'ie_key': 'NDREmbedBase', + 'url': 'ndr:%s' % video_id, + 'display_id': display_id, + 'description': description, + } - mp3_url = re.search(r'''\{src:'(?P