X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=a220de80a4b62610efc483f50e45ffed5eb47279;hb=ae608b8076497d70e2a95e5e939c1fb31e2dde53;hp=83be8313f1d3591dd936491ac0e1087791f92ab5;hpb=18be482a6f0b0d48c4fd8101ba0f0e30ac782d79;p=youtube-dl.git
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 83be8313f..a220de80a 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -5,6 +5,7 @@ from __future__ import absolute_import
import base64
import datetime
+import itertools
import netrc
import os
import re
@@ -35,15 +36,16 @@ class InfoExtractor(object):
url: Final video URL.
title: Video title, unescaped.
ext: Video filename extension.
- uploader: Full name of the video uploader.
- upload_date: Video upload date (YYYYMMDD).
The following fields are optional:
format: The video format, defaults to ext (used for --get-format)
thumbnail: Full URL to a video thumbnail image.
description: One-line video description.
+ uploader: Full name of the video uploader.
+ upload_date: Video upload date (YYYYMMDD).
uploader_id: Nickname or id of the video uploader.
+ location: Physical location of the video.
player_url: SWF Player URL (used for rtmpdump).
subtitles: The .srt file contents.
urlhandle: [internal] The urlHandle to be used to download the file,
@@ -106,19 +108,24 @@ class InfoExtractor(object):
def IE_NAME(self):
return type(self).__name__[:-2]
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ """ Returns the response handle """
if note is None:
note = u'Downloading video webpage'
self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
try:
- urlh = compat_urllib_request.urlopen(url_or_request)
- webpage_bytes = urlh.read()
- return webpage_bytes.decode('utf-8', 'replace')
+ return compat_urllib_request.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ """ Returns the data of the page as a string """
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote)
+ webpage_bytes = urlh.read()
+ return webpage_bytes.decode('utf-8', 'replace')
+
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
@@ -144,7 +151,7 @@ class YoutubeIE(InfoExtractor):
(?(1).+)? # if we found the ID, everything can follow
$"""
_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
- _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
+ _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
_NETRC_MACHINE = 'youtube'
@@ -209,6 +216,10 @@ class YoutubeIE(InfoExtractor):
"""Report attempt to download video info webpage."""
self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
+ def report_video_subtitles_request(self, video_id, lang):
+ """Report attempt to download the video subtitles for the given language."""
+ self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for lang: %s' % (video_id,lang))
+
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
@@ -221,25 +232,7 @@ class YoutubeIE(InfoExtractor):
"""Indicate the download will use the RTMP protocol."""
self._downloader.to_screen(u'[youtube] RTMP download detected')
- def _closed_captions_xml_to_srt(self, xml_string):
- srt = ''
- texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE)
- # TODO parse xml instead of regex
- for n, (start, dur_tag, dur, caption) in enumerate(texts):
- if not dur: dur = '4'
- start = float(start)
- end = start + float(dur)
- start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
- end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
- caption = unescapeHTML(caption)
- caption = unescapeHTML(caption) # double cycle, intentional
- srt += str(n+1) + '\n'
- srt += start + ' --> ' + end + '\n'
- srt += caption + '\n\n'
- return srt
-
- def _extract_subtitles(self, video_id):
- self.report_video_subtitles_download(video_id)
+ def _get_available_subtitles(self, video_id):
request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
try:
srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
@@ -249,6 +242,29 @@ class YoutubeIE(InfoExtractor):
srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
if not srt_lang_list:
return (u'WARNING: video has no closed captions', None)
+ return srt_lang_list
+
+ def _request_subtitle(self, str_lang, str_name, video_id, format = 'srt'):
+ self.report_video_subtitles_request(video_id, str_lang)
+ params = compat_urllib_parse.urlencode({
+ 'lang': str_lang,
+ 'name': str_name,
+ 'v': video_id,
+ 'fmt': format,
+ })
+ url = 'http://www.youtube.com/api/timedtext?' + params
+ try:
+ srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+ if not srt:
+ return (u'WARNING: Did not fetch video subtitles', None)
+ return (None, str_lang, srt)
+
+ def _extract_subtitle(self, video_id):
+ self.report_video_subtitles_download(video_id)
+ srt_lang_list = self._get_available_subtitles(video_id)
+
if self._downloader.params.get('subtitleslang', False):
srt_lang = self._downloader.params.get('subtitleslang')
elif 'en' in srt_lang_list:
@@ -256,15 +272,19 @@ class YoutubeIE(InfoExtractor):
else:
srt_lang = list(srt_lang_list.keys())[0]
if not srt_lang in srt_lang_list:
- return (u'WARNING: no closed captions found in the specified language', None)
- request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
- try:
- srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
- if not srt_xml:
- return (u'WARNING: unable to download video subtitles', None)
- return (None, self._closed_captions_xml_to_srt(srt_xml))
+ return (u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, None)
+
+ sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id)
+ return [sub]
+
+ def _extract_all_subtitles(self, video_id):
+ self.report_video_subtitles_download(video_id)
+ srt_lang_list = self._get_available_subtitles(video_id)
+ subs = []
+ for srt_lang in srt_lang_list:
+ sub = self._request_subtitle(srt_lang, srt_lang_list[srt_lang].encode('utf-8'), video_id)
+ subs.append(sub)
+ return subs
def _print_formats(self, formats):
print('Available formats:')
@@ -308,19 +328,54 @@ class YoutubeIE(InfoExtractor):
if username is None:
return
+ request = compat_urllib_request.Request(self._LOGIN_URL)
+ try:
+ login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
+ return
+
+ galx = None
+ dsh = None
+ match = re.search(re.compile(r']* name="loginForm"', login_results) is not None:
+ if re.search(r'(?i)