X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fudemy.py;h=195f5ce78d308126a1077cda11a4c00b437343fe;hb=760f81212f6cd5608b8c0d42181f6ec6e6c0f24b;hp=160be1b1b913f3b1013ab4d1296239fd110f976a;hpb=b7f8749304aca8e8f338f40a1b2877e4ac732c73;p=youtube-dl.git diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 160be1b1b..195f5ce78 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -15,6 +15,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + js_to_json, sanitized_Request, unescapeHTML, urlencode_postdata, @@ -61,11 +62,11 @@ class UdemyIE(InfoExtractor): def _extract_course_info(self, webpage, video_id): course = self._parse_json( unescapeHTML(self._search_regex( - r'ng-init=["\'].*\bcourse=({.+?});', webpage, 'course', default='{}')), + r'ng-init=["\'].*\bcourse=({.+?})[;"\']', + webpage, 'course', default='{}')), video_id, fatal=False) or {} course_id = course.get('id') or self._search_regex( - (r'"id"\s*:\s*(\d+)', r'data-course-id=["\'](\d+)'), - webpage, 'course id') + r'data-course-id=["\'](\d+)', webpage, 'course id') return course_id, course.get('title') def _enroll_course(self, base_url, webpage, course_id): @@ -73,7 +74,7 @@ class UdemyIE(InfoExtractor): return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url checkout_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1', + r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) if checkout_url: raise ExtractorError( @@ -163,7 +164,7 @@ class UdemyIE(InfoExtractor): }) response = self._download_webpage( - self._LOGIN_URL, None, 'Logging in as %s' % username, + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), headers={ 'Referer': self._ORIGIN_URL, @@ -256,6 +257,11 @@ class UdemyIE(InfoExtractor): video_url = source.get('file') or source.get('src') if not video_url or not isinstance(video_url, compat_str): continue + if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue format_id = source.get('label') f = { 'url': video_url, @@ -268,6 +274,25 @@ class UdemyIE(InfoExtractor): f = add_output_format_meta(f, format_id) formats.append(f) + def extract_subtitles(track_list): + if not isinstance(track_list, list): + return + for track in track_list: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + src = track.get('src') + if not src or not isinstance(src, compat_str): + continue + lang = track.get('language') or track.get( + 'srclang') or track.get('label') + sub_dict = automatic_captions if track.get( + 'autogenerated') is True else subtitles + sub_dict.setdefault(lang, []).append({ + 'url': src, + }) + download_urls = asset.get('download_urls') if isinstance(download_urls, dict): extract_formats(download_urls.get('Video')) @@ -315,23 +340,16 @@ class UdemyIE(InfoExtractor): extract_formats(data.get('sources')) if not duration: duration = int_or_none(data.get('duration')) - tracks = data.get('tracks') - if isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - if track.get('kind') != 'captions': - continue - src = track.get('src') - if not src or not isinstance(src, compat_str): - continue - lang = track.get('language') or track.get( - 'srclang') or track.get('label') - sub_dict = automatic_captions if track.get( - 'autogenerated') is True else subtitles - sub_dict.setdefault(lang, []).append({ - 'url': src, - }) + extract_subtitles(data.get('tracks')) + + if not subtitles and not automatic_captions: + text_tracks = self._parse_json( + self._search_regex( + r'text-tracks=(["\'])(?P\[.+?\])\1', view_html, + 'text tracks', default='{}', group='data'), video_id, + transform_source=lambda s: js_to_json(unescapeHTML(s)), + fatal=False) + extract_subtitles(text_tracks) self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))