from __future__ import unicode_literals import re import json import xml.etree.ElementTree from .common import InfoExtractor from ..utils import unified_strdate class GDCVaultIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P\d+)/(?P(\w|-)+)' _TESTS = [ { u'url': u'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of', u'md5': u'05763e5edd1a74776999a12b02ee1c4e', u'info_dict': { u"id": u"1015683", u"ext": u"flv", u"title": u"Embracing the Dark Art of Mathematical Modeling in AI" } }, { u'url': u'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple', u'md5': u'7ce8388f544c88b7ac11c7ab1b593704', u'info_dict': { u"id": u"1019721", u"ext": u"mp4", u"title": u"Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)" } }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage_url = 'http://www.gdcvault.com/play/' + video_id start_page = self._download_webpage(webpage_url, video_id) self.report_extraction(video_id) xml_root = self._html_search_regex(r'', start_page, 'xml filename', None, False) if xml_name is None: # Fallback to the older format xml_name = self._html_search_regex(r'