Extractor for non-password-protected GDC Vault videos

author David Wagner <mnem@noiseandheat.com>

Wed, 26 Feb 2014 22:25:04 +0000 (22:25 +0000)

committer David Wagner <mnem@noiseandheat.com>

Wed, 26 Feb 2014 22:33:33 +0000 (22:33 +0000)
author David Wagner <mnem@noiseandheat.com>
Wed, 26 Feb 2014 22:25:04 +0000 (22:25 +0000)
committer David Wagner <mnem@noiseandheat.com>
Wed, 26 Feb 2014 22:33:33 +0000 (22:33 +0000)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index e16bbd969cc0164ac60616b9904b9daf7ee6404f..60aa97636d0390724c3fa89903849349ff114aff 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -89,6 +89,7 @@ from .funnyordie import FunnyOrDieIE
  from .gamekings import GamekingsIE
  from .gamespot import GameSpotIE
  from .gametrailers import GametrailersIE
+from .gdcvault import GDCVaultIE
  from .generic import GenericIE
  from .googleplus import GooglePlusIE
  from .googlesearch import GoogleSearchIE
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py

new file mode 100644 (file)

index 0000000..a1e4c32
--- /dev/null
+++ b/youtube_dl/extractor/gdcvault.py
@@ -0,0 +1,95 @@
+from __future__ import unicode_literals
+
+import re
+import json
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unified_strdate,
+)
+
+
+class GDCVaultIE(InfoExtractor):
+    """Information extractor for GDC Vault videos that need no login.
+
+    The play page embeds a player iframe; its URL points at an XML
+    description that carries the title and the available MP4 streams.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
+
+    _TEST = {
+        u'url': u'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
+        u'md5': u'7ce8388f544c88b7ac11c7ab1b593704',
+        u'info_dict': {
+            u"id": u"1019721",
+            u"ext": u"mp4",
+            u"title": u"Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)"
+        }
+    }
+
+    @staticmethod
+    def _required_text(parent, path):
+        """Return the text of the element found at `path` under `parent`.
+
+        Raises ExtractorError when the element is absent or has no text,
+        so that a changed XML layout is reported clearly instead of
+        surfacing as an AttributeError on None.
+        """
+        node = parent.find(path)
+        if node is None or not node.text:
+            raise ExtractorError('Unable to find %s in the XML description' % path)
+        return node.text
+
+    def _real_extract(self, url):
+        """Extract the title and MP4 formats for the video at `url`.
+
+        Returns a one-element list holding the info dict ('id', 'title',
+        'formats'); each format has a 'url' and an integer 'vbr'.
+        Raises ExtractorError when the XML description lacks an expected
+        element or holds a URL/stream name of an unexpected shape.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage_url = 'http://www.gdcvault.com/play/' + video_id
+
+        start_page = self._download_webpage(webpage_url, video_id)
+
+        self.report_extraction(video_id)
+
+        # The iframe URL has the form <xml_root>player.html?xml=<file>.xml...
+        xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root')
+        xml_name = self._html_search_regex(r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
+        xml_description_url = xml_root + 'xml/' + xml_name
+
+        xml_description = self._download_xml(xml_description_url, video_id)
+
+        video_title = self._required_text(xml_description, './metadata/title')
+
+        # Keep the default MP4 URL only up to the first '/' after the scheme;
+        # the per-bitrate stream paths below are appended to that root.
+        mp4_video = self._required_text(xml_description, './metadata/mp4video')
+        root_match = re.match(r'(?P<root>https?://.*?/).*', mp4_video)
+        if root_match is None:
+            raise ExtractorError('Unexpected mp4video URL: %s' % mp4_video)
+        video_root = root_match.group('root')
+
+        video_formats = []
+        for stream in xml_description.findall('./metadata/MBRVideos/MBRVideo'):
+            stream_name = self._required_text(stream, 'streamName')
+            # Stream names are expected to look like "mp4:<path>".
+            path_match = re.match(r'mp4\:(?P<path>.*)', stream_name)
+            if path_match is None:
+                raise ExtractorError('Unexpected stream name: %s' % stream_name)
+            video_formats.append({
+                'url': video_root + path_match.group('path'),
+                'vbr': int(self._required_text(stream, 'bitrate')),
+            })
+
+        return [{
+            'id': video_id,
+            'formats': video_formats,
+            'title': video_title,
+        }]
author	David Wagner <mnem@noiseandheat.com>
	Wed, 26 Feb 2014 22:25:04 +0000 (22:25 +0000)
committer	David Wagner <mnem@noiseandheat.com>
	Wed, 26 Feb 2014 22:33:33 +0000 (22:33 +0000)
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/gdcvault.py	[new file with mode: 0644]	patch \| blob