3 import xml.etree.ElementTree
5 from .common import InfoExtractor
10 compat_urllib_parse_urlparse,
11 compat_urllib_request,
17 class CollegeHumorIE(InfoExtractor):
19 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
def report_manifest(self, video_id):
    """Print a status line saying the XML manifest is being downloaded.

    The line is prefixed with ``video_id`` and emitted through
    ``self.to_screen``.
    """
    status_line = u'%s: Downloading XML manifest' % video_id
    self.to_screen(status_line)
25 def _real_extract(self, url):
26 mobj = re.match(self._VALID_URL, url)
28 raise ExtractorError(u'Invalid URL: %s' % url)
29 video_id = mobj.group('videoid')
37 self.report_extraction(video_id)
38 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
40 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
41 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
42 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
44 mdoc = xml.etree.ElementTree.fromstring(metaXml)
46 videoNode = mdoc.findall('./video')[0]
47 info['description'] = videoNode.findall('./description')[0].text
48 info['title'] = videoNode.findall('./caption')[0].text
49 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
50 manifest_url = videoNode.findall('./file')[0].text
52 raise ExtractorError(u'Invalid metadata XML file')
54 manifest_url += '?hdcore=2.10.3'
55 self.report_manifest(video_id)
57 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
58 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
59 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
61 adoc = xml.etree.ElementTree.fromstring(manifestXml)
63 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
64 node_id = media_node.attrib['url']
65 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
66 except IndexError as err:
67 raise ExtractorError(u'Invalid manifest file')
69 url_pr = compat_urllib_parse_urlparse(manifest_url)
70 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'