2 # -*- coding: utf-8 -*-
17 import cStringIO as StringIO
21 # parse_qs was moved from the cgi module to the urlparse module in Python 2.6; fall back to the cgi import on older versions.
23 from urlparse import parse_qs
25 from cgi import parse_qs
33 import xml.etree.ElementTree
34 except ImportError: # Python<2.5: Not officially supported, but let it slip
35 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
# NOTE(review): this is a line-numbered, whitespace-stripped listing; gaps in
# the embedded numbering show elided original lines.  Code is left
# byte-identical below; only comments are added.
40 class InfoExtractor(object):
41 """Information Extractor class.
43 Information extractors are the classes that, given a URL, extract
44 information from the video (or videos) the URL refers to. This
45 information includes the real video URL, the video title and simplified
46 title, author and others. The information is stored in a dictionary
47 which is then passed to the FileDownloader. The FileDownloader
48 processes this information possibly downloading the video to the file
49 system, among other possible outcomes. The dictionaries must include
# NOTE(review): docstring lines listing the mandatory fields (orig. 50-53,
# e.g. id/url/title) are elided from this listing.
54 uploader: Nickname of the video uploader.
56 stitle: Simplified title.
57 ext: Video filename extension.
59 player_url: SWF Player URL (may be None).
61 The following fields are optional. Their primary purpose is to allow
62 youtube-dl to serve as the backend for a video search function, such
63 as the one in youtube2mp3. They are only used when their respective
64 forced printing functions are called:
66 thumbnail: Full URL to a video thumbnail image.
67 description: One-line video description.
69 Subclasses of this one should re-define the _real_initialize() and
70 _real_extract() methods and define a _VALID_URL regexp.
71 Probably, they should also be added to the list of extractors.
77 def __init__(self, downloader=None):
78 """Constructor. Receives an optional downloader."""
80 self.set_downloader(downloader)
82 def suitable(self, url):
83 """Receives a URL and returns True if suitable for this IE."""
84 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the 'def initialize(self):' header (orig. line 86) is elided
# here; the docstring and call below belong to that method.
87 """Initializes an instance (authentication, etc)."""
89 self._real_initialize()
92 def extract(self, url):
93 """Extracts URL information and returns it in list of dicts."""
95 return self._real_extract(url)
97 def set_downloader(self, downloader):
98 """Sets the downloader for this IE."""
99 self._downloader = downloader
# Template methods: subclasses override these two; the base versions are
# intentionally no-ops (bodies elided in this listing).
101 def _real_initialize(self):
102 """Real initialization process. Redefine in subclasses."""
105 def _real_extract(self, url):
106 """Real extraction process. Redefine in subclasses."""
# NOTE(review): line-numbered, whitespace-stripped listing with many original
# lines elided (see numbering gaps, e.g. 122->128, 241->247).  Code is kept
# byte-identical; only comments/docstrings are touched.
110 class YoutubeIE(InfoExtractor):
111 """Information extractor for youtube.com."""
113 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
114 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
115 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
116 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
117 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
118 _NETRC_MACHINE = 'youtube'
119 # Listed in order of quality
120 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
121 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
122 _video_extensions = {
# NOTE(review): most itag->extension entries (orig. lines 123-127, 129-132)
# are elided from this listing.
128 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
133 _video_dimensions = {
# NOTE(review): the _video_dimensions entries (orig. lines 134-149) are elided.
150 def report_lang(self):
151 """Report attempt to set language."""
152 self._downloader.to_screen(u'[youtube] Setting language')
154 def report_login(self):
155 """Report attempt to log in."""
156 self._downloader.to_screen(u'[youtube] Logging in')
158 def report_age_confirmation(self):
159 """Report attempt to confirm age."""
160 self._downloader.to_screen(u'[youtube] Confirming age')
162 def report_video_webpage_download(self, video_id):
163 """Report attempt to download video webpage."""
164 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
166 def report_video_info_webpage_download(self, video_id):
167 """Report attempt to download video info webpage."""
168 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
170 def report_video_subtitles_download(self, video_id):
171 """Report attempt to download video subtitles."""
172 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
174 def report_information_extraction(self, video_id):
175 """Report attempt to extract video information."""
176 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
178 def report_unavailable_format(self, video_id, format):
179 """Report extracted video URL."""
180 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
182 def report_rtmp_download(self):
183 """Indicate the download will use the RTMP protocol."""
184 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's closed-caption XML into SubRip (.srt) text.
# NOTE(review): the 'srt = ...' initialisation and the final return (orig.
# lines 187, 192, 198, 201) are elided from this listing.
186 def _closed_captions_xml_to_srt(self, xml_string):
188 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
189 # TODO parse xml instead of regex
190 for n, (start, dur_tag, dur, caption) in enumerate(texts):
191 if not dur: dur = '4'
193 end = start + float(dur)
# Render HH:MM:SS,mmm timestamps as required by the SRT format.
194 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
195 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
196 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
197 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
199 srt += start + ' --> ' + end + '\n'
200 srt += caption + '\n\n'
203 def _print_formats(self, formats):
204 print 'Available formats:'
206 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Sets language, logs in (username/password or .netrc), and confirms age.
208 def _real_initialize(self):
209 if self._downloader is None:
214 downloader_params = self._downloader.params
216 # Attempt to use provided username and password or .netrc data
217 if downloader_params.get('username', None) is not None:
218 username = downloader_params['username']
219 password = downloader_params['password']
220 elif downloader_params.get('usenetrc', False):
# NOTE(review): the surrounding try/except and credential unpacking (orig.
# lines 221, 223-226) are elided here.
222 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
227 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
228 except (IOError, netrc.NetrcParseError), err:
229 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language: best-effort, a failure only warns and continues.
233 request = urllib2.Request(self._LANG_URL)
236 urllib2.urlopen(request).read()
237 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
238 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
241 # No authentication to be performed
# NOTE(review): the login_form dict opening and early-return (orig. lines
# 242-246, 248) are elided here.
247 'current_form': 'loginForm',
249 'action_login': 'Log In',
250 'username': username,
251 'password': password,
253 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
256 login_results = urllib2.urlopen(request).read()
# A login form still present in the response means authentication failed.
257 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
258 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
261 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age: unlike language/login above, a failure here is fatal (trouble).
267 'action_confirm': 'Confirm',
269 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
271 self.report_age_confirmation()
272 age_results = urllib2.urlopen(request).read()
273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
274 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
277 def _real_extract(self, url):
278 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
279 mobj = re.search(self._NEXT_URL_RE, url)
281 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
283 # Extract video id from URL
284 mobj = re.match(self._VALID_URL, url)
286 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
288 video_id = mobj.group(2)
291 self.report_video_webpage_download(video_id)
292 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
294 video_webpage = urllib2.urlopen(request).read()
295 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
296 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
299 # Attempt to extract SWF player URL
300 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON-style backslash escapes in the SWF URL.
302 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' values: some videos only answer get_video_info for a
# particular embed context; stop at the first response carrying a token.
307 self.report_video_info_webpage_download(video_id)
308 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
309 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
310 % (video_id, el_type))
311 request = urllib2.Request(video_info_url)
313 video_info_webpage = urllib2.urlopen(request).read()
314 video_info = parse_qs(video_info_webpage)
315 if 'token' in video_info:
317 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
318 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
320 if 'token' not in video_info:
321 if 'reason' in video_info:
322 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
324 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
327 # Start extracting information
328 self.report_information_extraction(video_id)
331 if 'author' not in video_info:
332 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
334 video_uploader = urllib.unquote_plus(video_info['author'][0])
337 if 'title' not in video_info:
338 self._downloader.trouble(u'ERROR: unable to extract video title')
340 video_title = urllib.unquote_plus(video_info['title'][0])
341 video_title = video_title.decode('utf-8')
342 video_title = sanitize_title(video_title)
345 simple_title = simplify_title(video_title)
348 if 'thumbnail_url' not in video_info:
349 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
351 else: # don't panic if we can't find it
352 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalised to YYYYMMDD.
356 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
358 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
359 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
360 for expression in format_expressions:
362 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
370 video_description = u'No description available.'
371 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
373 video_description = mobj.group(1).decode('utf-8')
# NOTE(review): lxml use below presumably sits behind a feature/import guard
# in the elided lines -- lxml is not a stdlib module; verify upstream.
375 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
376 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
377 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
378 # TODO use another parser
# Closed captions (optional, only when --write-subtitles was given).
381 video_subtitles = None
382 if self._downloader.params.get('writesubtitles', False):
383 self.report_video_subtitles_download(video_id)
384 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
386 srt_list = urllib2.urlopen(request).read()
387 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
388 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
390 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
# Language preference: user-requested > English > first available.
392 if self._downloader.params.get('subtitleslang', False):
393 srt_lang = self._downloader.params.get('subtitleslang')
394 elif 'en' in srt_lang_list:
397 srt_lang = srt_lang_list[0]
398 if not srt_lang in srt_lang_list:
399 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
401 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
403 srt_xml = urllib2.urlopen(request).read()
404 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
405 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
407 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
409 self._downloader.trouble(u'WARNING: video has no closed captions')
412 video_token = urllib.unquote_plus(video_info['token'][0])
414 # Decide which formats to download
415 req_format = self._downloader.params.get('format', None)
417 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
418 self.report_rtmp_download()
419 video_url_list = [(None, video_info['conn'][0])]
420 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
421 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
422 url_data = [parse_qs(uds) for uds in url_data_strs]
423 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
424 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
426 format_limit = self._downloader.params.get('format_limit', None)
427 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
428 if format_limit is not None and format_limit in available_formats:
429 format_list = available_formats[available_formats.index(format_limit):]
431 format_list = available_formats
432 existing_formats = [x for x in format_list if x in url_map]
433 if len(existing_formats) == 0:
434 self._downloader.trouble(u'ERROR: no known formats available for video')
436 if self._downloader.params.get('listformats', None):
437 self._print_formats(existing_formats)
438 # existing_formats is ordered best-first, so [0] is best, [-1] is worst.
439 if req_format is None or req_format == 'best':
440 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
441 elif req_format == 'worst':
442 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
443 elif req_format in ('-1', 'all'):
444 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
446 # Specific formats. We pick the first in a slash-delimeted sequence.
447 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
448 req_formats = req_format.split('/')
449 video_url_list = None
450 for rf in req_formats:
452 video_url_list = [(rf, url_map[rf])]
454 if video_url_list is None:
455 self._downloader.trouble(u'ERROR: requested format not available')
458 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Emit one info dict per selected format (appended to 'results', elided).
462 for format_param, video_real_url in video_url_list:
464 video_extension = self._video_extensions.get(format_param, 'flv')
467 'id': video_id.decode('utf-8'),
468 'url': video_real_url.decode('utf-8'),
469 'uploader': video_uploader.decode('utf-8'),
470 'upload_date': upload_date,
471 'title': video_title,
472 'stitle': simple_title,
473 'ext': video_extension.decode('utf-8'),
474 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
475 'thumbnail': video_thumbnail.decode('utf-8'),
476 'description': video_description,
477 'player_url': player_url,
478 'subtitles': video_subtitles
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines (see numbering gaps).  Code left byte-identical; comments only.
483 class MetacafeIE(InfoExtractor):
484 """Information Extractor for metacafe.com."""
486 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
487 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
488 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
489 IE_NAME = u'metacafe'
491 def __init__(self, downloader=None):
492 InfoExtractor.__init__(self, downloader)
494 def report_disclaimer(self):
495 """Report disclaimer retrieval."""
496 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
498 def report_age_confirmation(self):
499 """Report attempt to confirm age."""
500 self._downloader.to_screen(u'[metacafe] Confirming age')
502 def report_download_webpage(self, video_id):
503 """Report webpage download."""
504 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
506 def report_extraction(self, video_id):
507 """Report information extraction."""
508 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the disclaimer page, then POSTs the family-filter form to get past
# the age gate before any extraction.
510 def _real_initialize(self):
511 # Retrieve disclaimer
512 request = urllib2.Request(self._DISCLAIMER)
514 self.report_disclaimer()
515 disclaimer = urllib2.urlopen(request).read()
516 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
517 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# NOTE(review): the disclaimer_form dict opening (orig. lines 518-522) is
# elided from this listing.
523 'submit': "Continue - I'm over 18",
525 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
527 self.report_age_confirmation()
528 disclaimer = urllib2.urlopen(request).read()
529 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
530 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
533 def _real_extract(self, url):
534 # Extract id and simplified title from URL
535 mobj = re.match(self._VALID_URL, url)
537 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
540 video_id = mobj.group(1)
542 # Check if video comes from YouTube
543 mobj2 = re.match(r'^yt-(.*)$', video_id)
544 if mobj2 is not None:
# 'yt-' prefixed ids are YouTube embeds: delegate to the YouTube extractor.
545 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
548 simple_title = mobj.group(2).decode('utf-8')
550 # Retrieve video webpage to extract further information
551 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
553 self.report_download_webpage(video_id)
554 webpage = urllib2.urlopen(request).read()
555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
556 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
559 # Extract URL, uploader and title from webpage
560 self.report_extraction(video_id)
561 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
563 mediaURL = urllib.unquote(mobj.group(1))
564 video_extension = mediaURL[-3:]
566 # Extract gdaKey if available
567 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
571 gdaKey = mobj.group(1)
572 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: no direct mediaURL, so parse the flashvars blob instead.
574 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
576 self._downloader.trouble(u'ERROR: unable to extract media URL')
578 vardict = parse_qs(mobj.group(1))
579 if 'mediaData' not in vardict:
580 self._downloader.trouble(u'ERROR: unable to extract media URL')
582 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
584 self._downloader.trouble(u'ERROR: unable to extract media URL')
586 mediaURL = mobj.group(1).replace('\\/', '/')
587 video_extension = mediaURL[-3:]
588 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
590 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
592 self._downloader.trouble(u'ERROR: unable to extract title')
594 video_title = mobj.group(1).decode('utf-8')
595 video_title = sanitize_title(video_title)
597 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
599 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
601 video_uploader = mobj.group(1)
604 'id': video_id.decode('utf-8'),
605 'url': video_url.decode('utf-8'),
606 'uploader': video_uploader.decode('utf-8'),
607 'upload_date': u'NA',
608 'title': video_title,
609 'stitle': simple_title,
610 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines.  Code left byte-identical; comments only.
616 class DailymotionIE(InfoExtractor):
617 """Information Extractor for Dailymotion"""
619 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
620 IE_NAME = u'dailymotion'
622 def __init__(self, downloader=None):
623 InfoExtractor.__init__(self, downloader)
625 def report_download_webpage(self, video_id):
626 """Report webpage download."""
627 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
629 def report_extraction(self, video_id):
630 """Report information extraction."""
631 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
633 def _real_extract(self, url):
634 # Extract id and simplified title from URL
635 mobj = re.match(self._VALID_URL, url)
637 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
640 video_id = mobj.group(1)
642 video_extension = 'flv'
644 # Retrieve video webpage to extract further information
645 request = urllib2.Request(url)
# The family_filter cookie disables Dailymotion's age gate for this request.
646 request.add_header('Cookie', 'family_filter=off')
648 self.report_download_webpage(video_id)
649 webpage = urllib2.urlopen(request).read()
650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
651 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
654 # Extract URL, uploader and title from webpage
655 self.report_extraction(video_id)
656 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
658 self._downloader.trouble(u'ERROR: unable to extract media URL')
660 sequence = urllib.unquote(mobj.group(1))
661 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
663 self._downloader.trouble(u'ERROR: unable to extract media URL')
665 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
667 # if needed add http://www.dailymotion.com/ if relative URL
# NOTE(review): the line assigning video_url from mediaURL (orig. lines
# 668-669) is elided from this listing.
671 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
673 self._downloader.trouble(u'ERROR: unable to extract title')
675 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
676 video_title = sanitize_title(video_title)
677 simple_title = simplify_title(video_title)
679 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
681 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
683 video_uploader = mobj.group(1)
686 'id': video_id.decode('utf-8'),
687 'url': video_url.decode('utf-8'),
688 'uploader': video_uploader.decode('utf-8'),
689 'upload_date': u'NA',
690 'title': video_title,
691 'stitle': simple_title,
692 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines.  Code left byte-identical; comments only.
698 class GoogleIE(InfoExtractor):
699 """Information extractor for video.google.com."""
701 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
702 IE_NAME = u'video.google'
704 def __init__(self, downloader=None):
705 InfoExtractor.__init__(self, downloader)
707 def report_download_webpage(self, video_id):
708 """Report webpage download."""
709 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
711 def report_extraction(self, video_id):
712 """Report information extraction."""
713 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
715 def _real_extract(self, url):
716 # Extract id from URL
717 mobj = re.match(self._VALID_URL, url)
719 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
722 video_id = mobj.group(1)
724 video_extension = 'mp4'
726 # Retrieve video webpage to extract further information
727 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
729 self.report_download_webpage(video_id)
730 webpage = urllib2.urlopen(request).read()
731 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
732 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
735 # Extract URL, uploader, and title from webpage
736 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the escaped flv videoUrl.
737 mobj = re.search(r"download_url:'([^']+)'", webpage)
739 video_extension = 'flv'
740 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
742 self._downloader.trouble(u'ERROR: unable to extract media URL')
744 mediaURL = urllib.unquote(mobj.group(1))
# Undo the \xNN escaping used inside the page's inline script.
745 mediaURL = mediaURL.replace('\\x3d', '\x3d')
746 mediaURL = mediaURL.replace('\\x26', '\x26')
750 mobj = re.search(r'<title>(.*)</title>', webpage)
752 self._downloader.trouble(u'ERROR: unable to extract title')
754 video_title = mobj.group(1).decode('utf-8')
755 video_title = sanitize_title(video_title)
756 simple_title = simplify_title(video_title)
758 # Extract video description
759 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
761 self._downloader.trouble(u'ERROR: unable to extract video description')
763 video_description = mobj.group(1).decode('utf-8')
764 if not video_description:
765 video_description = 'No description available.'
767 # Extract video thumbnail
768 if self._downloader.params.get('forcethumbnail', False):
769 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
771 webpage = urllib2.urlopen(request).read()
772 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
773 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
775 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
777 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
779 video_thumbnail = mobj.group(1)
780 else: # we need something to pass to process_info
784 'id': video_id.decode('utf-8'),
785 'url': video_url.decode('utf-8'),
787 'upload_date': u'NA',
788 'title': video_title,
789 'stitle': simple_title,
790 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines.  Code left byte-identical; comments only.
796 class PhotobucketIE(InfoExtractor):
797 """Information extractor for photobucket.com."""
799 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
800 IE_NAME = u'photobucket'
802 def __init__(self, downloader=None):
803 InfoExtractor.__init__(self, downloader)
805 def report_download_webpage(self, video_id):
806 """Report webpage download."""
807 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
809 def report_extraction(self, video_id):
810 """Report information extraction."""
811 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
813 def _real_extract(self, url):
814 # Extract id from URL
815 mobj = re.match(self._VALID_URL, url)
817 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
820 video_id = mobj.group(1)
822 video_extension = 'flv'
824 # Retrieve video webpage to extract further information
825 request = urllib2.Request(url)
827 self.report_download_webpage(video_id)
828 webpage = urllib2.urlopen(request).read()
829 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
830 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
833 # Extract URL, uploader, and title from webpage
834 self.report_extraction(video_id)
835 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
837 self._downloader.trouble(u'ERROR: unable to extract media URL')
839 mediaURL = urllib.unquote(mobj.group(1))
# NOTE(review): the assignment of video_url from mediaURL (around orig. line
# 841) is elided from this listing.
843 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
845 self._downloader.trouble(u'ERROR: unable to extract title')
847 video_title = mobj.group(1).decode('utf-8')
848 video_title = sanitize_title(video_title)
849 simple_title = simplify_title(video_title)
# Uploader comes from the second <title> capture group matched above.
851 video_uploader = mobj.group(2).decode('utf-8')
854 'id': video_id.decode('utf-8'),
855 'url': video_url.decode('utf-8'),
856 'uploader': video_uploader,
857 'upload_date': u'NA',
858 'title': video_title,
859 'stitle': simple_title,
860 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-numbered, whitespace-stripped listing with elided
# original lines.  Code left byte-identical; comments only.
866 class YahooIE(InfoExtractor):
867 """Information extractor for video.yahoo.com."""
869 # _VALID_URL matches all Yahoo! Video URLs
870 # _VPAGE_URL matches only the extractable '/watch/' URLs
871 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
872 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
873 IE_NAME = u'video.yahoo'
875 def __init__(self, downloader=None):
876 InfoExtractor.__init__(self, downloader)
878 def report_download_webpage(self, video_id):
879 """Report webpage download."""
880 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
882 def report_extraction(self, video_id):
883 """Report information extraction."""
884 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# new_video=False marks the single recursive retry after rewriting a
# non-'/watch/' URL into its canonical /watch/ form.
886 def _real_extract(self, url, new_video=True):
887 # Extract ID from URL
888 mobj = re.match(self._VALID_URL, url)
890 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
893 video_id = mobj.group(2)
894 video_extension = 'flv'
896 # Rewrite valid but non-extractable URLs as
897 # extractable English language /watch/ URLs
898 if re.match(self._VPAGE_URL, url) is None:
899 request = urllib2.Request(url)
901 webpage = urllib2.urlopen(request).read()
902 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
903 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
906 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
908 self._downloader.trouble(u'ERROR: Unable to extract id field')
910 yahoo_id = mobj.group(1)
912 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
914 self._downloader.trouble(u'ERROR: Unable to extract vid field')
916 yahoo_vid = mobj.group(1)
918 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
919 return self._real_extract(url, new_video=False)
921 # Retrieve video webpage to extract further information
922 request = urllib2.Request(url)
924 self.report_download_webpage(video_id)
925 webpage = urllib2.urlopen(request).read()
926 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
927 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
930 # Extract uploader and title from webpage
931 self.report_extraction(video_id)
932 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
934 self._downloader.trouble(u'ERROR: unable to extract video title')
936 video_title = mobj.group(1).decode('utf-8')
937 simple_title = simplify_title(video_title)
939 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
941 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the people|profile alternation, not the
# anchor text in group(2) -- looks suspicious; verify against upstream.
943 video_uploader = mobj.group(1).decode('utf-8')
945 # Extract video thumbnail
946 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
948 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
950 video_thumbnail = mobj.group(1).decode('utf-8')
952 # Extract video description
953 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
955 self._downloader.trouble(u'ERROR: unable to extract video description')
957 video_description = mobj.group(1).decode('utf-8')
958 if not video_description:
959 video_description = 'No description available.'
961 # Extract video height and width
962 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
964 self._downloader.trouble(u'ERROR: unable to extract video height')
966 yv_video_height = mobj.group(1)
968 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
970 self._downloader.trouble(u'ERROR: unable to extract video width')
972 yv_video_width = mobj.group(1)
974 # Retrieve video playlist to extract media URL
975 # I'm not completely sure what all these options are, but we
976 # seem to need most of them, otherwise the server sends a 401.
977 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
978 yv_bitrate = '700' # according to Wikipedia this is hard-coded
979 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
980 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
981 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
983 self.report_download_webpage(video_id)
984 webpage = urllib2.urlopen(request).read()
985 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
986 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
989 # Extract media URL from playlist XML
990 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
992 self._downloader.trouble(u'ERROR: Unable to extract media URL')
994 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
995 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
998 'id': video_id.decode('utf-8'),
1000 'uploader': video_uploader,
1001 'upload_date': u'NA',
1002 'title': video_title,
1003 'stitle': simple_title,
1004 'ext': video_extension.decode('utf-8'),
1005 'thumbnail': video_thumbnail.decode('utf-8'),
1006 'description': video_description,
# NOTE(review): duplicate 'thumbnail' key (also set at orig. line 1005); in a
# dict literal the later entry wins -- likely leftover, verify upstream.
1007 'thumbnail': video_thumbnail,
1012 class VimeoIE(InfoExtractor):
1013 """Information extractor for vimeo.com.

 Extracts the embedded player config JSON from the watch page and
 derives the direct play_redirect URL from the request signature,
 timestamp, codec and quality found there.
 """
1015 # _VALID_URL matches Vimeo URLs
# NOTE(review): the '.' after (?:www|player) is unescaped, so it matches
# any character, not just a literal dot — probably harmless but verify.
1016 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1019 def __init__(self, downloader=None):
1020 InfoExtractor.__init__(self, downloader)
1022 def report_download_webpage(self, video_id):
1023 """Report webpage download."""
1024 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1026 def report_extraction(self, video_id):
1027 """Report information extraction."""
1028 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1030 def _real_extract(self, url, new_video=True):
1031 # Extract ID from URL
1032 mobj = re.match(self._VALID_URL, url)
1034 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1037 video_id = mobj.group(1)
1039 # Retrieve video webpage to extract further information
1040 request = urllib2.Request(url, None, std_headers)
1042 self.report_download_webpage(video_id)
1043 webpage = urllib2.urlopen(request).read()
1044 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1045 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1048 # Now we begin extracting as much information as we can from what we
1049 # retrieved. First we extract the information common to all extractors,
1050 # and latter we extract those that are Vimeo specific.
1051 self.report_extraction(video_id)
1053 # Extract the config JSON
# The player config is embedded inline in the page; slice it out by its
# textual delimiters instead of parsing the surrounding JavaScript.
1054 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1056 config = json.loads(config)
1058 self._downloader.trouble(u'ERROR: unable to extract info section')
1062 video_title = config["video"]["title"]
1063 simple_title = simplify_title(video_title)
1066 video_uploader = config["video"]["owner"]["name"]
1068 # Extract video thumbnail
1069 video_thumbnail = config["video"]["thumbnail"]
1071 # Extract video description
1075 video_description = u'No description available.'
1076 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
1077 if mobj is not None:
1078 video_description = mobj.group(1)
# Alternative lxml-based extraction of the description element.
1080 html_parser = lxml.etree.HTMLParser()
1081 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
1082 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
1083 # TODO use another parser
1085 # Extract upload date
1086 video_upload_date = u'NA'
1087 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1088 if mobj is not None:
1089 video_upload_date = mobj.group(1)
1091 # Vimeo specific: extract request signature and timestamp
1092 sig = config['request']['signature']
1093 timestamp = config['request']['timestamp']
1095 # Vimeo specific: extract video codec and quality information
1096 # TODO bind to format param
# Preference order: h264/mp4 first, then vp8 and vp6 (both flv). The
# first codec present in config["video"]["files"] is used; 'hd' is
# preferred over 'sd' when listed for that codec.
1097 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1098 for codec in codecs:
1099 if codec[0] in config["video"]["files"]:
1100 video_codec = codec[0]
1101 video_extension = codec[1]
1102 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1103 else: quality = 'sd'
1106 self._downloader.trouble(u'ERROR: no known codec found')
1109 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1110 %(video_id, sig, timestamp, quality, video_codec.upper())
1115 'uploader': video_uploader,
1116 'upload_date': video_upload_date,
1117 'title': video_title,
1118 'stitle': simple_title,
1119 'ext': video_extension,
1120 'thumbnail': video_thumbnail,
1121 'description': video_description,
1126 class GenericIE(InfoExtractor):
1127 """Generic last-resort information extractor.

 First follows URL-shortener style redirects (via HEAD requests, falling
 back to GET on 405), then scrapes the page for a JW-Player-style
 flashvars 'file=' URL or a generic 'file='/'source=' parameter.
 """
1130 IE_NAME = u'generic'
1132 def __init__(self, downloader=None):
1133 InfoExtractor.__init__(self, downloader)
1135 def report_download_webpage(self, video_id):
1136 """Report webpage download."""
1137 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1138 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1140 def report_extraction(self, video_id):
1141 """Report information extraction."""
1142 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1144 def report_following_redirect(self, new_url):
1145 """Report information extraction."""
1146 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1148 def _test_redirect(self, url):
1149 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass that issues HEAD instead of GET so we can discover
# the final URL without downloading the body.
1150 class HeadRequest(urllib2.Request):
1151 def get_method(self):
1154 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1156 Subclass the HTTPRedirectHandler to make it use our
1157 HeadRequest also on the redirected URL
1159 def redirect_request(self, req, fp, code, msg, headers, newurl):
1160 if code in (301, 302, 303, 307):
1161 newurl = newurl.replace(' ', '%20')
# Strip body-related headers: the redirected HEAD request carries
# no payload.
1162 newheaders = dict((k,v) for k,v in req.headers.items()
1163 if k.lower() not in ("content-length", "content-type"))
1164 return HeadRequest(newurl,
1166 origin_req_host=req.get_origin_req_host(),
1169 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1171 class HTTPMethodFallback(urllib2.BaseHandler):
1173 Fallback to GET if HEAD is not allowed (405 HTTP error)
1175 def http_error_405(self, req, fp, code, msg, headers):
1179 newheaders = dict((k,v) for k,v in req.headers.items()
1180 if k.lower() not in ("content-length", "content-type"))
1181 return self.parent.open(urllib2.Request(req.get_full_url(),
1183 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with exactly the handlers we need; order of
# add_handler calls mirrors the list below.
1187 opener = urllib2.OpenerDirector()
1188 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1189 HTTPMethodFallback, HEADRedirectHandler,
1190 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1191 opener.add_handler(handler())
1193 response = opener.open(HeadRequest(url))
1194 new_url = response.geturl()
# No redirect happened: report False so _real_extract proceeds normally.
1196 if url == new_url: return False
# Redirect found: hand the resolved URL back to the downloader so the
# whole extractor chain restarts against it.
1198 self.report_following_redirect(new_url)
1199 self._downloader.download([new_url])
1202 def _real_extract(self, url):
1203 if self._test_redirect(url): return
1205 video_id = url.split('/')[-1]
1206 request = urllib2.Request(url)
1208 self.report_download_webpage(video_id)
1209 webpage = urllib2.urlopen(request).read()
1210 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1213 except ValueError, err:
1214 # since this is the last-resort InfoExtractor, if
1215 # this error is thrown, it'll be thrown here
1216 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1219 self.report_extraction(video_id)
1220 # Start with something easy: JW Player in SWFObject
1221 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1223 # Broaden the search a little bit
1224 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1226 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1229 # It's possible that one of the regexes
1230 # matched, but returned an empty group:
1231 if mobj.group(1) is None:
1232 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1235 video_url = urllib.unquote(mobj.group(1))
1236 video_id = os.path.basename(video_url)
1238 # here's a fun little line of code for you:
# Split the basename into (id, extension); the extension is taken
# without its leading dot.
1239 video_extension = os.path.splitext(video_id)[1][1:]
1240 video_id = os.path.splitext(video_id)[0]
1242 # it's tempting to parse this further, but you would
1243 # have to take into account all the variations like
1244 # Video Title - Site Name
1245 # Site Name | Video Title
1246 # Video Title - Tagline | Site Name
1247 # and so on and so forth; it's just not practical
1248 mobj = re.search(r'<title>(.*)</title>', webpage)
1250 self._downloader.trouble(u'ERROR: unable to extract title')
1252 video_title = mobj.group(1).decode('utf-8')
1253 video_title = sanitize_title(video_title)
1254 simple_title = simplify_title(video_title)
1256 # video uploader is domain name
1257 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says 'title' but this step extracts the
# uploader (domain name) — the message looks copy-pasted.
1259 self._downloader.trouble(u'ERROR: unable to extract title')
1261 video_uploader = mobj.group(1).decode('utf-8')
1264 'id': video_id.decode('utf-8'),
1265 'url': video_url.decode('utf-8'),
1266 'uploader': video_uploader,
1267 'upload_date': u'NA',
1268 'title': video_title,
1269 'stitle': simple_title,
1270 'ext': video_extension.decode('utf-8'),
1276 class YoutubeSearchIE(InfoExtractor):
1277 """Information Extractor for YouTube search queries.

 Handles 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' pseudo-URLs by
 querying the GData API in pages of 50 and queueing each result as a
 regular watch URL.
 """
1278 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1279 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1280 _max_youtube_results = 1000
1281 IE_NAME = u'youtube:search'
1283 def __init__(self, downloader=None):
1284 InfoExtractor.__init__(self, downloader)
1286 def report_download_page(self, query, pagenum):
1287 """Report attempt to download playlist page with given number."""
1288 query = query.decode(preferredencoding())
1289 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1291 def _real_extract(self, query):
1292 mobj = re.match(self._VALID_URL, query)
1294 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split off the 'ytsearch<N>' prefix; an empty prefix means one result,
# 'all' means the API maximum, otherwise the prefix is parsed as the
# requested count.
1297 prefix, query = query.split(':')
1299 query = query.encode('utf-8')
1301 self._download_n_results(query, 1)
1303 elif prefix == 'all':
1304 self._download_n_results(query, self._max_youtube_results)
1310 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1312 elif n > self._max_youtube_results:
1313 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1314 n = self._max_youtube_results
1315 self._download_n_results(query, n)
1317 except ValueError: # parsing prefix as integer fails
1318 self._download_n_results(query, 1)
1321 def _download_n_results(self, query, n):
1322 """Downloads a specified number of results for a query"""
# Page through the API 50 results at a time until enough ids are
# collected; 'limit' is capped by the API's reported totalItems.
1328 while (50 * pagenum) < limit:
1329 self.report_download_page(query, pagenum+1)
1330 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1331 request = urllib2.Request(result_url)
1333 data = urllib2.urlopen(request).read()
1334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1337 api_response = json.loads(data)['data']
1339 new_ids = list(video['id'] for video in api_response['items'])
1340 video_ids += new_ids
1342 limit = min(n, api_response['totalItems'])
1345 if len(video_ids) > n:
1346 video_ids = video_ids[:n]
1347 for id in video_ids:
1348 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1352 class GoogleSearchIE(InfoExtractor):
1353 """Information Extractor for Google Video search queries.

 Handles 'gvsearch:', 'gvsearchN:' and 'gvsearchall:' pseudo-URLs by
 scraping the HTML result pages (10 results per page).
 """
1354 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1355 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1356 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1357 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1358 _max_google_results = 1000
1359 IE_NAME = u'video.google:search'
1361 def __init__(self, downloader=None):
1362 InfoExtractor.__init__(self, downloader)
1364 def report_download_page(self, query, pagenum):
1365 """Report attempt to download playlist page with given number."""
1366 query = query.decode(preferredencoding())
1367 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1369 def _real_extract(self, query):
1370 mobj = re.match(self._VALID_URL, query)
1372 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Same prefix convention as the other search extractors: empty -> 1,
# 'all' -> maximum, numeric -> that many (capped at the maximum).
1375 prefix, query = query.split(':')
1377 query = query.encode('utf-8')
1379 self._download_n_results(query, 1)
1381 elif prefix == 'all':
1382 self._download_n_results(query, self._max_google_results)
1388 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1390 elif n > self._max_google_results:
1391 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1392 n = self._max_google_results
1393 self._download_n_results(query, n)
1395 except ValueError: # parsing prefix as integer fails
1396 self._download_n_results(query, 1)
1399 def _download_n_results(self, query, n):
1400 """Downloads a specified number of results for a query"""
1406 self.report_download_page(query, pagenum)
1407 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1408 request = urllib2.Request(result_url)
1410 page = urllib2.urlopen(request).read()
1411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1415 # Extract video identifiers
1416 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1417 video_id = mobj.group(1)
1418 if video_id not in video_ids:
1419 video_ids.append(video_id)
# Enough ids collected: queue them all and stop paging.
1420 if len(video_ids) == n:
1421 # Specified n videos reached
1422 for id in video_ids:
1423 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next page" link: queue whatever was found and stop.
1426 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1427 for id in video_ids:
1428 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1431 pagenum = pagenum + 1
1434 class YahooSearchIE(InfoExtractor):
1435 """Information Extractor for Yahoo! Video search queries.

 Handles 'yvsearch:', 'yvsearchN:' and 'yvsearchall:' pseudo-URLs by
 scraping the HTML result pages; duplicate ids are filtered with a set.
 """
1436 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1437 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1438 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1439 _MORE_PAGES_INDICATOR = r'\s*Next'
1440 _max_yahoo_results = 1000
1441 IE_NAME = u'video.yahoo:search'
1443 def __init__(self, downloader=None):
1444 InfoExtractor.__init__(self, downloader)
1446 def report_download_page(self, query, pagenum):
1447 """Report attempt to download playlist page with given number."""
1448 query = query.decode(preferredencoding())
1449 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1451 def _real_extract(self, query):
1452 mobj = re.match(self._VALID_URL, query)
1454 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Same prefix convention as the other search extractors: empty -> 1,
# 'all' -> maximum, numeric -> that many (capped at the maximum).
1457 prefix, query = query.split(':')
1459 query = query.encode('utf-8')
1461 self._download_n_results(query, 1)
1463 elif prefix == 'all':
1464 self._download_n_results(query, self._max_yahoo_results)
1470 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1472 elif n > self._max_yahoo_results:
1473 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1474 n = self._max_yahoo_results
1475 self._download_n_results(query, n)
1477 except ValueError: # parsing prefix as integer fails
1478 self._download_n_results(query, 1)
1481 def _download_n_results(self, query, n):
1482 """Downloads a specified number of results for a query"""
# 'already_seen' deduplicates ids across pages while 'video_ids'
# preserves first-seen order.
1485 already_seen = set()
1489 self.report_download_page(query, pagenum)
1490 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1491 request = urllib2.Request(result_url)
1493 page = urllib2.urlopen(request).read()
1494 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1495 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1498 # Extract video identifiers
1499 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1500 video_id = mobj.group(1)
1501 if video_id not in already_seen:
1502 video_ids.append(video_id)
1503 already_seen.add(video_id)
1504 if len(video_ids) == n:
1505 # Specified n videos reached
1506 for id in video_ids:
1507 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "next page" link: queue whatever was found and stop.
1510 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1511 for id in video_ids:
1512 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1515 pagenum = pagenum + 1
1518 class YoutubePlaylistIE(InfoExtractor):
1519 """Information Extractor for YouTube playlists.

 Pages through a playlist's HTML pages, collects watch-URL video ids,
 applies the downloader's playliststart/playlistend window, and queues
 each video as a regular watch URL.
 """
1521 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1522 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
# Filled in with the playlist id before use, since the indicator must
# match links belonging to this specific playlist.
1523 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
1524 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1525 IE_NAME = u'youtube:playlist'
1527 def __init__(self, downloader=None):
1528 InfoExtractor.__init__(self, downloader)
1530 def report_download_page(self, playlist_id, pagenum):
1531 """Report attempt to download playlist page with given number."""
1532 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1534 def _real_extract(self, url):
1535 # Extract playlist id
1536 mobj = re.match(self._VALID_URL, url)
1538 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 captures a single-video id embedded in the playlist URL;
# when present, download just that video.
1542 if mobj.group(3) is not None:
1543 self._downloader.download([mobj.group(3)])
1546 # Download playlist pages
1547 # prefix is 'p' as default for playlists but there are other types that need extra care
1548 playlist_prefix = mobj.group(1)
1549 if playlist_prefix == 'a':
1550 playlist_access = 'artist'
1552 playlist_prefix = 'p'
1553 playlist_access = 'view_play_list'
1554 playlist_id = mobj.group(2)
1559 self.report_download_page(playlist_id, pagenum)
1560 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1561 request = urllib2.Request(url)
1563 page = urllib2.urlopen(request).read()
1564 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1565 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1568 # Extract video identifiers
1570 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1571 if mobj.group(1) not in ids_in_page:
1572 ids_in_page.append(mobj.group(1))
1573 video_ids.extend(ids_in_page)
1575 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1577 pagenum = pagenum + 1
# Apply the user's playlist window (1-based playliststart; -1 means
# "to the end").
1579 playliststart = self._downloader.params.get('playliststart', 1) - 1
1580 playlistend = self._downloader.params.get('playlistend', -1)
1581 if playlistend == -1:
1582 video_ids = video_ids[playliststart:]
1584 video_ids = video_ids[playliststart:playlistend]
1586 for id in video_ids:
1587 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1591 class YoutubeUserIE(InfoExtractor):
1592 """Information Extractor for YouTube users.

 Pages through a user's uploads via the GData feed (_GDATA_PAGE_SIZE
 entries per request), applies the playliststart/playlistend window and
 queues each video as a regular watch URL.
 """
1594 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1595 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1596 _GDATA_PAGE_SIZE = 50
1597 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1598 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1599 IE_NAME = u'youtube:user'
1601 def __init__(self, downloader=None):
1602 InfoExtractor.__init__(self, downloader)
1604 def report_download_page(self, username, start_index):
1605 """Report attempt to download user page."""
1606 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1607 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1609 def _real_extract(self, url):
1611 mobj = re.match(self._VALID_URL, url)
1613 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1616 username = mobj.group(1)
1618 # Download video ids using YouTube Data API. Result size per
1619 # query is limited (currently to 50 videos) so we need to query
1620 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1627 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1628 self.report_download_page(username, start_index)
1630 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1633 page = urllib2.urlopen(request).read()
1634 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1635 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1638 # Extract video identifiers
1641 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1642 if mobj.group(1) not in ids_in_page:
1643 ids_in_page.append(mobj.group(1))
1645 video_ids.extend(ids_in_page)
1647 # A little optimization - if current page is not
1648 # "full", ie. does not contain PAGE_SIZE video ids then
1649 # we can assume that this page is the last one - there
1650 # are no more ids on further pages - no need to query
1653 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's playlist window (1-based playliststart; -1 means
# "to the end").
1658 all_ids_count = len(video_ids)
1659 playliststart = self._downloader.params.get('playliststart', 1) - 1
1660 playlistend = self._downloader.params.get('playlistend', -1)
1662 if playlistend == -1:
1663 video_ids = video_ids[playliststart:]
1665 video_ids = video_ids[playliststart:playlistend]
1667 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1668 (username, all_ids_count, len(video_ids)))
1670 for video_id in video_ids:
1671 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1674 class DepositFilesIE(InfoExtractor):
1675 """Information extractor for depositfiles.com.

 Re-requests the file page with the 'Free download' form submitted and
 scrapes the resulting fileshare URL and title.
 """
1677 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1678 IE_NAME = u'DepositFiles'
1680 def __init__(self, downloader=None):
1681 InfoExtractor.__init__(self, downloader)
1683 def report_download_webpage(self, file_id):
1684 """Report webpage download."""
1685 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1687 def report_extraction(self, file_id):
1688 """Report information extraction."""
1689 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1691 def _real_extract(self, url):
1692 file_id = url.split('/')[-1]
1693 # Rebuild url in english locale
1694 url = 'http://depositfiles.com/en/files/' + file_id
1696 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the 'Free download'
# button, which reveals the real download form.
1697 free_download_indication = { 'gateway_result' : '1' }
1698 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1700 self.report_download_webpage(file_id)
1701 webpage = urllib2.urlopen(request).read()
1702 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1703 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1706 # Search for the real file URL
1707 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1708 if (mobj is None) or (mobj.group(1) is None):
1709 # Try to figure out reason of the error.
# Site restriction messages (rate limits etc.) are wrapped in a
# <strong>Attention...</strong> element; surface them to the user.
1710 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1711 if (mobj is not None) and (mobj.group(1) is not None):
1712 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1713 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1715 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1718 file_url = mobj.group(1)
1719 file_extension = os.path.splitext(file_url)[1][1:]
1721 # Search for file title
1722 mobj = re.search(r'<b title="(.*?)">', webpage)
1724 self._downloader.trouble(u'ERROR: unable to extract title')
1726 file_title = mobj.group(1).decode('utf-8')
1729 'id': file_id.decode('utf-8'),
1730 'url': file_url.decode('utf-8'),
1732 'upload_date': u'NA',
1733 'title': file_title,
1734 'stitle': file_title,
1735 'ext': file_extension.decode('utf-8'),
1741 class FacebookIE(InfoExtractor):
1742 """Information Extractor for Facebook.

 Logs in (credentials from options or .netrc), downloads the video
 page, and parses title/owner/thumbnail and per-format URLs out of the
 page's JavaScript. Format selection mirrors the YouTube extractor.
 """
1744 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1745 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1746 _NETRC_MACHINE = 'facebook'
# Best-to-worst format order; also used for the format_limit cut-off.
1747 _available_formats = ['video', 'highqual', 'lowqual']
1748 _video_extensions = {
1753 IE_NAME = u'facebook'
1755 def __init__(self, downloader=None):
1756 InfoExtractor.__init__(self, downloader)
1758 def _reporter(self, message):
1759 """Add header and report message."""
1760 self._downloader.to_screen(u'[facebook] %s' % message)
1762 def report_login(self):
1763 """Report attempt to log in."""
1764 self._reporter(u'Logging in')
1766 def report_video_webpage_download(self, video_id):
1767 """Report attempt to download video webpage."""
1768 self._reporter(u'%s: Downloading video webpage' % video_id)
1770 def report_information_extraction(self, video_id):
1771 """Report attempt to extract video information."""
1772 self._reporter(u'%s: Extracting video information' % video_id)
1774 def _parse_page(self, video_webpage):
1775 """Extract video information from page"""
# Map of info-dict key -> regex that captures it from the page's
# inline JavaScript.
1777 data = {'title': r'\("video_title", "(.*?)"\)',
1778 'description': r'<div class="datawrap">(.*?)</div>',
1779 'owner': r'\("video_owner_name", "(.*?)"\)',
1780 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1783 for piece in data.keys():
1784 mobj = re.search(data[piece], video_webpage)
1785 if mobj is not None:
1786 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1790 for fmt in self._available_formats:
1791 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1792 if mobj is not None:
1793 # URL is in a Javascript segment inside an escaped Unicode format within
1794 # the generally utf-8 page
1795 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1796 video_info['video_urls'] = video_urls
1800 def _real_initialize(self):
1801 if self._downloader is None:
1806 downloader_params = self._downloader.params
1808 # Attempt to use provided username and password or .netrc data
1809 if downloader_params.get('username', None) is not None:
1810 useremail = downloader_params['username']
1811 password = downloader_params['password']
1812 elif downloader_params.get('usenetrc', False):
1814 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1815 if info is not None:
1819 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1820 except (IOError, netrc.NetrcParseError), err:
1821 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1824 if useremail is None:
1833 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1836 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication failed; warn but
# continue (some videos may still be accessible anonymously).
1837 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1838 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1840 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1841 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1844 def _real_extract(self, url):
1845 mobj = re.match(self._VALID_URL, url)
1847 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1849 video_id = mobj.group('ID')
1852 self.report_video_webpage_download(video_id)
1853 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1855 page = urllib2.urlopen(request)
1856 video_webpage = page.read()
1857 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1858 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1861 # Start extracting information
1862 self.report_information_extraction(video_id)
1864 # Extract information
1865 video_info = self._parse_page(video_webpage)
1868 if 'owner' not in video_info:
1869 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1871 video_uploader = video_info['owner']
1874 if 'title' not in video_info:
1875 self._downloader.trouble(u'ERROR: unable to extract video title')
1877 video_title = video_info['title']
1878 video_title = video_title.decode('utf-8')
1879 video_title = sanitize_title(video_title)
1881 simple_title = simplify_title(video_title)
# Thumbnail is optional: warn and fall back to an empty string.
1884 if 'thumbnail' not in video_info:
1885 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1886 video_thumbnail = ''
1888 video_thumbnail = video_info['thumbnail']
# Upload date, when present, is an RFC-2822 date; normalize to YYYYMMDD.
1892 if 'upload_date' in video_info:
1893 upload_time = video_info['upload_date']
1894 timetuple = email.utils.parsedate_tz(upload_time)
1895 if timetuple is not None:
1897 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1902 video_description = video_info.get('description', 'No description available.')
1904 url_map = video_info['video_urls']
1905 if len(url_map.keys()) > 0:
1906 # Decide which formats to download
1907 req_format = self._downloader.params.get('format', None)
1908 format_limit = self._downloader.params.get('format_limit', None)
# format_limit trims the preference list so nothing better than the
# limit is considered.
1910 if format_limit is not None and format_limit in self._available_formats:
1911 format_list = self._available_formats[self._available_formats.index(format_limit):]
1913 format_list = self._available_formats
1914 existing_formats = [x for x in format_list if x in url_map]
1915 if len(existing_formats) == 0:
1916 self._downloader.trouble(u'ERROR: no known formats available for video')
1918 if req_format is None:
1919 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1920 elif req_format == 'worst':
1921 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1922 elif req_format == '-1':
1923 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1926 if req_format not in url_map:
1927 self._downloader.trouble(u'ERROR: requested format not available')
1929 video_url_list = [(req_format, url_map[req_format])] # Specific format
1932 for format_param, video_real_url in video_url_list:
1934 video_extension = self._video_extensions.get(format_param, 'mp4')
1937 'id': video_id.decode('utf-8'),
1938 'url': video_real_url.decode('utf-8'),
1939 'uploader': video_uploader.decode('utf-8'),
1940 'upload_date': upload_date,
1941 'title': video_title,
1942 'stitle': simple_title,
1943 'ext': video_extension.decode('utf-8'),
1944 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1945 'thumbnail': video_thumbnail.decode('utf-8'),
1946 'description': video_description.decode('utf-8'),
1951 class BlipTVIE(InfoExtractor):
1952 """Information extractor for blip.tv.

 Requests the URL with the JSON skin appended; if the server answers
 with a video Content-Type it is treated as a direct download,
 otherwise the JSON metadata is parsed.
 """
1954 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Captures the filename extension from a media URL.
1955 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1956 IE_NAME = u'blip.tv'
1958 def report_extraction(self, file_id):
1959 """Report information extraction."""
1960 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1962 def report_direct_download(self, title):
1963 """Report information extraction."""
1964 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1966 def _real_extract(self, url):
1967 mobj = re.match(self._VALID_URL, url)
1969 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Append the JSON-skin query (cchar is '&' or '?' depending on whether
# the URL already has a query string).
1976 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1977 request = urllib2.Request(json_url)
1978 self.report_extraction(mobj.group(1))
1981 urlh = urllib2.urlopen(request)
1982 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1983 basename = url.split('/')[-1]
1984 title,ext = os.path.splitext(basename)
1985 title = title.decode('UTF-8')
1986 ext = ext.replace('.', '')
1987 self.report_direct_download(title)
1992 'stitle': simplify_title(title),
1996 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1997 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1999 if info is None: # Regular URL
2001 json_code = urlh.read()
2002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2003 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2007 json_data = json.loads(json_code)
2008 if 'Post' in json_data:
2009 data = json_data['Post']
# blip.tv datestamps look like '08-15-11 02:00PM'; normalize to YYYYMMDD.
2013 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2014 video_url = data['media']['url']
2015 umobj = re.match(self._URL_EXT, video_url)
2017 raise ValueError('Can not determine filename extension')
2018 ext = umobj.group(1)
2021 'id': data['item_id'],
2023 'uploader': data['display_name'],
2024 'upload_date': upload_date,
2025 'title': data['title'],
2026 'stitle': simplify_title(data['title']),
2028 'format': data['media']['mimeType'],
2029 'thumbnail': data['thumbnailUrl'],
2030 'description': data['description'],
2031 'player_url': data['embedUrl']
# ValueError and KeyError both indicate malformed/unexpected JSON.
2033 except (ValueError,KeyError), err:
2034 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2040 class MyVideoIE(InfoExtractor):
2041 """Information Extractor for myvideo.de."""
2043 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2044 IE_NAME = u'myvideo'
2046 def __init__(self, downloader=None):
2047 InfoExtractor.__init__(self, downloader)
2049 def report_download_webpage(self, video_id):
2050 """Report webpage download."""
2051 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2053 def report_extraction(self, video_id):
2054 """Report information extraction."""
2055 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2057 def _real_extract(self,url):
2058 mobj = re.match(self._VALID_URL, url)
2060 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2063 video_id = mobj.group(1)
2066 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2068 self.report_download_webpage(video_id)
2069 webpage = urllib2.urlopen(request).read()
2070 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2071 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2074 self.report_extraction(video_id)
2075 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2078 self._downloader.trouble(u'ERROR: unable to extract media URL')
2080 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2082 mobj = re.search('<title>([^<]+)</title>', webpage)
2084 self._downloader.trouble(u'ERROR: unable to extract title')
2087 video_title = mobj.group(1)
2088 video_title = sanitize_title(video_title)
2090 simple_title = simplify_title(video_title)
2096 'upload_date': u'NA',
2097 'title': video_title,
2098 'stitle': simple_title,
2104 class ComedyCentralIE(InfoExtractor):
2105 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a bare shortname (':tds', ':colbert', ...) or a full
# /full-episodes/ URL; the named groups drive the branches below.
2107 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2108 IE_NAME = u'comedycentral'
2110 def report_extraction(self, episode_id):
2111 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2113 def report_config_download(self, episode_id):
2114 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2116 def report_index_download(self, episode_id):
2117 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2119 def report_player_url(self, episode_id):
2120 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2122 def _real_extract(self, url):
2123 mobj = re.match(self._VALID_URL, url)
2125 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames are rewritten to the show's /full-episodes/ landing page and
# re-matched so the groups below are populated.
2128 if mobj.group('shortname'):
2129 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2130 url = u'http://www.thedailyshow.com/full-episodes/'
2132 url = u'http://www.colbertnation.com/full-episodes/'
2133 mobj = re.match(self._VALID_URL, url)
2134 assert mobj is not None
# No specific episode given -> we will follow the site redirect to the newest one.
2136 dlNewest = not mobj.group('episode')
2138 epTitle = mobj.group('showname')
2140 epTitle = mobj.group('episode')
2142 req = urllib2.Request(url)
2143 self.report_extraction(epTitle)
2145 htmlHandle = urllib2.urlopen(req)
2146 html = htmlHandle.read()
2147 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2148 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to a concrete episode; re-parse the final URL.
2151 url = htmlHandle.geturl()
2152 mobj = re.match(self._VALID_URL, url)
2154 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2156 if mobj.group('episode') == '':
2157 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2159 epTitle = mobj.group('episode')
# Locate the embedded Flash player URL(s); group 2 is the mtvnservices URI.
2161 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2162 if len(mMovieParams) == 0:
2163 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirects to get the canonical SWF URL.
2166 playerUrl_raw = mMovieParams[0][0]
2167 self.report_player_url(epTitle)
2169 urlHandle = urllib2.urlopen(playerUrl_raw)
2170 playerUrl = urlHandle.geturl()
2171 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2172 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS index of the episode from MTV's feed service.
2175 uri = mMovieParams[0][1]
2176 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2177 self.report_index_download(epTitle)
2179 indexXml = urllib2.urlopen(indexUrl).read()
2180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2186 idoc = xml.etree.ElementTree.fromstring(indexXml)
# Each <item> in the index is one act/segment of the episode.
2187 itemEls = idoc.findall('.//item')
2188 for itemEl in itemEls:
2189 mediaId = itemEl.findall('./guid')[0].text
2190 shortMediaId = mediaId.split(':')[-1]
2191 showId = mediaId.split(':')[-2].replace('.com', '')
2192 officialTitle = itemEl.findall('./title')[0].text
2193 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment mediagen config lists the actual stream renditions.
2195 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2196 urllib.urlencode({'uri': mediaId}))
2197 configReq = urllib2.Request(configUrl)
2198 self.report_config_download(epTitle)
2200 configXml = urllib2.urlopen(configReq).read()
2201 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2202 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2205 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, stream URL) pairs; accumulated into turls (lines elided here).
2207 for rendition in cdoc.findall('.//rendition'):
2208 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2212 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2215 # For now, just pick the highest bitrate
2216 format,video_url = turls[-1]
2218 effTitle = showId + u'-' + epTitle
2223 'upload_date': officialDate,
2225 'stitle': simplify_title(effTitle),
2229 'description': officialTitle,
2230 'player_url': playerUrl
# One info dict per segment; the accumulated list is returned to the downloader.
2233 results.append(info)
2238 class EscapistIE(InfoExtractor):
2239 """Information extractor for The Escapist """
2241 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2242 IE_NAME = u'escapist'
2244 def report_extraction(self, showName):
2245 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2247 def report_config_download(self, showName):
2248 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2250 def _real_extract(self, url):
# Used to unescape HTML entities scraped from the page's <meta> tags.
2251 htmlParser = HTMLParser.HTMLParser()
2253 mobj = re.match(self._VALID_URL, url)
2255 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2257 showName = mobj.group('showname')
2258 videoId = mobj.group('episode')
2260 self.report_extraction(showName)
2262 webPage = urllib2.urlopen(url).read()
2263 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2264 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape metadata from <meta> tags.
# NOTE(review): each .group(1) below assumes its search matched; if a tag is
# missing this raises AttributeError on None rather than reporting an error.
2267 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2268 description = htmlParser.unescape(descMatch.group(1))
2269 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2270 imgUrl = htmlParser.unescape(imgMatch.group(1))
2271 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2272 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries the (percent-encoded) config URL in its query string.
2273 configUrlMatch = re.search('config=(.*)$', playerUrl)
2274 configUrl = urllib2.unquote(configUrlMatch.group(1))
2276 self.report_config_download(showName)
2278 configJSON = urllib2.urlopen(configUrl).read()
2279 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2280 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2283 # Technically, it's JavaScript, not JSON
# NOTE(review): this blanket quote swap also rewrites apostrophes inside string
# values, which can corrupt the payload before json.loads -- known fragility.
2284 configJSON = configJSON.replace("'", '"')
2287 config = json.loads(configJSON)
2288 except (ValueError,), err:
2289 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# Index 1 of the playlist holds the actual video entry -- presumably index 0
# is a preroll/other asset; confirm against a live config.
2292 playlist = config['playlist']
2293 videoUrl = playlist[1]['url']
2298 'uploader': showName,
2299 'upload_date': None,
2301 'stitle': simplify_title(showName),
2304 'thumbnail': imgUrl,
2305 'description': description,
2306 'player_url': playerUrl,
2312 class CollegeHumorIE(InfoExtractor):
2313 """Information extractor for collegehumor.com"""
2315 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2316 IE_NAME = u'collegehumor'
2318 def report_webpage(self, video_id):
2319 """Report information extraction."""
2320 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2322 def report_extraction(self, video_id):
2323 """Report information extraction."""
2324 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2326 def _real_extract(self, url):
# NOTE(review): htmlParser is not used in any line visible in this excerpt.
2327 htmlParser = HTMLParser.HTMLParser()
2329 mobj = re.match(self._VALID_URL, url)
2331 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Public (URL) video id; distinct from the internal player id extracted below.
2333 video_id = mobj.group('videoid')
2335 self.report_webpage(video_id)
2336 request = urllib2.Request(url)
2338 webpage = urllib2.urlopen(request).read()
2339 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2340 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds a second, internal id used by the moogaloop player API.
2343 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2345 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2347 internal_video_id = m.group('internalvideoid')
2351 'internal_id': internal_video_id,
2354 self.report_extraction(video_id)
# The moogaloop endpoint returns the video metadata as XML.
2355 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2357 metaXml = urllib2.urlopen(xmlUrl).read()
2358 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2359 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2362 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Each findall(...)[0] raises IndexError if the element is missing; the
# handler for that lives in the elided lines ending at the message below.
2364 videoNode = mdoc.findall('./video')[0]
2365 info['description'] = videoNode.findall('./description')[0].text
2366 info['title'] = videoNode.findall('./caption')[0].text
2367 info['stitle'] = simplify_title(info['title'])
2368 info['url'] = videoNode.findall('./file')[0].text
2369 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Derive extension from the media URL; also reused as the 'format' label.
2370 info['ext'] = info['url'].rpartition('.')[2]
2371 info['format'] = info['ext']
2373 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2379 class XVideosIE(InfoExtractor):
2380 """Information extractor for xvideos.com"""
2382 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2383 IE_NAME = u'xvideos'
2385 def report_webpage(self, video_id):
2386 """Report information extraction."""
2387 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2389 def report_extraction(self, video_id):
2390 """Report information extraction."""
2391 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2393 def _real_extract(self, url):
# NOTE(review): htmlParser is not used in any line visible in this excerpt.
2394 htmlParser = HTMLParser.HTMLParser()
2396 mobj = re.match(self._VALID_URL, url)
2398 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2400 video_id = mobj.group(1).decode('utf-8')
2402 self.report_webpage(video_id)
# Re-fetch via the canonical URL form so trailing slug/query parts are dropped.
2404 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2406 webpage = urllib2.urlopen(request).read()
2407 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2408 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2411 self.report_extraction(video_id)
# The player receives the percent-encoded media URL as the flv_url parameter.
2415 mobj = re.search(r'flv_url=(.+?)&', webpage)
2417 self._downloader.trouble(u'ERROR: unable to extract video url')
2419 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title: everything in <title> before the trailing " - XVID..." suffix.
2423 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2425 self._downloader.trouble(u'ERROR: unable to extract video title')
2427 video_title = mobj.group(1).decode('utf-8')
2430 # Extract video thumbnail
# NOTE(review): the dots in 'img.*?.xvideos.com' and before 'jpg' are
# unescaped and match any character; harmless here but imprecise.
2431 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2433 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2435 video_thumbnail = mobj.group(1).decode('utf-8')
2441 'upload_date': None,
2442 'title': video_title,
2443 'stitle': simplify_title(video_title),
2446 'thumbnail': video_thumbnail,
2447 'description': None,
2454 class SoundcloudIE(InfoExtractor):
2455 """Information extractor for soundcloud.com
2456 To access the media, the uid of the song and a stream token
2457 must be extracted from the page source and the script must make
2458 a request to media.soundcloud.com/crossdomain.xml. Then
2459 the media can be grabbed by requesting from an url composed
2460 of the stream token and uid
2463 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2464 IE_NAME = u'soundcloud'
2466 def __init__(self, downloader=None):
2467 InfoExtractor.__init__(self, downloader)
2469 def report_webpage(self, video_id):
2470 """Report information extraction."""
2471 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2473 def report_extraction(self, video_id):
2474 """Report information extraction."""
2475 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2477 def _real_extract(self, url):
# NOTE(review): htmlParser is not used in any line visible in this excerpt.
2478 htmlParser = HTMLParser.HTMLParser()
2480 mobj = re.match(self._VALID_URL, url)
2482 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2485 # extract uploader (which is in the url)
2486 uploader = mobj.group(1).decode('utf-8')
2487 # extract simple title (uploader + slug of song title)
2488 slug_title = mobj.group(2).decode('utf-8')
2489 simple_title = uploader + '-' + slug_title
2491 self.report_webpage('%s/%s' % (uploader, slug_title))
2493 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2495 webpage = urllib2.urlopen(request).read()
2496 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2497 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2500 self.report_extraction('%s/%s' % (uploader, slug_title))
2502 # extract uid and stream token that soundcloud hands out for access
2503 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2505 video_id = mobj.group(1)
2506 stream_token = mobj.group(2)
2508 # extract unsimplified title
2509 mobj = re.search('"title":"(.*?)",', webpage)
# NOTE(review): 'title' extracted here is never used in the visible return
# dict below -- the 'title' key is filled with simple_title instead. Confirm
# whether the scraped title was meant to be used.
2511 title = mobj.group(1)
2513 # construct media url (with uid/token)
2514 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2515 mediaURL = mediaURL % (video_id, stream_token)
2518 description = u'No description available'
2519 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2521 description = mobj.group(1)
# Upload date: parsed from the human-readable "pretty date" on the page.
2525 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2528 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
# NOTE(review): broad 'except Exception' -- deliberate best-effort, date stays unset.
2529 except Exception, e:
2532 # for soundcloud, a request to a cross domain is required for cookies
# BUG(review): urllib2.Request's second positional argument is the POST *data*,
# not the headers dict -- passing std_headers here would issue a POST with the
# header dict as its body. Headers belong in the third argument
# (urllib2.Request(url, None, std_headers)). Fix when this path is testable.
2533 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2536 'id': video_id.decode('utf-8'),
2538 'uploader': uploader.decode('utf-8'),
2539 'upload_date': upload_date,
2540 'title': simple_title.decode('utf-8'),
2541 'stitle': simple_title.decode('utf-8'),
2545 'description': description.decode('utf-8')
2549 class InfoQIE(InfoExtractor):
2550 """Information extractor for infoq.com"""
2552 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2555 def report_webpage(self, video_id):
2556 """Report information extraction."""
2557 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2559 def report_extraction(self, video_id):
2560 """Report information extraction."""
2561 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2563 def _real_extract(self, url):
# NOTE(review): htmlParser is not used in any line visible in this excerpt.
2564 htmlParser = HTMLParser.HTMLParser()
2566 mobj = re.match(self._VALID_URL, url)
2568 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2571 self.report_webpage(url)
2573 request = urllib2.Request(url)
2575 webpage = urllib2.urlopen(request).read()
2576 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2577 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2580 self.report_extraction(url)
# The page stores the media path base64-encoded in the jsclassref attribute;
# decode it and build the RTMPE stream URL from it.
2584 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2586 self._downloader.trouble(u'ERROR: unable to extract video url')
2588 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2592 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2594 self._downloader.trouble(u'ERROR: unable to extract video title')
2596 video_title = mobj.group(1).decode('utf-8')
2598 # Extract description
2599 video_description = u'No description available.'
2600 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2601 if mobj is not None:
2602 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the media file name at the end of the URL.
2604 video_filename = video_url.split('/')[-1]
2605 video_id, extension = video_filename.split('.')
2611 'upload_date': None,
2612 'title': video_title,
2613 'stitle': simplify_title(video_title),
2615 'format': extension, # Extension is always(?) mp4, but seems to be flv
2617 'description': video_description,
2623 class MixcloudIE(InfoExtractor):
2624 """Information extractor for www.mixcloud.com"""
2625 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2626 IE_NAME = u'mixcloud'
2628 def __init__(self, downloader=None):
2629 InfoExtractor.__init__(self, downloader)
2631 def report_download_json(self, file_id):
2632 """Report JSON download."""
2633 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2635 def report_extraction(self, file_id):
2636 """Report information extraction."""
2637 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2639 def get_urls(self, jsonData, fmt, bitrate='best'):
2640 """Get urls from 'audio_formats' section in json"""
2643 bitrate_list = jsonData[fmt]
# 'best' or an unknown bitrate falls back to the highest available one.
2644 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2645 bitrate = max(bitrate_list) # select highest
2647 url_list = jsonData[fmt][bitrate]
# Some entries carry a flat URL list with no per-bitrate nesting.
2648 except TypeError: # we have no bitrate info.
2649 url_list = jsonData[fmt]
2653 def check_urls(self, url_list):
2654 """Returns 1st active url from list"""
# Probe each candidate URL; network errors mean "try the next one".
2655 for url in url_list:
2657 urllib2.urlopen(url)
2659 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# (Python 2 'print' statements: list every format/bitrate/extension combination.)
2664 def _print_formats(self, formats):
2665 print 'Available formats:'
2666 for fmt in formats.keys():
2667 for b in formats[fmt]:
2669 ext = formats[fmt][b][0]
2670 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2671 except TypeError: # we have no bitrate info
2672 ext = formats[fmt][0]
2673 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2676 def _real_extract(self, url):
2677 mobj = re.match(self._VALID_URL, url)
2679 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2681 # extract uploader & filename from url
2682 uploader = mobj.group(1).decode('utf-8')
2683 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2685 # construct API request
2686 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2687 # retrieve .json file with links to files
2688 request = urllib2.Request(file_url)
2690 self.report_download_json(file_url)
2691 jsonData = urllib2.urlopen(request).read()
2692 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2693 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2697 json_data = json.loads(jsonData)
2698 player_url = json_data['player_swf_url']
2699 formats = dict(json_data['audio_formats'])
2701 req_format = self._downloader.params.get('format', None)
# --list-formats: print the table and stop without downloading.
2704 if self._downloader.params.get('listformats', None):
2705 self._print_formats(formats)
# Default/best: take the first format whose URL list yields a live URL.
2708 if req_format is None or req_format == 'best':
2709 for format_param in formats.keys():
2710 url_list = self.get_urls(formats, format_param)
2712 file_url = self.check_urls(url_list)
2713 if file_url is not None:
2716 if req_format not in formats.keys():
2717 self._downloader.trouble(u'ERROR: format is not available')
2720 url_list = self.get_urls(formats, req_format)
2721 file_url = self.check_urls(url_list)
2722 format_param = req_format
2725 'id': file_id.decode('utf-8'),
2726 'url': file_url.decode('utf-8'),
2727 'uploader': uploader.decode('utf-8'),
2728 'upload_date': u'NA',
2729 'title': json_data['name'],
2730 'stitle': simplify_title(json_data['name']),
2731 'ext': file_url.split('.')[-1].decode('utf-8'),
2732 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2733 'thumbnail': json_data['thumbnail_url'],
2734 'description': json_data['description'],
2735 'player_url': player_url.decode('utf-8'),
2738 class StanfordOpenClassroomIE(InfoExtractor):
2739 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page (course
# only), and the site root; each takes a different branch in _real_extract.
2741 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2742 IE_NAME = u'stanfordoc'
2744 def report_download_webpage(self, objid):
2745 """Report information extraction."""
2746 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2748 def report_extraction(self, video_id):
2749 """Report information extraction."""
2750 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2752 def _real_extract(self, url):
2753 mobj = re.match(self._VALID_URL, url)
2755 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2758 if mobj.group('course') and mobj.group('video'): # A specific video
2759 course = mobj.group('course')
2760 video = mobj.group('video')
2762 'id': simplify_title(course + '_' + video),
2765 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the course's videos.
2766 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2767 xmlUrl = baseUrl + video + '.xml'
2769 metaXml = urllib2.urlopen(xmlUrl).read()
2770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2771 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2773 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2775 info['title'] = mdoc.findall('./title')[0].text
# videoFile is relative; prepend the course's videos/ base URL.
2776 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2778 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2780 info['stitle'] = simplify_title(info['title'])
2781 info['ext'] = info['url'].rpartition('.')[2]
2782 info['format'] = info['ext']
2784 elif mobj.group('course'): # A course page
2785 unescapeHTML = HTMLParser.HTMLParser().unescape
2787 course = mobj.group('course')
2789 'id': simplify_title(course),
2793 self.report_download_webpage(info['id'])
2795 coursepage = urllib2.urlopen(url).read()
2796 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2797 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
# Course title: the page's <h1>, falling back to the id when absent.
2800 m = re.search('<h1>([^<]+)</h1>', coursepage)
2802 info['title'] = unescapeHTML(m.group(1))
2804 info['title'] = info['id']
2805 info['stitle'] = simplify_title(info['title'])
2807 m = re.search('<description>([^<]+)</description>', coursepage)
2809 info['description'] = unescapeHTML(m.group(1))
# Collect the course's video page links (deduplicated, order preserved) and
# recurse into each via self.extract().
2811 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2814 'type': 'reference',
2815 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2819 for entry in info['list']:
2820 assert entry['type'] == 'reference'
2821 results += self.extract(entry['url'])
# Root page: enumerate all course pages and recurse into each of them.
2825 unescapeHTML = HTMLParser.HTMLParser().unescape
2828 'id': 'Stanford OpenClassroom',
2832 self.report_download_webpage(info['id'])
2833 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2835 rootpage = urllib2.urlopen(rootURL).read()
2836 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2837 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2840 info['title'] = info['id']
2841 info['stitle'] = simplify_title(info['title'])
2843 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2846 'type': 'reference',
2847 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2852 for entry in info['list']:
2853 assert entry['type'] == 'reference'
2854 results += self.extract(entry['url'])
2857 class MTVIE(InfoExtractor):
2858 """Information extractor for MTV.com"""
2860 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2863 def report_webpage(self, video_id):
2864 """Report information extraction."""
2865 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2867 def report_extraction(self, video_id):
2868 """Report information extraction."""
2869 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2871 def _real_extract(self, url):
2872 mobj = re.match(self._VALID_URL, url)
2874 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme-less URLs are accepted by the pattern; normalize before fetching.
2876 if not mobj.group('proto'):
2877 url = 'http://' + url
2878 video_id = mobj.group('videoid')
2879 self.report_webpage(video_id)
2881 request = urllib2.Request(url)
2883 webpage = urllib2.urlopen(request).read()
2884 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2885 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page is ISO-8859-1; song/artist come from MTV's own meta tags.
2888 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2890 self._downloader.trouble(u'ERROR: unable to extract song name')
2892 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2893 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2895 self._downloader.trouble(u'ERROR: unable to extract performer')
2897 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2898 video_title = performer + ' - ' + song_name
2900 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): this error message is missing the word "extract"
# ('unable to mtvn_uri'); fix the string in a code change.
2902 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2904 mtvn_uri = mobj.group(1)
2906 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2908 self._downloader.trouble(u'ERROR: unable to extract content id')
2910 content_id = mobj.group(1)
# mediaGen endpoint returns the rendition list for this uri/content id pair.
2912 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2913 self.report_extraction(video_id)
2914 request = urllib2.Request(videogen_url)
2916 metadataXml = urllib2.urlopen(request).read()
2917 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2918 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2921 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2922 renditions = mdoc.findall('.//rendition')
2924 # For now, always pick the highest quality.
2925 rendition = renditions[-1]
# Build a format label like 'mp4-640x480_1200' from the rendition attributes.
2928 _,_,ext = rendition.attrib['type'].partition('/')
2929 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2930 video_url = rendition.find('./src').text
2932 self._downloader.trouble('Invalid rendition field.')
2938 'uploader': performer,
2939 'title': video_title,
2940 'stitle': simplify_title(video_title),