2 # -*- coding: utf-8 -*-
17 import cStringIO as StringIO
21 # parse_qs was moved from the cgi module to the urlparse module recently.
23 from urlparse import parse_qs
25 from cgi import parse_qs
33 import xml.etree.ElementTree
34 except ImportError: # Python<2.5: Not officially supported, but let it slip
35 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    stitle:         Simplified title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Real initialization (login, cookies, ...) is deferred until the
        # extractor is actually used; see initialize().
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Guarded so repeated extract() calls initialize at most once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): several lines of this class are not visible in this
    # excerpt; gaps are marked with "[... not shown]" comments instead of
    # being guessed at.

    # Group 1 is the scheme/host/path prefix; group 2 captures the video id.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces English pages so the scraping regexes below keep matching.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag -> container extension. [most entries not shown]
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # [closing brace of _video_extensions not shown]
    # Maps itag -> human-readable dimensions. [dictionary body not shown]
    _video_dimensions = {
    # [dictionary body and closing brace not shown]

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SubRip (.srt) text.

        NOTE(review): the accumulator initialization, the numeric
        conversion of 'start', the SRT sequence-number line and the final
        return are not visible in this excerpt.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without a duration get a default
            # [conversion of 'start' to a number not shown]
            end = start + float(dur)
            # Render SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
            caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
            # [SRT sequence-number line not shown]
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        # [return of the accumulated string not shown]

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # [loop header over 'formats' not shown]
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:
            # [early return not shown]

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [try: and the branch consuming a found .netrc entry not shown]
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: continue unauthenticated
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force the English interface so later scraping works
        request = urllib2.Request(self._LANG_URL)
        # [try: and self.report_lang() not shown]
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # [no-credentials guard/return not shown]

        # [opening of the login_form dict literal not shown]
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        # [closing brace not shown]
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # [try: and self.report_login() not shown]
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means authentication failed
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # [opening of the age_form dict literal not shown]
            'action_confirm': 'Confirm',
        # [closing brace not shown]
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        # [try: not shown]
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Age confirmation failure is fatal (ERROR, not WARNING)
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Fetch watch page + get_video_info and feed process_info()."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # [None-check not shown]
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (English, age-gate pre-verified)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # [try: not shown]
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # [None-check / fallback branch not shown]
            # Un-escape the JSON-escaped URL
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' values until one of them yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            # [try: not shown]
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    # [break not shown]
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # [else: not shown]
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = simplify_title(video_title)

        # thumbnail image (missing thumbnail is only a warning)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # [fallback assignment not shown]
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page, several formats tolerated
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # [None-check not shown]
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # [try: not shown]
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                # [except/else handling not shown]

        # description, with a default when nothing can be scraped
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
        # [None-check not shown]
            video_description = mobj.group(1).decode('utf-8')
        # [surrounding control flow not shown]
        html_parser = lxml.etree.HTMLParser(encoding='utf-8')
        vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
        video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
        # TODO use another parser

        # closed captions (optional, controlled by 'writesubtitles')
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            # [try: not shown]
                srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
            # [empty-list guard not shown]
            # Language priority: explicit option > English > first available
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                # [assignment / else: not shown]
                srt_lang = srt_lang_list[0]
            if not srt_lang in srt_lang_list:
                self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
            # [else: not shown]
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
                # [try: not shown]
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            # [else: not shown]
                self._downloader.trouble(u'WARNING: video has no closed captions')

        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Truncate the quality list at the requested ceiling
                format_list = available_formats[available_formats.index(format_limit):]
            # [else: not shown]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [else: not shown]
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # [membership check not shown]
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
        # [else: not shown]
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            video_extension = self._video_extensions.get(format_param, 'flv')

            # [try: not shown]
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description,
                    'player_url': player_url,
                    'subtitles': video_subtitles
                # [closing brace of the dict/call not shown]
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): lines elided from this excerpt are marked "[... not shown]".

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Used to delegate "yt-" prefixed Metacafe ids to the YouTube extractor.
        self._youtube_ie = youtube_ie

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and post the family-filter opt-out."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        # [try: not shown]
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # [opening of the disclaimer_form dict literal not shown]
            'submit': "Continue - I'm over 18",
        # [closing brace not shown]
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        # [try: not shown]
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the YouTube extractor
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            # [return not shown]

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # [try: not shown]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # NOTE(review): message reads "unable retrieve" (sic) in the source
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # [direct-URL branch header not shown]
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # [no-key branch not shown]
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # [else: flashvars fallback — header not shown]
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # [None-check not shown]
                self._downloader.trouble(u'ERROR: unable to extract media URL')

            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            # [None-check not shown]
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Un-escape JSON-escaped slashes
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # [try: not shown]
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            # [remaining entries and closing brace not shown]
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): lines elided from this excerpt are marked "[... not shown]".

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disabling the family filter exposes age-restricted videos
        request.add_header('Cookie', 'family_filter=off')
        # [try: not shown]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # NOTE(review): message reads "unable retrieve" (sic) in the source
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL
        # [assignment of video_url not shown]

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # [try: not shown]
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            # [remaining entries and closing brace not shown]
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): lines elided from this excerpt are marked "[... not shown]".

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, description and optional thumbnail."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        # [try: not shown]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # [branch structure not shown: fall back to the flash URL]
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            # [None-check not shown]
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Undo the \xNN escaping of '=' and '&'
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')
        # [assignment of video_url not shown]

        mobj = re.search(r'<title>(.*)</title>', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on the search page, so search for the id
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            # [try: not shown]
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            # [None-check not shown]
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            # [placeholder assignment not shown]

        # [try: not shown]
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                # ['uploader' entry not shown]
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            # [remaining entries and closing brace not shown]
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): lines elided from this excerpt are marked "[... not shown]".

    # The video id is the .flv file name from the 'current' query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # [try: not shown]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # [assignment of video_url not shown]

        # Title and uploader come from the same <title> pattern (groups 1 and 2)
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # [try: not shown]
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            # [remaining entries and closing brace not shown]
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
907 def __init__(self, downloader=None):
908 InfoExtractor.__init__(self, downloader)
910 def report_download_webpage(self, video_id):
911 """Report webpage download."""
912 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
914 def report_extraction(self, video_id):
915 """Report information extraction."""
916 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract a Yahoo! video.

        Non-/watch/ URLs are rewritten to the canonical /watch/ form and the
        method recurses once with new_video=False.
        NOTE(review): this method continues past the end of this excerpt;
        elided lines are marked "[... not shown]".
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            # [try: not shown]
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # [None-check not shown]
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # [None-check not shown]
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # [try: not shown]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = simplify_title(video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) captures 'people'/'profile', not the name;
        # the uploader name is group(2) — looks wrong, confirm before relying
        # on this field.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # [try: not shown]
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # [None-check not shown]
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # Un-escape HTML entities embedded in the URL
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # [try: not shown]
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                # ['url' entry not shown]
                'uploader': video_uploader,
1037 'upload_date': u'NA',
1038 'title': video_title,
1039 'stitle': simple_title,
1040 'ext': video_extension.decode('utf-8'),
1041 'thumbnail': video_thumbnail.decode('utf-8'),
1042 'description': video_description,
1043 'thumbnail': video_thumbnail,
1046 except UnavailableVideoError:
1047 self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Downloads the video page, extracts the embedded player ``config``
    JSON, and feeds the resolved media URL plus metadata to the
    FileDownloader via process_info().

    NOTE(review): this excerpt elides several flow lines (``try:``,
    ``if mobj is None:``, ``return``); the bare error calls below are
    the bodies of those elided guards.
    """

    # _VALID_URL matches Vimeo URLs (www/player subdomains, optional
    # groups/ and videos/ path segments; group 1 is the numeric clip id)
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # error path for the (elided) "mobj is None" guard
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (embedded between ' = {config:' and ',assets:'
        # markers in the page's inline JavaScript)
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # error path for the (elided) JSON-parse failure handler
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]
        simple_title = simplify_title(video_title)

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description: first from the <meta> tag, then
        # (overriding it) from the #description element via lxml
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
        if mobj is not None:
            video_description = mobj.group(1)
        html_parser = lxml.etree.HTMLParser()
        vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
        video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
        # TODO use another parser

        # Extract upload date
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        # codecs are tried in preference order: (codec name, file extension)
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
        # error path for the (elided) "no codec matched" branch
        self._downloader.trouble(u'ERROR: no known codec found')

        # Build the play_redirect URL that resolves to the actual media file
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, quality, video_codec.upper())

        # Process video information
        self._downloader.process_info({
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific extractor matched: follows URL-shortener
    redirects, then scrapes the page for a JW-Player/SWFObject style
    ``file=``/``source=`` media URL.

    NOTE(review): this excerpt elides several flow lines (``try:``,
    ``if mobj is None:``, ``return``); the bare error calls below are
    the bodies of those elided guards.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # warn first, since the generic extractor is a best-effort fallback
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # Request subclass that issues HEAD so we can detect redirects
        # without downloading the body
        class HeadRequest(urllib2.Request):
            def get_method(self):
                # (method body elided in this excerpt; presumably returns "HEAD")

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # drop body-related headers: a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                # non-redirect codes propagate as HTTPError
                raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a bare opener from explicit handlers (no cookies, no proxy)
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Unchanged URL means no redirect happened
        if url == new_url: return False

        # Restart the extraction chain on the redirect target
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])

    def _real_extract(self, url):
        if self._test_redirect(url): return

        # At this point we have a new video
        self._downloader.increment_downloads()

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # error path for the (elided) "no regex matched" guard
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ``ytsearch[N|all]:query`` pseudo-URLs: queries the GData
    API page by page and delegates each found id to the wrapped
    YoutubeIE instance.

    NOTE(review): this excerpt elides several flow lines (``try:``,
    ``if``/``return``); indentation below reconstructs the apparent
    structure.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API endpoint; %s is the quoted query, %i the 1-based start index
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    # hard cap imposed by the API
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # the per-video extractor that actually performs each download
        self._youtube_ie = youtube_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # error path for the (elided) "mobj is None" guard
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # split "ytsearchN" prefix from the query text
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            # numeric-prefix branch (parse lines elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # page through the API, 50 results at a time, until `limit` reached
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # never ask for more than the API reports as available
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles ``gvsearch[N|all]:query`` pseudo-URLs by scraping the
    Google Video search result pages and delegating each docid to the
    wrapped GoogleIE instance.

    NOTE(review): this excerpt elides several flow lines (``try:``,
    ``if``/``return``); indentation below reconstructs the apparent
    structure.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    # %s slots: quoted query, 0-based result offset (pagenum*10)
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # presence of the "next page" control in the result HTML
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # per-video extractor used for each found docid
        self._google_ie = google_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # error path for the (elided) "mobj is None" guard
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            # numeric-prefix branch (parse lines elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        # no "next page" link -> flush what we collected and stop
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles ``yvsearch[N|all]:query`` pseudo-URLs by scraping Yahoo!
    Video search result pages and delegating each watch id to the
    wrapped YahooIE instance.

    NOTE(review): this excerpt elides several flow lines (``try:``,
    ``if``/``return``); indentation below reconstructs the apparent
    structure.
    """
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    # %s slots: quoted query, page offset
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # watch ids have the form <uploader-id>/<video-id>
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # per-video extractor used for each found id
        self._yahoo_ie = yahoo_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # error path for the (elided) "mobj is None" guard
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            # numeric-prefix branch (parse lines elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # dedupe across pages: result pages may repeat ids
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        # no "next page" link -> flush what we collected and stop
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the playlist pages collecting video ids, applies the
    playliststart/playlistend window, and delegates each id to the
    wrapped YoutubeIE instance.

    NOTE(review): this excerpt elides several flow lines (``try:``,
    ``if``/``else``/``return``); indentation below reconstructs the
    apparent structure.
    """

    # group 1: list-type designator (p/a/list); group 2: playlist id;
    # group 3: optional single-video id inside the playlist URL
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # %s slots: access page, prefix param name, playlist id, page number
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    IE_NAME = u'youtube:playlist'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # per-video extractor used for each collected id
        self._youtube_ie = youtube_ie

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # error path for the (elided) "mobj is None" guard
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # URL pointed at a single video inside a playlist: extract just it
        if mobj.group(3) is not None:
            self._youtube_ie.extract(mobj.group(3))

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
            # (else branch; 'else:' line elided in this excerpt)
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        # page loop (loop header / accumulator init elided in this excerpt)
        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # stop paging when no "Next" link remains (break elided)
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # apply the user-requested playlist window (1-based start)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            # (else branch; 'else:' line elided in this excerpt)
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through a user's uploads via the GData API, applies the
    playliststart/playlistend window, and delegates each id to the
    wrapped YoutubeIE instance.

    NOTE(review): this excerpt elides several flow lines (``try:``,
    loop headers, ``break``/``return``); indentation below reconstructs
    the apparent structure.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # max results per GData query
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # per-video extractor used for each collected id
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # error path for the (elided) "mobj is None" guard
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # GData uses 1-based start indices
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        # apply the user-requested playlist window (1-based start)
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            # (else branch; 'else:' line elided in this excerpt)
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Simulates pressing the 'Free download' button, then scrapes the
    real fileshare URL and the file title out of the response page.

    NOTE(review): this excerpt elides several flow lines (``try:``,
    ``if``/``return``); the bare error calls below are the bodies of
    those elided guards.
    """

    # (?#locale) is a regex comment: the '../' segment is a locale prefix
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 emulates the button click)
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # collapse whitespace in the site's restriction notice
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                # fallback error ('else:' line elided in this excerpt)
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # error path for the (elided) "mobj is None" guard
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
1824 class FacebookIE(InfoExtractor):
1825 """Information Extractor for Facebook"""
1827 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1828 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1829 _NETRC_MACHINE = 'facebook'
1830 _available_formats = ['video', 'highqual', 'lowqual']
1831 _video_extensions = {
1836 IE_NAME = u'facebook'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader (see InfoExtractor)."""
    InfoExtractor.__init__(self, downloader)
def _reporter(self, message):
    """Prefix a status line with the [facebook] tag and display it."""
    tagged = u'[facebook] %s' % message
    self._downloader.to_screen(tagged)
def report_login(self):
    """Report attempt to log in."""
    self._reporter(u'Logging in')
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._reporter(u'%s: Downloading video webpage' % video_id)
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._reporter(u'%s: Extracting video information' % video_id)
def _parse_page(self, video_webpage):
    """Extract video information from page

    Scrapes metadata fields and per-format media URLs out of the
    inline JavaScript on a Facebook video page.

    NOTE(review): the dict-literal close and the ``video_info``/
    ``video_urls`` initializers and final ``return`` are elided in
    this excerpt.
    """
    # regex per metadata field we want to capture (group 1 is the value)
    data = {'title': r'\("video_title", "(.*?)"\)',
        'description': r'<div class="datawrap">(.*?)</div>',
        'owner': r'\("video_owner_name", "(.*?)"\)',
        'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
    for piece in data.keys():
        mobj = re.search(data[piece], video_webpage)
        if mobj is not None:
            # values are JS-escaped and URL-quoted inside the page
            video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

    # one media URL per known format (video/highqual/lowqual)
    for fmt in self._available_formats:
        mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
        if mobj is not None:
            # URL is in a Javascript segment inside an escaped Unicode format within
            # the generally utf-8 page
            video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
    video_info['video_urls'] = video_urls
def _real_initialize(self):
    """Log in to Facebook before extraction.

    Credentials come from --username/--password or, failing that,
    from the user's .netrc. Login failure is reported as a WARNING
    and extraction proceeds unauthenticated.

    NOTE(review): several flow lines (``return``, ``try:``,
    credential assignments, the login form construction) are elided
    in this excerpt.
    """
    # nothing to do without a downloader (early return elided)
    if self._downloader is None:

    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        useremail = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        if info is not None:
            # (credential unpack elided; 'else:' raises below)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

    # no credentials available: skip login (early return elided)
    if useremail is None:

    # POST the login form (login_form construction elided in this excerpt)
    request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
    login_results = urllib2.urlopen(request).read()
    # a login form in the response means authentication failed
    if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
        self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Facebook extractor entry point: download the watch page, parse it into a
# dict with self._parse_page, then emit one info dict per selected format.
# NOTE(review): this excerpt is lossy -- the embedded original line numbers
# skip values (e.g. 1928 -> 1930), so lines such as "if mobj is None:",
# "try:" and "return" are not visible here; comments describe only the
# visible statements.
1927 def _real_extract(self, url):
1928 mobj = re.match(self._VALID_URL, url)
1930 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1932 video_id = mobj.group('ID')
# Fetch the video webpage over HTTPS.
1935 self.report_video_webpage_download(video_id)
1936 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1938 page = urllib2.urlopen(request)
1939 video_webpage = page.read()
1940 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1941 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1944 # Start extracting information
1945 self.report_information_extraction(video_id)
1947 # Extract information
1948 video_info = self._parse_page(video_webpage)
# 'owner' and 'title' are required keys; their absence is reported as an error.
1951 if 'owner' not in video_info:
1952 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1954 video_uploader = video_info['owner']
1957 if 'title' not in video_info:
1958 self._downloader.trouble(u'ERROR: unable to extract video title')
1960 video_title = video_info['title']
1961 video_title = video_title.decode('utf-8')
1962 video_title = sanitize_title(video_title)
1964 simple_title = simplify_title(video_title)
# Thumbnail is optional: warn and fall back to an empty string.
1967 if 'thumbnail' not in video_info:
1968 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1969 video_thumbnail = ''
1971 video_thumbnail = video_info['thumbnail']
# Upload date: parse an RFC-2822 style date string into YYYYMMDD.
1975 if 'upload_date' in video_info:
1976 upload_time = video_info['upload_date']
1977 timetuple = email.utils.parsedate_tz(upload_time)
1978 if timetuple is not None:
1980 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1985 video_description = video_info.get('description', 'No description available.')
# Format selection honours the downloader's 'format' / 'format_limit'
# params: None -> best, 'worst' -> last known format, '-1' -> all formats,
# otherwise the specific requested format.
1987 url_map = video_info['video_urls']
1988 if len(url_map.keys()) > 0:
1989 # Decide which formats to download
1990 req_format = self._downloader.params.get('format', None)
1991 format_limit = self._downloader.params.get('format_limit', None)
1993 if format_limit is not None and format_limit in self._available_formats:
1994 format_list = self._available_formats[self._available_formats.index(format_limit):]
1996 format_list = self._available_formats
1997 existing_formats = [x for x in format_list if x in url_map]
1998 if len(existing_formats) == 0:
1999 self._downloader.trouble(u'ERROR: no known formats available for video')
2001 if req_format is None:
2002 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2003 elif req_format == 'worst':
2004 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2005 elif req_format == '-1':
2006 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2009 if req_format not in url_map:
2010 self._downloader.trouble(u'ERROR: requested format not available')
2012 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Hand one info dict per selected (format, url) pair to the downloader.
2014 for format_param, video_real_url in video_url_list:
2016 # At this point we have a new video
2017 self._downloader.increment_downloads()
2020 video_extension = self._video_extensions.get(format_param, 'mp4')
2023 # Process video information
2024 self._downloader.process_info({
2025 'id': video_id.decode('utf-8'),
2026 'url': video_real_url.decode('utf-8'),
2027 'uploader': video_uploader.decode('utf-8'),
2028 'upload_date': upload_date,
2029 'title': video_title,
2030 'stitle': simple_title,
2031 'ext': video_extension.decode('utf-8'),
2032 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2033 'thumbnail': video_thumbnail.decode('utf-8'),
2034 'description': video_description.decode('utf-8'),
2037 except UnavailableVideoError, err:
2038 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): lossy excerpt -- embedded original line numbers jump, so
# "try:" / "if mobj is None:" / "return" lines are not visible here.
2040 class BlipTVIE(InfoExtractor):
2041 """Information extractor for blip.tv"""
2043 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Extracts the filename extension from a media URL.
2044 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2045 IE_NAME = u'blip.tv'
2047 def report_extraction(self, file_id):
2048 """Report information extraction."""
2049 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2051 def report_direct_download(self, title):
2052 """Report information extraction."""
2053 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2055 def _real_extract(self, url):
2056 mobj = re.match(self._VALID_URL, url)
2058 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for a JSON description of the page by appending skin=json.
2065 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2066 request = urllib2.Request(json_url)
2067 self.report_extraction(mobj.group(1))
2070 urlh = urllib2.urlopen(request)
# If the server answers with a video/* Content-Type, the URL itself is the
# media file: derive id/title/ext from the URL basename instead of JSON.
2071 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2072 basename = url.split('/')[-1]
2073 title,ext = os.path.splitext(basename)
2074 title = title.decode('UTF-8')
2075 ext = ext.replace('.', '')
2076 self.report_direct_download(title)
2081 'stitle': simplify_title(title),
2085 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2086 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# Regular case: parse the JSON payload ('Post' wrapper is optional).
2088 if info is None: # Regular URL
2090 json_code = urlh.read()
2091 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2092 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2096 json_data = json.loads(json_code)
2097 if 'Post' in json_data:
2098 data = json_data['Post']
# blip.tv datestamps look like '%m-%d-%y %H:%M%p'; normalised to YYYYMMDD.
2102 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2103 video_url = data['media']['url']
2104 umobj = re.match(self._URL_EXT, video_url)
2106 raise ValueError('Can not determine filename extension')
2107 ext = umobj.group(1)
2110 'id': data['item_id'],
2112 'uploader': data['display_name'],
2113 'upload_date': upload_date,
2114 'title': data['title'],
2115 'stitle': simplify_title(data['title']),
2117 'format': data['media']['mimeType'],
2118 'thumbnail': data['thumbnailUrl'],
2119 'description': data['description'],
2120 'player_url': data['embedUrl']
2122 except (ValueError,KeyError), err:
2123 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2126 self._downloader.increment_downloads()
2129 self._downloader.process_info(info)
2130 except UnavailableVideoError, err:
2131 self._downloader.trouble(u'\nERROR: unable to download video')
2134 class MyVideoIE(InfoExtractor):
2135 """Information Extractor for myvideo.de."""
2137 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2138 IE_NAME = u'myvideo'
2140 def __init__(self, downloader=None):
2141 InfoExtractor.__init__(self, downloader)
2143 def report_download_webpage(self, video_id):
2144 """Report webpage download."""
2145 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2147 def report_extraction(self, video_id):
2148 """Report information extraction."""
2149 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2151 def _real_extract(self,url):
2152 mobj = re.match(self._VALID_URL, url)
2154 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2157 video_id = mobj.group(1)
2160 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2162 self.report_download_webpage(video_id)
2163 webpage = urllib2.urlopen(request).read()
2164 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2165 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2168 self.report_extraction(video_id)
2169 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2172 self._downloader.trouble(u'ERROR: unable to extract media URL')
2174 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2176 mobj = re.search('<title>([^<]+)</title>', webpage)
2178 self._downloader.trouble(u'ERROR: unable to extract title')
2181 video_title = mobj.group(1)
2182 video_title = sanitize_title(video_title)
2184 simple_title = simplify_title(video_title)
2187 self._downloader.process_info({
2191 'upload_date': u'NA',
2192 'title': video_title,
2193 'stitle': simple_title,
2198 except UnavailableVideoError:
2199 self._downloader.trouble(u'\nERROR: Unable to download video')
# NOTE(review): lossy excerpt -- embedded original line numbers jump, so
# "try:" / "if mobj is None:" / "return" lines are not visible here.
2201 class ComedyCentralIE(InfoExtractor):
2202 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a short alias (":tds", ":colbert", ...) or a full
# full-episodes URL on thedailyshow.com / colbertnation.com.
2204 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2205 IE_NAME = u'comedycentral'
2207 def report_extraction(self, episode_id):
2208 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2210 def report_config_download(self, episode_id):
2211 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2213 def report_index_download(self, episode_id):
2214 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2216 def report_player_url(self, episode_id):
2217 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2219 def _real_extract(self, url):
2220 mobj = re.match(self._VALID_URL, url)
2222 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand short aliases to the show's full-episodes landing page and rematch.
2225 if mobj.group('shortname'):
2226 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2227 url = u'http://www.thedailyshow.com/full-episodes/'
2229 url = u'http://www.colbertnation.com/full-episodes/'
2230 mobj = re.match(self._VALID_URL, url)
2231 assert mobj is not None
# No episode in the URL means "download the newest episode".
2233 dlNewest = not mobj.group('episode')
2235 epTitle = mobj.group('showname')
2237 epTitle = mobj.group('episode')
2239 req = urllib2.Request(url)
2240 self.report_extraction(epTitle)
2242 htmlHandle = urllib2.urlopen(req)
2243 html = htmlHandle.read()
2244 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2245 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to a specific episode; re-validate the final URL.
2248 url = htmlHandle.geturl()
2249 mobj = re.match(self._VALID_URL, url)
2251 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2253 if mobj.group('episode') == '':
2254 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2256 epTitle = mobj.group('episode')
# Locate the Flash player URL / mtvnservices URI embedded in the page.
2258 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2259 if len(mMovieParams) == 0:
2260 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirect to get the canonical address.
2263 playerUrl_raw = mMovieParams[0][0]
2264 self.report_player_url(epTitle)
2266 urlHandle = urllib2.urlopen(playerUrl_raw)
2267 playerUrl = urlHandle.geturl()
2268 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2269 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Download the MRSS index listing the episode's media items (acts).
2272 uri = mMovieParams[0][1]
2273 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2274 self.report_index_download(epTitle)
2276 indexXml = urllib2.urlopen(indexUrl).read()
2277 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2278 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2281 idoc = xml.etree.ElementTree.fromstring(indexXml)
2282 itemEls = idoc.findall('.//item')
# One mediaGen config download per <item>; each yields the rendition list.
2283 for itemEl in itemEls:
2284 mediaId = itemEl.findall('./guid')[0].text
2285 shortMediaId = mediaId.split(':')[-1]
2286 showId = mediaId.split(':')[-2].replace('.com', '')
2287 officialTitle = itemEl.findall('./title')[0].text
2288 officialDate = itemEl.findall('./pubDate')[0].text
2290 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2291 urllib.urlencode({'uri': mediaId}))
2292 configReq = urllib2.Request(configUrl)
2293 self.report_config_download(epTitle)
2295 configXml = urllib2.urlopen(configReq).read()
2296 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2297 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2300 cdoc = xml.etree.ElementTree.fromstring(configXml)
2302 for rendition in cdoc.findall('.//rendition'):
2303 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2307 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2310 # For now, just pick the highest bitrate
2311 format,video_url = turls[-1]
2313 self._downloader.increment_downloads()
2315 effTitle = showId + u'-' + epTitle
2320 'upload_date': officialDate,
2322 'stitle': simplify_title(effTitle),
2326 'description': officialTitle,
2327 'player_url': playerUrl
2331 self._downloader.process_info(info)
2332 except UnavailableVideoError, err:
2333 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# NOTE(review): lossy excerpt -- embedded original line numbers jump, so
# "try:" / "if mobj is None:" / "return" lines are not visible here.
2337 class EscapistIE(InfoExtractor):
2338 """Information extractor for The Escapist """
2340 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2341 IE_NAME = u'escapist'
2343 def report_extraction(self, showName):
2344 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2346 def report_config_download(self, showName):
2347 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2349 def _real_extract(self, url):
2350 htmlParser = HTMLParser.HTMLParser()
2352 mobj = re.match(self._VALID_URL, url)
2354 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2356 showName = mobj.group('showname')
2357 videoId = mobj.group('episode')
2359 self.report_extraction(showName)
2361 webPage = urllib2.urlopen(url).read()
2362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2363 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Pull description, thumbnail and player URL from <meta> tags, then derive
# the config URL from the "config=" query parameter of the player URL.
2366 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2367 description = htmlParser.unescape(descMatch.group(1))
2368 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2369 imgUrl = htmlParser.unescape(imgMatch.group(1))
2370 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2371 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
2372 configUrlMatch = re.search('config=(.*)$', playerUrl)
2373 configUrl = urllib2.unquote(configUrlMatch.group(1))
2375 self.report_config_download(showName)
2377 configJSON = urllib2.urlopen(configUrl).read()
2378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2379 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2382 # Technically, it's JavaScript, not JSON
# Single-to-double quote swap makes the JS object literal json-parseable.
2383 configJSON = configJSON.replace("'", '"')
2386 config = json.loads(configJSON)
2387 except (ValueError,), err:
2388 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The media URL is the second playlist entry in the config.
2391 playlist = config['playlist']
2392 videoUrl = playlist[1]['url']
2394 self._downloader.increment_downloads()
2398 'uploader': showName,
2399 'upload_date': None,
2401 'stitle': simplify_title(showName),
2404 'thumbnail': imgUrl,
2405 'description': description,
2406 'player_url': playerUrl,
2410 self._downloader.process_info(info)
2411 except UnavailableVideoError, err:
2412 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# NOTE(review): lossy excerpt -- embedded original line numbers jump, so
# "try:" / "if mobj is None:" / "return" lines are not visible here.
2415 class CollegeHumorIE(InfoExtractor):
2416 """Information extractor for collegehumor.com"""
2418 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2419 IE_NAME = u'collegehumor'
2421 def report_webpage(self, video_id):
2422 """Report information extraction."""
2423 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2425 def report_extraction(self, video_id):
2426 """Report information extraction."""
2427 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2429 def _real_extract(self, url):
2430 htmlParser = HTMLParser.HTMLParser()
2432 mobj = re.match(self._VALID_URL, url)
2434 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2436 video_id = mobj.group('videoid')
2438 self.report_webpage(video_id)
2439 request = urllib2.Request(url)
2441 webpage = urllib2.urlopen(request).read()
2442 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2443 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal id ("video:NNN") needed for the metadata XML.
2446 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2448 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2450 internal_video_id = m.group('internalvideoid')
2454 'internal_id': internal_video_id,
# Fetch the moogaloop metadata XML and fill the info dict from it.
2457 self.report_extraction(video_id)
2458 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2460 metaXml = urllib2.urlopen(xmlUrl).read()
2461 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2462 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2465 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2467 videoNode = mdoc.findall('./video')[0]
2468 info['description'] = videoNode.findall('./description')[0].text
2469 info['title'] = videoNode.findall('./caption')[0].text
2470 info['stitle'] = simplify_title(info['title'])
2471 info['url'] = videoNode.findall('./file')[0].text
2472 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is taken from the file URL; format mirrors the extension.
2473 info['ext'] = info['url'].rpartition('.')[2]
2474 info['format'] = info['ext']
2476 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2479 self._downloader.increment_downloads()
2482 self._downloader.process_info(info)
2483 except UnavailableVideoError, err:
2484 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): lossy excerpt -- embedded original line numbers jump, so
# "try:" / "if mobj is None:" / "return" lines are not visible here.
2487 class XVideosIE(InfoExtractor):
2488 """Information extractor for xvideos.com"""
2490 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2491 IE_NAME = u'xvideos'
2493 def report_webpage(self, video_id):
2494 """Report information extraction."""
2495 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2497 def report_extraction(self, video_id):
2498 """Report information extraction."""
2499 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2501 def _real_extract(self, url):
2502 htmlParser = HTMLParser.HTMLParser()
2504 mobj = re.match(self._VALID_URL, url)
2506 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2508 video_id = mobj.group(1).decode('utf-8')
2510 self.report_webpage(video_id)
2512 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2514 webpage = urllib2.urlopen(request).read()
2515 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2516 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2519 self.report_extraction(video_id)
# Media URL is URL-encoded in the "flv_url" query parameter.
2523 mobj = re.search(r'flv_url=(.+?)&', webpage)
2525 self._downloader.trouble(u'ERROR: unable to extract video url')
2527 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title: <title> element up to the " - XVID" suffix.
2531 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2533 self._downloader.trouble(u'ERROR: unable to extract video title')
2535 video_title = mobj.group(1).decode('utf-8')
2538 # Extract video thumbnail
2539 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2541 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2543 video_thumbnail = mobj.group(1).decode('utf-8')
2547 self._downloader.increment_downloads()
2552 'upload_date': None,
2553 'title': video_title,
2554 'stitle': simplify_title(video_title),
2557 'thumbnail': video_thumbnail,
2558 'description': None,
2563 self._downloader.process_info(info)
2564 except UnavailableVideoError, err:
2565 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# NOTE(review): lossy excerpt -- embedded original line numbers jump, so
# "try:" / "if mobj is None:" / "return" lines are not visible here.
2568 class SoundcloudIE(InfoExtractor):
2569 """Information extractor for soundcloud.com
2570 To access the media, the uid of the song and a stream token
2571 must be extracted from the page source and the script must make
2572 a request to media.soundcloud.com/crossdomain.xml. Then
2573 the media can be grabbed by requesting from an url composed
2574 of the stream token and uid
# URL shape: soundcloud.com/<uploader>/<slug-title>
2577 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2578 IE_NAME = u'soundcloud'
2580 def __init__(self, downloader=None):
2581 InfoExtractor.__init__(self, downloader)
2583 def report_webpage(self, video_id):
2584 """Report information extraction."""
2585 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2587 def report_extraction(self, video_id):
2588 """Report information extraction."""
2589 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2591 def _real_extract(self, url):
2592 htmlParser = HTMLParser.HTMLParser()
2594 mobj = re.match(self._VALID_URL, url)
2596 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2599 # extract uploader (which is in the url)
2600 uploader = mobj.group(1).decode('utf-8')
2601 # extract simple title (uploader + slug of song title)
2602 slug_title = mobj.group(2).decode('utf-8')
2603 simple_title = uploader + '-' + slug_title
2605 self.report_webpage('%s/%s' % (uploader, slug_title))
2607 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2609 webpage = urllib2.urlopen(request).read()
2610 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2611 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2614 self.report_extraction('%s/%s' % (uploader, slug_title))
2616 # extract uid and stream token that soundcloud hands out for access
2617 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2619 video_id = mobj.group(1)
2620 stream_token = mobj.group(2)
2622 # extract unsimplified title
2623 mobj = re.search('"title":"(.*?)",', webpage)
2625 title = mobj.group(1)
# NOTE(review): the extracted "title" above is never used in the visible
# process_info call below -- 'title' is set to simple_title instead.
# Looks unintentional; confirm before relying on the emitted title.
2627 # construct media url (with uid/token)
2628 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2629 mediaURL = mediaURL % (video_id, stream_token)
2632 description = u'No description available'
2633 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2635 description = mobj.group(1)
# Upload date: parse the "pretty-date" text into YYYYMMDD.
2639 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2642 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2643 except Exception, e:
2646 # for soundcloud, a request to a cross domain is required for cookies
2647 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2650 self._downloader.process_info({
2651 'id': video_id.decode('utf-8'),
2653 'uploader': uploader.decode('utf-8'),
2654 'upload_date': upload_date,
2655 'title': simple_title.decode('utf-8'),
2656 'stitle': simple_title.decode('utf-8'),
2660 'description': description.decode('utf-8')
2662 except UnavailableVideoError:
2663 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): lossy excerpt -- embedded original line numbers jump, so
# "try:" / "if mobj is None:" / "return" lines are not visible here.
2666 class InfoQIE(InfoExtractor):
2667 """Information extractor for infoq.com"""
2669 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2672 def report_webpage(self, video_id):
2673 """Report information extraction."""
2674 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2676 def report_extraction(self, video_id):
2677 """Report information extraction."""
2678 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2680 def _real_extract(self, url):
2681 htmlParser = HTMLParser.HTMLParser()
2683 mobj = re.match(self._VALID_URL, url)
2685 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2688 self.report_webpage(url)
2690 request = urllib2.Request(url)
2692 webpage = urllib2.urlopen(request).read()
2693 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2694 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2697 self.report_extraction(url)
# The RTMP path is base64-encoded in the page's jsclassref attribute.
2701 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2703 self._downloader.trouble(u'ERROR: unable to extract video url')
2705 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2709 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2711 self._downloader.trouble(u'ERROR: unable to extract video title')
2713 video_title = mobj.group(1).decode('utf-8')
2715 # Extract description
2716 video_description = u'No description available.'
2717 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2718 if mobj is not None:
2719 video_description = mobj.group(1).decode('utf-8')
# id and extension are derived from the last path component of the RTMP URL.
2721 video_filename = video_url.split('/')[-1]
2722 video_id, extension = video_filename.split('.')
2724 self._downloader.increment_downloads()
2729 'upload_date': None,
2730 'title': video_title,
2731 'stitle': simplify_title(video_title),
2733 'format': extension, # Extension is always(?) mp4, but seems to be flv
2735 'description': video_description,
2740 self._downloader.process_info(info)
2741 except UnavailableVideoError, err:
2742 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# NOTE(review): lossy excerpt -- embedded original line numbers jump, so
# "try:" / "if mobj is None:" / "return" lines are not visible here.
2744 class MixcloudIE(InfoExtractor):
2745 """Information extractor for www.mixcloud.com"""
2746 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2747 IE_NAME = u'mixcloud'
2749 def __init__(self, downloader=None):
2750 InfoExtractor.__init__(self, downloader)
2752 def report_download_json(self, file_id):
2753 """Report JSON download."""
2754 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2756 def report_extraction(self, file_id):
2757 """Report information extraction."""
2758 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2760 def get_urls(self, jsonData, fmt, bitrate='best'):
2761 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} mapping or a plain url list;
# the TypeError fallback handles the no-bitrate-info case.
2764 bitrate_list = jsonData[fmt]
2765 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2766 bitrate = max(bitrate_list) # select highest
2768 url_list = jsonData[fmt][bitrate]
2769 except TypeError: # we have no bitrate info.
2770 url_list = jsonData[fmt]
2774 def check_urls(self, url_list):
2775 """Returns 1st active url from list"""
# Probes each candidate with a GET; network failures mean "try the next".
2776 for url in url_list:
2778 urllib2.urlopen(url)
2780 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2785 def _print_formats(self, formats):
2786 print 'Available formats:'
2787 for fmt in formats.keys():
2788 for b in formats[fmt]:
2790 ext = formats[fmt][b][0]
2791 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2792 except TypeError: # we have no bitrate info
2793 ext = formats[fmt][0]
2794 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2797 def _real_extract(self, url):
2798 mobj = re.match(self._VALID_URL, url)
2800 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2802 # extract uploader & filename from url
2803 uploader = mobj.group(1).decode('utf-8')
2804 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2806 # construct API request
2807 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2808 # retrieve .json file with links to files
2809 request = urllib2.Request(file_url)
2811 self.report_download_json(file_url)
2812 jsonData = urllib2.urlopen(request).read()
2813 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2814 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
# Parse the API response; audio_formats drives the format selection below.
2818 json_data = json.loads(jsonData)
2819 player_url = json_data['player_swf_url']
2820 formats = dict(json_data['audio_formats'])
2822 req_format = self._downloader.params.get('format', None)
2825 if self._downloader.params.get('listformats', None):
2826 self._print_formats(formats)
# None/'best': first format whose URL list yields a live URL; otherwise the
# specific requested format must exist in the map.
2829 if req_format is None or req_format == 'best':
2830 for format_param in formats.keys():
2831 url_list = self.get_urls(formats, format_param)
2833 file_url = self.check_urls(url_list)
2834 if file_url is not None:
2837 if req_format not in formats.keys():
2838 self._downloader.trouble(u'ERROR: format is not available')
2841 url_list = self.get_urls(formats, req_format)
2842 file_url = self.check_urls(url_list)
2843 format_param = req_format
2846 self._downloader.increment_downloads()
2848 # Process file information
2849 self._downloader.process_info({
2850 'id': file_id.decode('utf-8'),
2851 'url': file_url.decode('utf-8'),
2852 'uploader': uploader.decode('utf-8'),
2853 'upload_date': u'NA',
2854 'title': json_data['name'],
2855 'stitle': simplify_title(json_data['name']),
2856 'ext': file_url.split('.')[-1].decode('utf-8'),
2857 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2858 'thumbnail': json_data['thumbnail_url'],
2859 'description': json_data['description'],
2860 'player_url': player_url.decode('utf-8'),
2862 except UnavailableVideoError, err:
2863 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): lossy excerpt -- embedded original line numbers jump, so
# "try:" / "if mobj is None:" / "return" lines are not visible here.
2865 class StanfordOpenClassroomIE(InfoExtractor):
2866 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific VideoPage (course+video), a CoursePage
# (course only), and the site root/HomePage.
2868 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2869 IE_NAME = u'stanfordoc'
2871 def report_download_webpage(self, objid):
2872 """Report information extraction."""
2873 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2875 def report_extraction(self, video_id):
2876 """Report information extraction."""
2877 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2879 def _real_extract(self, url):
2880 mobj = re.match(self._VALID_URL, url)
2882 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Branch 1: a specific video -- fetch its XML descriptor and download.
2885 if mobj.group('course') and mobj.group('video'): # A specific video
2886 course = mobj.group('course')
2887 video = mobj.group('video')
2889 'id': simplify_title(course + '_' + video),
2892 self.report_extraction(info['id'])
2893 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2894 xmlUrl = baseUrl + video + '.xml'
2896 metaXml = urllib2.urlopen(xmlUrl).read()
2897 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2898 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2900 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2902 info['title'] = mdoc.findall('./title')[0].text
2903 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2905 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2907 info['stitle'] = simplify_title(info['title'])
2908 info['ext'] = info['url'].rpartition('.')[2]
2909 info['format'] = info['ext']
2910 self._downloader.increment_downloads()
2912 self._downloader.process_info(info)
2913 except UnavailableVideoError, err:
2914 self._downloader.trouble(u'\nERROR: unable to download video')
# Branch 2: a course page -- collect VideoPage links and recurse via
# self.extract on each reference entry.
2915 elif mobj.group('course'): # A course page
2916 unescapeHTML = HTMLParser.HTMLParser().unescape
2918 course = mobj.group('course')
2920 'id': simplify_title(course),
2924 self.report_download_webpage(info['id'])
2926 coursepage = urllib2.urlopen(url).read()
2927 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2928 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2931 m = re.search('<h1>([^<]+)</h1>', coursepage)
2933 info['title'] = unescapeHTML(m.group(1))
2935 info['title'] = info['id']
2936 info['stitle'] = simplify_title(info['title'])
2938 m = re.search('<description>([^<]+)</description>', coursepage)
2940 info['description'] = unescapeHTML(m.group(1))
2942 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2945 'type': 'reference',
2946 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2950 for entry in info['list']:
2951 assert entry['type'] == 'reference'
2952 self.extract(entry['url'])
# Branch 3: the site root -- collect CoursePage links and recurse likewise.
2954 unescapeHTML = HTMLParser.HTMLParser().unescape
2957 'id': 'Stanford OpenClassroom',
2961 self.report_download_webpage(info['id'])
2962 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2964 rootpage = urllib2.urlopen(rootURL).read()
2965 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2966 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2969 info['title'] = info['id']
2970 info['stitle'] = simplify_title(info['title'])
2972 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2975 'type': 'reference',
2976 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2980 for entry in info['list']:
2981 assert entry['type'] == 'reference'
2982 self.extract(entry['url'])
# NOTE(review): this span was damaged in extraction -- the original file's
# own line numbers (2984, 2985, ...) are fused onto the front of every
# line, leading indentation is lost, and several structural lines are
# elided (the `try:` lines paired with the `except` clauses below, the
# `if mobj is None:` guards implied by each trouble() call, `return`
# statements, and the opening/closing of the `info` dict literal).
# Comments below annotate only what the visible tokens show; restore the
# code from a pristine copy of the file before making behavioral edits.
#
# MTVIE: InfoExtractor subclass that scrapes video metadata from mtv.com
# video pages (Python 2 era code: urllib2, `except X, err:` syntax).
2984 class MTVIE(InfoExtractor):
2985 """Information extractor for MTV.com"""
# URL pattern: optional http/https scheme (captured as 'proto'), optional
# "www.", then mtv.com/videos/<slug>/<numeric id>/<slug>; the numeric id
# is captured as group 'videoid'.
2987 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2990 def report_webpage(self, video_id):
2991 """Report information extraction."""
# Progress line: "[<IE name>] <id>: Downloading webpage".
2992 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2994 def report_extraction(self, video_id):
2995 """Report information extraction."""
# Progress line: "[<IE name>] <id>: Extracting information".
2996 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2998 def _real_extract(self, url):
2999 mobj = re.match(self._VALID_URL, url)
# (elided guard) reached only when the URL failed to match _VALID_URL;
# the `if mobj is None:` / `return` lines are missing from this extract.
3001 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# No scheme captured by the regex: default to plain http for the fetch.
3003 if not mobj.group('proto'):
3004 url = 'http://' + url
3005 video_id = mobj.group('videoid')
3006 self.report_webpage(video_id)
3008 request = urllib2.Request(url)
# (elided `try:`) download the video page; network errors are reported
# via trouble() in the except clause below.
3010 webpage = urllib2.urlopen(request).read()
3011 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3012 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Song name from the mtv_vt <meta> tag; the raw bytes are decoded as
# ISO-8859-1 before HTML-unescaping.
3015 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3017 self._downloader.trouble(u'ERROR: unable to extract song name')
3019 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
# Performer name from the mtv_an <meta> tag, decoded the same way.
3020 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3022 self._downloader.trouble(u'ERROR: unable to extract performer')
3024 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
# Display title is "<performer> - <song>".
3025 video_title = performer + ' - ' + song_name
# mtvn_uri <meta> tag: MTV's internal URI for this video, used to build
# the mediaGen request below.
3027 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): defect -- this message is missing the word "extract"
# ("unable to mtvn_uri"); fix in the pristine source, not here.
3029 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3031 mtvn_uri = mobj.group(1)
# Numeric content id scraped from the player's default-playlist script.
3033 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3035 self._downloader.trouble(u'ERROR: unable to extract content id')
3037 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing the available
# renditions (format/size/bitrate variants) of this video.
3039 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3040 self.report_extraction(video_id)
3041 request = urllib2.Request(videogen_url)
# (elided `try:`) download the rendition metadata XML.
3043 metadataXml = urllib2.urlopen(request).read()
3044 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3045 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3048 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3049 renditions = mdoc.findall('.//rendition')
3051 # For now, always pick the highest quality.
3052 rendition = renditions[-1]
# (elided `try:`) read the MIME subtype, dimensions, bitrate and <src>
# URL from the chosen rendition; the paired except presumably lands on
# the 'Invalid rendition field.' trouble() below -- confirm in the
# pristine source.
3055 _,_,ext = rendition.attrib['type'].partition('/')
3056 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3057 video_url = rendition.find('./src').text
3059 self._downloader.trouble('Invalid rendition field.')
3062 self._downloader.increment_downloads()
# (elided dict literal) only these keys of the `info` dict survive the
# extraction; the 'id'/'url'/'ext'/'format' entries and the surrounding
# braces are lost.
3066 'uploader': performer,
3067 'title': video_title,
3068 'stitle': simplify_title(video_title),
# Hand the assembled info dict to the FileDownloader; download failures
# surface as UnavailableVideoError (the `try:` line is elided).
3074 self._downloader.process_info(info)
3075 except UnavailableVideoError, err:
3076 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)