2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
# NOTE(review): this listing is an elided excerpt. Original source line numbers
# are fused into each line and several lines are missing (e.g. the
# `def initialize(self):` header that the docstring at source line 72 belongs
# to, and the trailing `pass` bodies of the two _real_* stubs). Comments below
# annotate intent only; no code tokens were changed.
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information from the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title and simplified
32 title, author and others. The information is stored in a dictionary
33 which is then passed to the FileDownloader. The FileDownloader
34 processes this information possibly downloading the video to the file
35 system, among other possible outcomes. The dictionaries must include
40 uploader: Nickname of the video uploader.
42 ext: Video filename extension.
44 player_url: SWF Player URL (may be None).
46 The following fields are optional. Their primary purpose is to allow
47 youtube-dl to serve as the backend for a video search function, such
48 as the one in youtube2mp3. They are only used when their respective
49 forced printing functions are called:
51 thumbnail: Full URL to a video thumbnail image.
52 description: One-line video description.
54 Subclasses of this one should re-define the _real_initialize() and
55 _real_extract() methods and define a _VALID_URL regexp.
56 Probably, they should also be added to the list of extractors.
# Downloader is optional at construction time; set_downloader() can be
# called later to (re)attach one.
62 def __init__(self, downloader=None):
63 """Constructor. Receives an optional downloader."""
65 self.set_downloader(downloader)
# Template-method pattern: suitable()/extract() are the public entry points;
# subclasses override only _real_initialize()/_real_extract().
67 def suitable(self, url):
68 """Receives a URL and returns True if suitable for this IE."""
69 return re.match(self._VALID_URL, url) is not None
# The `def initialize(self):` header for the next two lines is elided from
# this view; it performs one-time setup (authentication, etc.) -- presumably
# guarded by an elided `_ready` flag as in the surrounding project.
72 """Initializes an instance (authentication, etc)."""
74 self._real_initialize()
77 def extract(self, url):
78 """Extracts URL information and returns it in list of dicts."""
80 return self._real_extract(url)
82 def set_downloader(self, downloader):
83 """Sets the downloader for this IE."""
84 self._downloader = downloader
# Stub hooks: bodies (likely `pass`) are elided from this view.
86 def _real_initialize(self):
87 """Real initialization process. Redefine in subclasses."""
90 def _real_extract(self, url):
91 """Real extraction process. Redefine in subclasses."""
# NOTE(review): elided excerpt -- original line numbers are fused into the
# content and many statements (returns after trouble(), try:/else: lines,
# variable initializations) are missing from this view. Comments annotate
# intent; code tokens are unchanged.
95 class YoutubeIE(InfoExtractor):
96 """Information extractor for youtube.com."""
# _VALID_URL: group(1) matches the site prefix, group(2) the video id; the
# conditional `(?(1).+)?` only allows trailing junk when a prefix matched.
98 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
99 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
100 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
101 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
102 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
103 _NETRC_MACHINE = 'youtube'
104 # Listed in order of quality
# Format codes are YouTube "itag" values (strings, not ints).
105 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
106 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map (entries elided from this view except '38').
107 _video_extensions = {
113 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display string map (entries elided from this view).
119 _video_dimensions = {
# report_*: thin status-line helpers; all output goes through the downloader.
137 def report_lang(self):
138 """Report attempt to set language."""
139 self._downloader.to_screen(u'[youtube] Setting language')
141 def report_login(self):
142 """Report attempt to log in."""
143 self._downloader.to_screen(u'[youtube] Logging in')
145 def report_age_confirmation(self):
146 """Report attempt to confirm age."""
147 self._downloader.to_screen(u'[youtube] Confirming age')
149 def report_video_webpage_download(self, video_id):
150 """Report attempt to download video webpage."""
151 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
153 def report_video_info_webpage_download(self, video_id):
154 """Report attempt to download video info webpage."""
155 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
157 def report_video_subtitles_download(self, video_id):
158 """Report attempt to download video subtitles."""
159 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
161 def report_information_extraction(self, video_id):
162 """Report attempt to extract video information."""
163 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
165 def report_unavailable_format(self, video_id, format):
166 """Report extracted video URL."""
167 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
169 def report_rtmp_download(self):
170 """Indicate the download will use the RTMP protocol."""
171 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's timedtext XML into SubRip (.srt) text.
# NOTE(review): `start = float(start)` and the `srt = ''` accumulator
# initialization are elided from this view -- the arithmetic on `start` at
# source line 181 and the `srt +=` at 186 rely on those missing lines.
173 def _closed_captions_xml_to_srt(self, xml_string):
175 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
176 # TODO parse xml instead of regex
177 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# Captions without an explicit duration are shown for 4 seconds.
178 if not dur: dur = '4'
180 end = start + float(dur)
# SRT timestamp format: HH:MM:SS,mmm
181 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
182 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
183 caption = unescapeHTML(caption)
184 caption = unescapeHTML(caption) # double cycle, intentional
186 srt += start + ' --> ' + end + '\n'
187 srt += caption + '\n\n'
# Prints the itag / extension / dimensions table for --list-formats.
190 def _print_formats(self, formats):
191 print 'Available formats:'
193 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# One-time setup: pick credentials (CLI flags, then .netrc), force the
# English site language, log in, and confirm age. Each network step warns
# (or errors for age confirmation) and the `return` statements after the
# failure branches are elided from this view.
195 def _real_initialize(self):
196 if self._downloader is None:
201 downloader_params = self._downloader.params
203 # Attempt to use provided username and password or .netrc data
204 if downloader_params.get('username', None) is not None:
205 username = downloader_params['username']
206 password = downloader_params['password']
207 elif downloader_params.get('usenetrc', False):
209 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
214 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
215 except (IOError, netrc.NetrcParseError), err:
216 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set site language to English so later regexes against page text match.
220 request = urllib2.Request(self._LANG_URL)
223 urllib2.urlopen(request).read()
224 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
225 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
228 # No authentication to be performed
# Login form fields mimic the signup-page login form.
234 'current_form': 'loginForm',
236 'action_login': 'Log In',
237 'username': username,
238 'password': password,
240 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
243 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
244 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
245 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
248 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Age confirmation is a hard requirement -- failure is an ERROR, not WARNING.
254 'action_confirm': 'Confirm',
256 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
258 self.report_age_confirmation()
259 age_results = urllib2.urlopen(request).read()
260 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
261 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main extraction: resolve redirect URLs, fetch the watch page and
# get_video_info, then pick formats and emit one info dict per format.
# NOTE(review): the `return` statements after each trouble() call are elided
# from this view, as are several `if mobj is None:` guards.
264 def _real_extract(self, url):
265 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
266 mobj = re.search(self._NEXT_URL_RE, url)
268 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
270 # Extract video id from URL
271 mobj = re.match(self._VALID_URL, url)
273 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
275 video_id = mobj.group(2)
# has_verified=1 skips the interstitial age gate on the watch page.
278 self.report_video_webpage_download(video_id)
279 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
281 video_webpage = urllib2.urlopen(request).read()
282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
283 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
286 # Attempt to extract SWF player URL
# The URL appears JS-escaped (http:\/\/...); the re.sub strips backslashes.
287 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
289 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several `el` variants of get_video_info until one returns a token.
294 self.report_video_info_webpage_download(video_id)
295 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
296 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
297 % (video_id, el_type))
298 request = urllib2.Request(video_info_url)
300 video_info_webpage = urllib2.urlopen(request).read()
301 video_info = parse_qs(video_info_webpage)
# A `break` after this test is elided from this view.
302 if 'token' in video_info:
304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
305 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
307 if 'token' not in video_info:
308 if 'reason' in video_info:
309 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
311 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
314 # Start extracting information
315 self.report_information_extraction(video_id)
# uploader (mandatory)
318 if 'author' not in video_info:
319 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
321 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title (mandatory)
324 if 'title' not in video_info:
325 self._downloader.trouble(u'ERROR: unable to extract video title')
327 video_title = urllib.unquote_plus(video_info['title'][0])
328 video_title = video_title.decode('utf-8')
# thumbnail (optional -- only a WARNING when missing)
331 if 'thumbnail_url' not in video_info:
332 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
334 else: # don't panic if we can't find it
335 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the page, normalized to YYYYMMDD by trying
# several locale-independent date formats.
339 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
341 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
342 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
343 for expression in format_expressions:
345 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: HTML element on the watch page, cleaned to plain text.
350 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
351 if video_description: video_description = clean_html(video_description)
352 else: video_description = ''
# closed captions: list available languages, pick --srt-lang / 'en' / first,
# then fetch and convert to SRT. Failures raise Trouble and are downgraded
# to warnings by the handler below.
355 video_subtitles = None
356 if self._downloader.params.get('writesubtitles', False):
358 self.report_video_subtitles_download(video_id)
359 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
361 srt_list = urllib2.urlopen(request).read()
362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
363 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
364 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
365 if not srt_lang_list:
366 raise Trouble(u'WARNING: video has no closed captions')
367 if self._downloader.params.get('subtitleslang', False):
368 srt_lang = self._downloader.params.get('subtitleslang')
369 elif 'en' in srt_lang_list:
372 srt_lang = srt_lang_list[0]
373 if not srt_lang in srt_lang_list:
374 raise Trouble(u'WARNING: no closed captions found in the specified language')
375 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
377 srt_xml = urllib2.urlopen(request).read()
378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
379 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
380 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
381 except Trouble as trouble:
382 self._downloader.trouble(trouble[0])
# token (mandatory for the download URL)
385 video_token = urllib.unquote_plus(video_info['token'][0])
387 # Decide which formats to download
388 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn'; no itag applies (format None).
390 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
391 self.report_rtmp_download()
392 video_url_list = [(None, video_info['conn'][0])]
393 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# stream map: comma-separated list of querystring-encoded entries.
394 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
395 url_data = [parse_qs(uds) for uds in url_data_strs]
396 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
397 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# Apply --max-quality cap, then intersect the quality-ordered list with
# what the server actually offers.
399 format_limit = self._downloader.params.get('format_limit', None)
400 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
401 if format_limit is not None and format_limit in available_formats:
402 format_list = available_formats[available_formats.index(format_limit):]
404 format_list = available_formats
405 existing_formats = [x for x in format_list if x in url_map]
406 if len(existing_formats) == 0:
407 self._downloader.trouble(u'ERROR: no known formats available for video')
409 if self._downloader.params.get('listformats', None):
410 self._print_formats(existing_formats)
412 if req_format is None or req_format == 'best':
413 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
414 elif req_format == 'worst':
415 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
416 elif req_format in ('-1', 'all'):
417 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
419 # Specific formats. We pick the first in a slash-delimeted sequence.
420 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
421 req_formats = req_format.split('/')
422 video_url_list = None
423 for rf in req_formats:
425 video_url_list = [(rf, url_map[rf])]
427 if video_url_list is None:
428 self._downloader.trouble(u'ERROR: requested format not available')
431 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one FileDownloader info dict per selected format; the surrounding
# `results = []` / append / return lines are elided from this view.
435 for format_param, video_real_url in video_url_list:
437 video_extension = self._video_extensions.get(format_param, 'flv')
440 'id': video_id.decode('utf-8'),
441 'url': video_real_url.decode('utf-8'),
442 'uploader': video_uploader.decode('utf-8'),
443 'upload_date': upload_date,
444 'title': video_title,
445 'ext': video_extension.decode('utf-8'),
446 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
447 'thumbnail': video_thumbnail.decode('utf-8'),
448 'description': video_description,
449 'player_url': player_url,
450 'subtitles': video_subtitles
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and several `if mobj is None:` guards are missing from
# this view. Comments annotate intent; code tokens are unchanged.
455 class MetacafeIE(InfoExtractor):
456 """Information Extractor for metacafe.com."""
458 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
459 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
460 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
461 IE_NAME = u'metacafe'
463 def __init__(self, downloader=None):
464 InfoExtractor.__init__(self, downloader)
# report_*: thin status-line helpers.
466 def report_disclaimer(self):
467 """Report disclaimer retrieval."""
468 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
470 def report_age_confirmation(self):
471 """Report attempt to confirm age."""
472 self._downloader.to_screen(u'[metacafe] Confirming age')
474 def report_download_webpage(self, video_id):
475 """Report webpage download."""
476 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
478 def report_extraction(self, video_id):
479 """Report information extraction."""
480 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Setup: fetch the family-filter disclaimer page (presumably to obtain
# session cookies), then POST the "over 18" confirmation.
482 def _real_initialize(self):
483 # Retrieve disclaimer
484 request = urllib2.Request(self._DISCLAIMER)
486 self.report_disclaimer()
487 disclaimer = urllib2.urlopen(request).read()
488 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
489 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# The rest of the disclaimer_form dict literal is elided from this view.
495 'submit': "Continue - I'm over 18",
497 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
499 self.report_age_confirmation()
500 disclaimer = urllib2.urlopen(request).read()
501 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
502 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
505 def _real_extract(self, url):
506 # Extract id and simplified title from URL
507 mobj = re.match(self._VALID_URL, url)
509 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
512 video_id = mobj.group(1)
514 # Check if video comes from YouTube
# Metacafe mirrors YouTube videos under a "yt-" id prefix; delegate those
# back to the downloader (which dispatches to YoutubeIE).
515 mobj2 = re.match(r'^yt-(.*)$', video_id)
516 if mobj2 is not None:
517 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
520 # Retrieve video webpage to extract further information
521 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
523 self.report_download_webpage(video_id)
524 webpage = urllib2.urlopen(request).read()
525 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
526 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
529 # Extract URL, uploader and title from webpage
530 self.report_extraction(video_id)
# Primary path: direct mediaURL (+ optional gdaKey auth token). Fallback
# path below parses the flashvars' mediaData JSON-ish blob instead.
531 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
533 mediaURL = urllib.unquote(mobj.group(1))
# Extension is assumed to be the last 3 chars of the URL (e.g. 'flv').
534 video_extension = mediaURL[-3:]
536 # Extract gdaKey if available
537 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
541 gdaKey = mobj.group(1)
542 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback: flashvars -> mediaData -> mediaURL/key.
544 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
546 self._downloader.trouble(u'ERROR: unable to extract media URL')
548 vardict = parse_qs(mobj.group(1))
549 if 'mediaData' not in vardict:
550 self._downloader.trouble(u'ERROR: unable to extract media URL')
552 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
554 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape JS-escaped slashes in the JSON-style URL.
556 mediaURL = mobj.group(1).replace('\\/', '/')
557 video_extension = mediaURL[-3:]
558 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
560 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
562 self._downloader.trouble(u'ERROR: unable to extract title')
564 video_title = mobj.group(1).decode('utf-8')
566 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
568 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
570 video_uploader = mobj.group(1)
# Returned info dict (surrounding `return [{...}]` brackets elided).
573 'id': video_id.decode('utf-8'),
574 'url': video_url.decode('utf-8'),
575 'uploader': video_uploader.decode('utf-8'),
576 'upload_date': u'NA',
577 'title': video_title,
578 'ext': video_extension.decode('utf-8'),
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and `if mobj is None:` guards are missing from this view.
# Comments annotate intent; code tokens are unchanged.
584 class DailymotionIE(InfoExtractor):
585 """Information Extractor for Dailymotion"""
587 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
588 IE_NAME = u'dailymotion'
590 def __init__(self, downloader=None):
591 InfoExtractor.__init__(self, downloader)
593 def report_download_webpage(self, video_id):
594 """Report webpage download."""
595 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
597 def report_extraction(self, video_id):
598 """Report information extraction."""
599 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
601 def _real_extract(self, url):
602 # Extract id and simplified title from URL
603 mobj = re.match(self._VALID_URL, url)
605 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
608 video_id = mobj.group(1)
# Dailymotion streams are assumed to be FLV.
610 video_extension = 'flv'
612 # Retrieve video webpage to extract further information
613 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages are served in full.
614 request.add_header('Cookie', 'family_filter=off')
616 self.report_download_webpage(video_id)
617 webpage = urllib2.urlopen(request).read()
618 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
619 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
622 # Extract URL, uploader and title from webpage
623 self.report_extraction(video_id)
# The player's "sequence" flashvar holds the stream metadata; the SD
# stream URL ("sdURL") is pulled out of it below.
624 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
626 self._downloader.trouble(u'ERROR: unable to extract media URL')
628 sequence = urllib.unquote(mobj.group(1))
629 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
631 self._downloader.trouble(u'ERROR: unable to extract media URL')
633 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
635 # if needed add http://www.dailymotion.com/ if relative URL
# (The `video_url = mediaURL` assignment is elided from this view.)
639 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
641 self._downloader.trouble(u'ERROR: unable to extract title')
643 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
645 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
647 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
649 video_uploader = mobj.group(1)
# Returned info dict (surrounding `return [{...}]` brackets elided).
652 'id': video_id.decode('utf-8'),
653 'url': video_url.decode('utf-8'),
654 'uploader': video_uploader.decode('utf-8'),
655 'upload_date': u'NA',
656 'title': video_title,
657 'ext': video_extension.decode('utf-8'),
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and `if mobj is None:` guards are missing from this view.
# Comments annotate intent; code tokens are unchanged.
663 class GoogleIE(InfoExtractor):
664 """Information extractor for video.google.com."""
666 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
667 IE_NAME = u'video.google'
669 def __init__(self, downloader=None):
670 InfoExtractor.__init__(self, downloader)
672 def report_download_webpage(self, video_id):
673 """Report webpage download."""
674 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
676 def report_extraction(self, video_id):
677 """Report information extraction."""
678 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
680 def _real_extract(self, url):
681 # Extract id from URL
682 mobj = re.match(self._VALID_URL, url)
684 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
687 video_id = mobj.group(1)
689 video_extension = 'mp4'
691 # Retrieve video webpage to extract further information
692 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
694 self.report_download_webpage(video_id)
695 webpage = urllib2.urlopen(request).read()
696 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
697 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
700 # Extract URL, uploader, and title from webpage
701 self.report_extraction(video_id)
# Preferred path: direct mp4 download_url. Fallback: the hex-escaped FLV
# videoUrl embedded in the page JS (\x3d -> '=', \x26 -> '&').
702 mobj = re.search(r"download_url:'([^']+)'", webpage)
704 video_extension = 'flv'
705 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
707 self._downloader.trouble(u'ERROR: unable to extract media URL')
709 mediaURL = urllib.unquote(mobj.group(1))
710 mediaURL = mediaURL.replace('\\x3d', '\x3d')
711 mediaURL = mediaURL.replace('\\x26', '\x26')
# (The `video_url = mediaURL` assignment is elided from this view.)
715 mobj = re.search(r'<title>(.*)</title>', webpage)
717 self._downloader.trouble(u'ERROR: unable to extract title')
719 video_title = mobj.group(1).decode('utf-8')
721 # Extract video description
722 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
724 self._downloader.trouble(u'ERROR: unable to extract video description')
726 video_description = mobj.group(1).decode('utf-8')
727 if not video_description:
728 video_description = 'No description available.'
730 # Extract video thumbnail
# Thumbnail requires a second request (a video-search page), so it is only
# fetched when --get-thumbnail forces it.
731 if self._downloader.params.get('forcethumbnail', False):
732 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
734 webpage = urllib2.urlopen(request).read()
735 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
736 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
738 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
740 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
742 video_thumbnail = mobj.group(1)
743 else: # we need something to pass to process_info
# (The placeholder `video_thumbnail = ''` assignment is elided from view.)
# Returned info dict (surrounding `return [{...}]` brackets elided).
747 'id': video_id.decode('utf-8'),
748 'url': video_url.decode('utf-8'),
750 'upload_date': u'NA',
751 'title': video_title,
752 'ext': video_extension.decode('utf-8'),
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and `if mobj is None:` guards are missing from this view.
# Comments annotate intent; code tokens are unchanged.
758 class PhotobucketIE(InfoExtractor):
759 """Information extractor for photobucket.com."""
# Only .flv media linked via the `current=` query parameter is supported.
761 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
762 IE_NAME = u'photobucket'
764 def __init__(self, downloader=None):
765 InfoExtractor.__init__(self, downloader)
767 def report_download_webpage(self, video_id):
768 """Report webpage download."""
769 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
771 def report_extraction(self, video_id):
772 """Report information extraction."""
773 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
775 def _real_extract(self, url):
776 # Extract id from URL
777 mobj = re.match(self._VALID_URL, url)
779 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# The "id" here is the .flv filename captured from the URL.
782 video_id = mobj.group(1)
784 video_extension = 'flv'
786 # Retrieve video webpage to extract further information
787 request = urllib2.Request(url)
789 self.report_download_webpage(video_id)
790 webpage = urllib2.urlopen(request).read()
791 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
792 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
795 # Extract URL, uploader, and title from webpage
796 self.report_extraction(video_id)
797 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
799 self._downloader.trouble(u'ERROR: unable to extract media URL')
801 mediaURL = urllib.unquote(mobj.group(1))
# (The `video_url = mediaURL` assignment is elided from this view.)
# Title and uploader both come from one <title> pattern: "X video by Y".
805 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
807 self._downloader.trouble(u'ERROR: unable to extract title')
809 video_title = mobj.group(1).decode('utf-8')
811 video_uploader = mobj.group(2).decode('utf-8')
# Returned info dict (surrounding `return [{...}]` brackets elided).
814 'id': video_id.decode('utf-8'),
815 'url': video_url.decode('utf-8'),
816 'uploader': video_uploader,
817 'upload_date': u'NA',
818 'title': video_title,
819 'ext': video_extension.decode('utf-8'),
# NOTE(review): elided excerpt -- `try:` lines, `return` statements after
# trouble() calls and `if mobj is None:` guards are missing from this view.
# Comments annotate intent; code tokens are unchanged.
825 class YahooIE(InfoExtractor):
826 """Information extractor for video.yahoo.com."""
828 # _VALID_URL matches all Yahoo! Video URLs
829 # _VPAGE_URL matches only the extractable '/watch/' URLs
830 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
831 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
832 IE_NAME = u'video.yahoo'
834 def __init__(self, downloader=None):
835 InfoExtractor.__init__(self, downloader)
837 def report_download_webpage(self, video_id):
838 """Report webpage download."""
839 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
841 def report_extraction(self, video_id):
842 """Report information extraction."""
843 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# `new_video` guards the one-level recursion used to canonicalize non-/watch/
# URLs into extractable /watch/ URLs.
845 def _real_extract(self, url, new_video=True):
846 # Extract ID from URL
847 mobj = re.match(self._VALID_URL, url)
849 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
852 video_id = mobj.group(2)
853 video_extension = 'flv'
855 # Rewrite valid but non-extractable URLs as
856 # extractable English language /watch/ URLs
857 if re.match(self._VPAGE_URL, url) is None:
858 request = urllib2.Request(url)
860 webpage = urllib2.urlopen(request).read()
861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Scrape the canonical id/vid pair from the page JS, then recurse once.
865 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
867 self._downloader.trouble(u'ERROR: Unable to extract id field')
869 yahoo_id = mobj.group(1)
871 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
873 self._downloader.trouble(u'ERROR: Unable to extract vid field')
875 yahoo_vid = mobj.group(1)
877 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
878 return self._real_extract(url, new_video=False)
880 # Retrieve video webpage to extract further information
881 request = urllib2.Request(url)
883 self.report_download_webpage(video_id)
884 webpage = urllib2.urlopen(request).read()
885 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
886 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
889 # Extract uploader and title from webpage
890 self.report_extraction(video_id)
891 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
893 self._downloader.trouble(u'ERROR: unable to extract video title')
895 video_title = mobj.group(1).decode('utf-8')
897 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
899 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the 'people|profile' alternative, not the
# uploader name in group(2) -- looks like a latent bug in the original;
# verify against the full source before changing.
901 video_uploader = mobj.group(1).decode('utf-8')
903 # Extract video thumbnail
904 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
906 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
908 video_thumbnail = mobj.group(1).decode('utf-8')
910 # Extract video description
911 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
913 self._downloader.trouble(u'ERROR: unable to extract video description')
915 video_description = mobj.group(1).decode('utf-8')
916 if not video_description:
917 video_description = 'No description available.'
919 # Extract video height and width
920 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
922 self._downloader.trouble(u'ERROR: unable to extract video height')
924 yv_video_height = mobj.group(1)
926 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
928 self._downloader.trouble(u'ERROR: unable to extract video width')
930 yv_video_width = mobj.group(1)
932 # Retrieve video playlist to extract media URL
933 # I'm not completely sure what all these options are, but we
934 # seem to need most of them, otherwise the server sends a 401.
935 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
936 yv_bitrate = '700' # according to Wikipedia this is hard-coded
937 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
938 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
939 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
941 self.report_download_webpage(video_id)
942 webpage = urllib2.urlopen(request).read()
943 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
944 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
947 # Extract media URL from playlist XML
948 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
950 self._downloader.trouble(u'ERROR: Unable to extract media URL')
952 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
953 video_url = unescapeHTML(video_url)
# Returned info dict (surrounding `return [{...}]` brackets elided).
# NOTE(review): 'thumbnail' is specified twice (source lines 962 and 964);
# the later entry wins in a dict literal -- the .decode('utf-8') at 962 is
# therefore dead. Worth deduplicating in the full source.
956 'id': video_id.decode('utf-8'),
958 'uploader': video_uploader,
959 'upload_date': u'NA',
960 'title': video_title,
961 'ext': video_extension.decode('utf-8'),
962 'thumbnail': video_thumbnail.decode('utf-8'),
963 'description': video_description,
964 'thumbnail': video_thumbnail,
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines (guards,
# try:/return). Code left byte-identical; only comments added.
969 class VimeoIE(InfoExtractor):
970 	"""Information extractor for vimeo.com."""
972 	# _VALID_URL matches Vimeo URLs
973 	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
976 	def __init__(self, downloader=None):
977 		InfoExtractor.__init__(self, downloader)
979 	def report_download_webpage(self, video_id):
980 		"""Report webpage download."""
981 		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
983 	def report_extraction(self, video_id):
984 		"""Report information extraction."""
985 		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
987 	def _real_extract(self, url, new_video=True):
988 		# Extract ID from URL
989 		mobj = re.match(self._VALID_URL, url)
# (numbering gap 989->991: presumably the `if mobj is None:` guard — confirm)
991 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
994 		video_id = mobj.group(1)
996 		# Retrieve video webpage to extract further information
997 		request = urllib2.Request(url, None, std_headers)
# (gap 997->999: a `try:` line is elided; the except at 1001 pairs with it)
999 			self.report_download_webpage(video_id)
1000 			webpage = urllib2.urlopen(request).read()
1001 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1002 			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1005 		# Now we begin extracting as much information as we can from what we
1006 		# retrieved. First we extract the information common to all extractors,
1007 		# and latter we extract those that are Vimeo specific.
1008 		self.report_extraction(video_id)
1010 		# Extract the config JSON
# Fragile: pulls the player config out of the page by naive string splitting
# between ' = {config:' and ',assets:' rather than real HTML/JS parsing.
1011 		config = webpage.split(' = {config:')[1].split(',assets:')[0]
1013 			config = json.loads(config)
1015 			self._downloader.trouble(u'ERROR: unable to extract info section')
1019 		video_title = config["video"]["title"]
1022 		video_uploader = config["video"]["owner"]["name"]
1024 		# Extract video thumbnail
1025 		video_thumbnail = config["video"]["thumbnail"]
1027 		# Extract video description
1028 		video_description = get_element_by_id("description", webpage.decode('utf8'))
1029 		if video_description: video_description = clean_html(video_description)
1030 		else: video_description = ''
1032 		# Extract upload date
1033 		video_upload_date = u'NA'
1034 		mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1035 		if mobj is not None:
1036 			video_upload_date = mobj.group(1)
1038 		# Vimeo specific: extract request signature and timestamp
1039 		sig = config['request']['signature']
1040 		timestamp = config['request']['timestamp']
1042 		# Vimeo specific: extract video codec and quality information
1043 		# TODO bind to format param
# First codec in this preference order that appears in config wins.
1044 		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1045 		for codec in codecs:
1046 			if codec[0] in config["video"]["files"]:
1047 				video_codec = codec[0]
1048 				video_extension = codec[1]
1049 				if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1050 				else: quality = 'sd'
# (gap 1050->1053: presumably a `break` and the for/else no-codec branch — confirm)
1053 			self._downloader.trouble(u'ERROR: no known codec found')
# The actual media URL is a redirect endpoint signed with sig/timestamp.
1056 		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1057 					%(video_id, sig, timestamp, quality, video_codec.upper())
1062 			'uploader': video_uploader,
1063 			'upload_date': video_upload_date,
1064 			'title': video_title,
1065 			'ext': video_extension,
1066 			'thumbnail': video_thumbnail,
1067 			'description': video_description,
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines (guards,
# try:/return). Code left byte-identical; only comments added.
1072 class GenericIE(InfoExtractor):
1073 	"""Generic last-resort information extractor."""
1076 	IE_NAME = u'generic'
1078 	def __init__(self, downloader=None):
1079 		InfoExtractor.__init__(self, downloader)
1081 	def report_download_webpage(self, video_id):
1082 		"""Report webpage download."""
1083 		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1084 		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1086 	def report_extraction(self, video_id):
1087 		"""Report information extraction."""
1088 		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1090 	def report_following_redirect(self, new_url):
1091 		"""Report information extraction."""
1092 		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1094 	def _test_redirect(self, url):
1095 		"""Check if it is a redirect, like url shorteners, in case restart chain."""
# Local Request subclass that issues HEAD instead of GET, to resolve
# shorteners without downloading bodies.
1096 		class HeadRequest(urllib2.Request):
1097 			def get_method(self):
# (gap 1097->1100: method body elided — presumably `return "HEAD"`)
1100 		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1102 			Subclass the HTTPRedirectHandler to make it use our
1103 			HeadRequest also on the redirected URL
1105 			def redirect_request(self, req, fp, code, msg, headers, newurl):
1106 				if code in (301, 302, 303, 307):
1107 					newurl = newurl.replace(' ', '%20')
# Strip body-describing headers before re-issuing as HEAD.
1108 					newheaders = dict((k,v) for k,v in req.headers.items()
1109 									  if k.lower() not in ("content-length", "content-type"))
1110 					return HeadRequest(newurl,
# (gap 1110->1112 and 1112->1115: remaining HeadRequest kwargs elided)
1112 									   origin_req_host=req.get_origin_req_host(),
1115 					raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1117 		class HTTPMethodFallback(urllib2.BaseHandler):
1119 			Fallback to GET if HEAD is not allowed (405 HTTP error)
1121 			def http_error_405(self, req, fp, code, msg, headers):
# (gap 1121->1125: presumably fp.read()/fp.close() cleanup — confirm)
1125 				newheaders = dict((k,v) for k,v in req.headers.items()
1126 								  if k.lower() not in ("content-length", "content-type"))
# Re-open the same URL with a plain (GET) Request through the parent opener.
1127 				return self.parent.open(urllib2.Request(req.get_full_url(),
1129 												 origin_req_host=req.get_origin_req_host(),
# Build a dedicated opener so the custom HEAD/fallback handlers apply only here.
1133 		opener = urllib2.OpenerDirector()
1134 		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1135 						HTTPMethodFallback, HEADRedirectHandler,
1136 						urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1137 			opener.add_handler(handler())
1139 		response = opener.open(HeadRequest(url))
1140 		new_url = response.geturl()
1142 		if url == new_url: return False
# Redirect detected: hand the resolved URL back to the downloader so the
# whole extractor chain restarts on it.
1144 		self.report_following_redirect(new_url)
1145 		self._downloader.download([new_url])
# (gap 1145->1148: presumably `return True` — confirm; _real_extract relies on it)
1148 	def _real_extract(self, url):
1149 		if self._test_redirect(url): return
1151 		video_id = url.split('/')[-1]
1152 		request = urllib2.Request(url)
# (gap 1152->1154: `try:` elided; excepts at 1156/1159 pair with it)
1154 			self.report_download_webpage(video_id)
1155 			webpage = urllib2.urlopen(request).read()
1156 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1157 			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1159 		except ValueError, err:
1160 			# since this is the last-resort InfoExtractor, if
1161 			# this error is thrown, it'll be thrown here
1162 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1165 		self.report_extraction(video_id)
1166 		# Start with something easy: JW Player in SWFObject
1167 		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1169 			# Broaden the search a little bit
1170 			mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1172 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1175 		# It's possible that one of the regexes
1176 		# matched, but returned an empty group:
1177 		if mobj.group(1) is None:
1178 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1181 		video_url = urllib.unquote(mobj.group(1))
1182 		video_id = os.path.basename(video_url)
1184 		# here's a fun little line of code for you:
1185 		video_extension = os.path.splitext(video_id)[1][1:]
1186 		video_id = os.path.splitext(video_id)[0]
1188 		# it's tempting to parse this further, but you would
1189 		# have to take into account all the variations like
1190 		#   Video Title - Site Name
1191 		#   Site Name | Video Title
1192 		#   Video Title - Tagline | Site Name
1193 		# and so on and so forth; it's just not practical
1194 		mobj = re.search(r'<title>(.*)</title>', webpage)
1196 			self._downloader.trouble(u'ERROR: unable to extract title')
1198 		video_title = mobj.group(1).decode('utf-8')
1200 		# video uploader is domain name
1201 		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1203 			self._downloader.trouble(u'ERROR: unable to extract title')
1205 		video_uploader = mobj.group(1).decode('utf-8')
1208 			'id': video_id.decode('utf-8'),
1209 			'url': video_url.decode('utf-8'),
1210 			'uploader': video_uploader,
1211 			'upload_date': u'NA',
1212 			'title': video_title,
1213 			'ext': video_extension.decode('utf-8'),
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1220 class YoutubeSearchIE(InfoExtractor):
1221 	"""Information Extractor for YouTube search queries."""
1222 	_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# gdata v2 API returns pages of up to 50 results as JSON-C.
1223 	_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1224 	_max_youtube_results = 1000
1225 	IE_NAME = u'youtube:search'
1226 	def __init__(self, downloader=None):
1227 		InfoExtractor.__init__(self, downloader)
1229 	def report_download_page(self, query, pagenum):
1230 		"""Report attempt to download playlist page with given number."""
1231 		query = query.decode(preferredencoding())
1232 		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1234 	def _real_extract(self, query):
1235 		mobj = re.match(self._VALID_URL, query)
# (gap 1235->1237: presumably the `if mobj is None:` guard and a `return`)
1237 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split the 'ytsearchN' prefix from the actual query text.
1240 		prefix, query = query.split(':')
1242 		query = query.encode('utf-8')
# (gap 1242->1244: presumably `if prefix == '':` — empty prefix means 1 result)
1244 			self._download_n_results(query, 1)
1246 		elif prefix == 'all':
1247 			self._download_n_results(query, self._max_youtube_results)
# (gap 1247->1253: else branch parsing `n = int(prefix)` and `if n <= 0:` — confirm)
1253 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1255 				elif n > self._max_youtube_results:
1256 					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1257 					n = self._max_youtube_results
1258 				self._download_n_results(query, n)
1260 			except ValueError: # parsing prefix as integer fails
1261 				self._download_n_results(query, 1)
1264 	def _download_n_results(self, query, n):
1265 		"""Downloads a specified number of results for a query"""
# (gap 1265->1271: initialization of video_ids/pagenum/limit elided)
1271 		while (50 * pagenum) < limit:
1272 			self.report_download_page(query, pagenum+1)
# start-index is 1-based, hence the +1.
1273 			result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1274 			request = urllib2.Request(result_url)
# (gap 1274->1276: `try:` elided)
1276 				data = urllib2.urlopen(request).read()
1277 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1278 				self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1280 			api_response = json.loads(data)['data']
1282 			new_ids = list(video['id'] for video in api_response['items'])
1283 			video_ids += new_ids
# Cap at the API-reported total so the loop terminates on short result sets.
1285 			limit = min(n, api_response['totalItems'])
1288 		if len(video_ids) > n:
1289 			video_ids = video_ids[:n]
1290 		for id in video_ids:
1291 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1295 class GoogleSearchIE(InfoExtractor):
1296 	"""Information Extractor for Google Video search queries."""
1297 	_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1298 	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1299 	_VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" pager link; absence ends pagination.
1300 	_MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1301 	_max_google_results = 1000
1302 	IE_NAME = u'video.google:search'
1304 	def __init__(self, downloader=None):
1305 		InfoExtractor.__init__(self, downloader)
1307 	def report_download_page(self, query, pagenum):
1308 		"""Report attempt to download playlist page with given number."""
1309 		query = query.decode(preferredencoding())
1310 		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Same prefix-parsing shape as YoutubeSearchIE._real_extract.
1312 	def _real_extract(self, query):
1313 		mobj = re.match(self._VALID_URL, query)
# (gap 1313->1315: presumably `if mobj is None:` guard and `return`)
1315 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1318 		prefix, query = query.split(':')
1320 		query = query.encode('utf-8')
# (gap 1320->1322: presumably `if prefix == '':`)
1322 			self._download_n_results(query, 1)
1324 		elif prefix == 'all':
1325 			self._download_n_results(query, self._max_google_results)
# (gap 1325->1331: else branch parsing `n = int(prefix)` — confirm)
1331 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1333 				elif n > self._max_google_results:
1334 					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1335 					n = self._max_google_results
1336 				self._download_n_results(query, n)
1338 			except ValueError: # parsing prefix as integer fails
1339 				self._download_n_results(query, 1)
1342 	def _download_n_results(self, query, n):
1343 		"""Downloads a specified number of results for a query"""
# (gap 1343->1349: video_ids/pagenum init and loop header elided)
1349 			self.report_download_page(query, pagenum)
# Google paginates search results 10 at a time via the `start` parameter.
1350 			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1351 			request = urllib2.Request(result_url)
# (gap 1351->1353: `try:` elided)
1353 				page = urllib2.urlopen(request).read()
1354 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1355 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1358 			# Extract video identifiers
1359 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1360 				video_id = mobj.group(1)
1361 				if video_id not in video_ids:
1362 					video_ids.append(video_id)
1363 					if len(video_ids) == n:
1364 						# Specified n videos reached
1365 						for id in video_ids:
1366 							self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" pager link means we've run out of results: flush and stop.
1369 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1370 				for id in video_ids:
1371 					self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1374 			pagenum = pagenum + 1
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1377 class YahooSearchIE(InfoExtractor):
1378 	"""Information Extractor for Yahoo! Video search queries."""
1379 	_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1380 	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1381 	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1382 	_MORE_PAGES_INDICATOR = r'\s*Next'
1383 	_max_yahoo_results = 1000
1384 	IE_NAME = u'video.yahoo:search'
1386 	def __init__(self, downloader=None):
1387 		InfoExtractor.__init__(self, downloader)
1389 	def report_download_page(self, query, pagenum):
1390 		"""Report attempt to download playlist page with given number."""
1391 		query = query.decode(preferredencoding())
1392 		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Same prefix-parsing shape as the other *SearchIE extractors.
1394 	def _real_extract(self, query):
1395 		mobj = re.match(self._VALID_URL, query)
# (gap 1395->1397: presumably `if mobj is None:` guard and `return`)
1397 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1400 		prefix, query = query.split(':')
1402 		query = query.encode('utf-8')
# (gap 1402->1404: presumably `if prefix == '':`)
1404 			self._download_n_results(query, 1)
1406 		elif prefix == 'all':
1407 			self._download_n_results(query, self._max_yahoo_results)
# (gap 1407->1413: else branch parsing `n = int(prefix)` — confirm)
1413 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1415 				elif n > self._max_yahoo_results:
1416 					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1417 					n = self._max_yahoo_results
1418 				self._download_n_results(query, n)
1420 			except ValueError: # parsing prefix as integer fails
1421 				self._download_n_results(query, 1)
1424 	def _download_n_results(self, query, n):
1425 		"""Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedup is via a set alongside the ordered list.
1428 		already_seen = set()
# (gap 1428->1432: video_ids/pagenum init and loop header elided)
1432 			self.report_download_page(query, pagenum)
1433 			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1434 			request = urllib2.Request(result_url)
# (gap 1434->1436: `try:` elided)
1436 				page = urllib2.urlopen(request).read()
1437 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1438 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1441 			# Extract video identifiers
1442 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1443 				video_id = mobj.group(1)
1444 				if video_id not in already_seen:
1445 					video_ids.append(video_id)
1446 					already_seen.add(video_id)
1447 					if len(video_ids) == n:
1448 						# Specified n videos reached
1449 						for id in video_ids:
1450 							self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link means the result set is exhausted: flush and stop.
1453 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1454 				for id in video_ids:
1455 					self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1458 			pagenum = pagenum + 1
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1461 class YoutubePlaylistIE(InfoExtractor):
1462 	"""Information Extractor for YouTube playlists."""
# Matches playlists, artist pages, courses and user channel paginations;
# group(1)=prefix char (p|a|list), group(2)=playlist id, group(3)=optional video id.
1464 	_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1465 	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1466 	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
1467 	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1468 	IE_NAME = u'youtube:playlist'
1470 	def __init__(self, downloader=None):
1471 		InfoExtractor.__init__(self, downloader)
1473 	def report_download_page(self, playlist_id, pagenum):
1474 		"""Report attempt to download playlist page with given number."""
1475 		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1477 	def _real_extract(self, url):
1478 		# Extract playlist id
1479 		mobj = re.match(self._VALID_URL, url)
# (gap 1479->1481: presumably `if mobj is None:` guard and `return`)
1481 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL names a single video inside the playlist: delegate and stop.
1485 		if mobj.group(3) is not None:
1486 			self._downloader.download([mobj.group(3)])
# (gap 1486->1489: presumably `return`)
1489 		# Download playlist pages
1490 		# prefix is 'p' as default for playlists but there are other types that need extra care
1491 		playlist_prefix = mobj.group(1)
1492 		if playlist_prefix == 'a':
1493 			playlist_access = 'artist'
# (gap 1493->1495: `else:` elided)
1495 			playlist_prefix = 'p'
1496 			playlist_access = 'view_play_list'
1497 		playlist_id = mobj.group(2)
# (gap 1497->1502: video_ids/pagenum init and loop header elided)
1502 			self.report_download_page(playlist_id, pagenum)
1503 			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1504 			request = urllib2.Request(url)
# (gap 1504->1506: `try:` elided)
1506 				page = urllib2.urlopen(request).read()
1507 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1508 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1511 			# Extract video identifiers
1513 			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1514 				if mobj.group(1) not in ids_in_page:
1515 					ids_in_page.append(mobj.group(1))
1516 			video_ids.extend(ids_in_page)
1518 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# (gap 1518->1520: presumably `break`)
1520 			pagenum = pagenum + 1
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1522 		playliststart = self._downloader.params.get('playliststart', 1) - 1
1523 		playlistend = self._downloader.params.get('playlistend', -1)
1524 		if playlistend == -1:
1525 			video_ids = video_ids[playliststart:]
# (gap 1525->1527: `else:` elided)
1527 			video_ids = video_ids[playliststart:playlistend]
1529 		for id in video_ids:
1530 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1534 class YoutubeUserIE(InfoExtractor):
1535 	"""Information Extractor for YouTube users."""
1537 	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1538 	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# The gdata uploads feed serves at most 50 entries per request.
1539 	_GDATA_PAGE_SIZE = 50
1540 	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1541 	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1542 	IE_NAME = u'youtube:user'
1544 	def __init__(self, downloader=None):
1545 		InfoExtractor.__init__(self, downloader)
1547 	def report_download_page(self, username, start_index):
1548 		"""Report attempt to download user page."""
1549 		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1550 				(username, start_index, start_index + self._GDATA_PAGE_SIZE))
1552 	def _real_extract(self, url):
1554 		mobj = re.match(self._VALID_URL, url)
# (gap 1554->1556: presumably `if mobj is None:` guard and `return`)
1556 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1559 		username = mobj.group(1)
1561 		# Download video ids using YouTube Data API. Result size per
1562 		# query is limited (currently to 50 videos) so we need to query
1563 		# page by page until there are no video ids - it means we got
# (gap 1563->1570: rest of comment plus video_ids/pagenum init and loop header elided)
# start-index is 1-based in the gdata API.
1570 			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1571 			self.report_download_page(username, start_index)
1573 			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
# (gap 1573->1576: `try:` elided)
1576 				page = urllib2.urlopen(request).read()
1577 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1578 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1581 			# Extract video identifiers
1584 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1585 				if mobj.group(1) not in ids_in_page:
1586 					ids_in_page.append(mobj.group(1))
1588 			video_ids.extend(ids_in_page)
1590 			# A little optimization - if current page is not
1591 			# "full", ie. does not contain PAGE_SIZE video ids then
1592 			# we can assume that this page is the last one - there
1593 			# are no more ids on further pages - no need to query
1596 			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# (gap 1596->1601: presumably `break` and `pagenum += 1`)
1601 		all_ids_count = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1602 		playliststart = self._downloader.params.get('playliststart', 1) - 1
1603 		playlistend = self._downloader.params.get('playlistend', -1)
1605 		if playlistend == -1:
1606 			video_ids = video_ids[playliststart:]
# (gap 1606->1608: `else:` elided)
1608 			video_ids = video_ids[playliststart:playlistend]
1610 		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1611 				(username, all_ids_count, len(video_ids)))
1613 		for video_id in video_ids:
1614 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1617 class BlipTVUserIE(InfoExtractor):
1618 	"""Information Extractor for blip.tv users."""
1620 	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
# NOTE(review): this class reads self._PAGE_SIZE below, but no _PAGE_SIZE
# definition is visible in this extract — it is presumably defined on an
# elided line; verify against the full file.
1622 	IE_NAME = u'blip.tv:user'
1624 	def __init__(self, downloader=None):
1625 		InfoExtractor.__init__(self, downloader)
1627 	def report_download_page(self, username, pagenum):
1628 		"""Report attempt to download user page."""
1629 		self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1630 				(self.IE_NAME, username, pagenum))
1632 	def _real_extract(self, url):
1634 		mobj = re.match(self._VALID_URL, url)
# (gap 1634->1636: presumably `if mobj is None:` guard and `return`)
1636 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1639 		username = mobj.group(1)
# First fetch the profile page to discover the JSON/RSS data-source URL.
1643 		request = urllib2.Request(url)
# (gap 1643->1646: `try:` elided)
1646 			page = urllib2.urlopen(request).read().decode('utf-8')
1647 			mobj = re.search(r'data-source-url="([^"]+)"', page)
# NOTE(review): if data-source-url is absent, mobj.group(1) raises
# AttributeError here, which this except clause does not catch — confirm
# intent against the full file.
1648 			page_base = "http://blip.tv" + unescapeHTML(mobj.group(1))
1649 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1650 			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1654 		# Download video ids using BlipTV Page API. Result size per
1655 		# query is limited (currently to 10 videos) so we need to query
1656 		# page by page until there are no video ids - it means we got
# (gap 1656->1663: rest of comment plus video_ids/pagenum init and loop header elided)
1663 			self.report_download_page(username, pagenum)
# Page API is 1-based, hence pagenum+1.
1665 			request = urllib2.Request( page_base + "&page=" + str(pagenum+1) )
# (gap 1665->1668: `try:` elided)
1668 				page = urllib2.urlopen(request).read().decode('utf-8')
1669 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1670 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1673 			# Extract video identifiers
1676 			for mobj in re.finditer(r'href="/([^"]+)"', page):
1677 				if mobj.group(1) not in ids_in_page:
1678 					ids_in_page.append(unescapeHTML(mobj.group(1)))
1680 			video_ids.extend(ids_in_page)
1682 			# A little optimization - if current page is not
1683 			# "full", ie. does not contain PAGE_SIZE video ids then
1684 			# we can assume that this page is the last one - there
1685 			# are no more ids on further pages - no need to query
1688 			if len(ids_in_page) < self._PAGE_SIZE:
# (gap 1688->1693: presumably `break` and `pagenum += 1`)
1693 		all_ids_count = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1694 		playliststart = self._downloader.params.get('playliststart', 1) - 1
1695 		playlistend = self._downloader.params.get('playlistend', -1)
1697 		if playlistend == -1:
1698 			video_ids = video_ids[playliststart:]
# (gap 1698->1700: `else:` elided)
1700 			video_ids = video_ids[playliststart:playlistend]
1702 		self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1703 				(self.IE_NAME, username, all_ids_count, len(video_ids)))
1705 		for video_id in video_ids:
1706 			self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1709 class DepositFilesIE(InfoExtractor):
1710 	"""Information extractor for depositfiles.com"""
# (?#locale) is a regex comment; the '../' before it matches a 2-char locale path.
1712 	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1713 	IE_NAME = u'DepositFiles'
1715 	def __init__(self, downloader=None):
1716 		InfoExtractor.__init__(self, downloader)
1718 	def report_download_webpage(self, file_id):
1719 		"""Report webpage download."""
1720 		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1722 	def report_extraction(self, file_id):
1723 		"""Report information extraction."""
1724 		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1726 	def _real_extract(self, url):
1727 		file_id = url.split('/')[-1]
1728 		# Rebuild url in english locale
1729 		url = 'http://depositfiles.com/en/files/' + file_id
1731 		# Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
1732 		free_download_indication = { 'gateway_result' : '1' }
1733 		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
# (gap 1733->1735: `try:` elided)
1735 			self.report_download_webpage(file_id)
1736 			webpage = urllib2.urlopen(request).read()
1737 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1738 			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1741 		# Search for the real file URL
1742 		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1743 		if (mobj is None) or (mobj.group(1) is None):
1744 			# Try to figure out reason of the error.
1745 			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1746 			if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site's restriction notice for a one-line error.
1747 				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1748 				self._downloader.trouble(u'ERROR: %s' % restriction_message)
# (gap 1748->1750: `else:` elided)
1750 				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1753 		file_url = mobj.group(1)
1754 		file_extension = os.path.splitext(file_url)[1][1:]
1756 		# Search for file title
1757 		mobj = re.search(r'<b title="(.*?)">', webpage)
# (gap 1757->1759: presumably `if mobj is None:` guard and `return`)
1759 			self._downloader.trouble(u'ERROR: unable to extract title')
1761 		file_title = mobj.group(1).decode('utf-8')
1764 			'id': file_id.decode('utf-8'),
1765 			'url': file_url.decode('utf-8'),
1767 			'upload_date': u'NA',
1768 			'title': file_title,
1769 			'ext': file_extension.decode('utf-8'),
# NOTE(review): numbered, indentation-stripped extract. Leading integers are
# the original file's line numbers; gaps in them mark elided lines.
1775 class FacebookIE(InfoExtractor):
1776 	"""Information Extractor for Facebook"""
1778 	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1779 	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1780 	_NETRC_MACHINE = 'facebook'
# Ordered best-to-worst; format selection below depends on this order.
1781 	_available_formats = ['video', 'highqual', 'lowqual']
1782 	_video_extensions = {
# (gap 1782->1787: dict entries mapping the formats above to extensions elided)
1787 	IE_NAME = u'facebook'
1789 	def __init__(self, downloader=None):
1790 		InfoExtractor.__init__(self, downloader)
1792 	def _reporter(self, message):
1793 		"""Add header and report message."""
1794 		self._downloader.to_screen(u'[facebook] %s' % message)
1796 	def report_login(self):
1797 		"""Report attempt to log in."""
1798 		self._reporter(u'Logging in')
1800 	def report_video_webpage_download(self, video_id):
1801 		"""Report attempt to download video webpage."""
1802 		self._reporter(u'%s: Downloading video webpage' % video_id)
1804 	def report_information_extraction(self, video_id):
1805 		"""Report attempt to extract video information."""
1806 		self._reporter(u'%s: Extracting video information' % video_id)
1808 	def _parse_page(self, video_webpage):
1809 		"""Extract video information from page"""
# Each key maps to a regex scraping one field from inline JS/HTML.
1811 		data = {'title': r'\("video_title", "(.*?)"\)',
1812 			'description': r'<div class="datawrap">(.*?)</div>',
1813 			'owner': r'\("video_owner_name", "(.*?)"\)',
1814 			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# (gap 1814->1817: dict close and video_info init elided)
1817 		for piece in data.keys():
1818 			mobj = re.search(data[piece], video_webpage)
1819 			if mobj is not None:
# Values are JS-escaped inside the utf-8 page; unescape then unquote.
1820 				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# (gap 1820->1824: video_urls init elided)
1824 		for fmt in self._available_formats:
1825 			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1826 			if mobj is not None:
1827 				# URL is in a Javascript segment inside an escaped Unicode format within
1828 				# the generally utf-8 page
1829 				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1830 		video_info['video_urls'] = video_urls
# (gap 1830->1834: presumably `return video_info`)
1834 	def _real_initialize(self):
1835 		if self._downloader is None:
# (gap 1835->1840: presumably `return` plus useremail/password init)
1840 		downloader_params = self._downloader.params
1842 		# Attempt to use provided username and password or .netrc data
1843 		if downloader_params.get('username', None) is not None:
1844 			useremail = downloader_params['username']
1845 			password = downloader_params['password']
1846 		elif downloader_params.get('usenetrc', False):
# (gap 1846->1848: `try:` elided)
1848 				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1849 				if info is not None:
# (gap 1849->1853: unpacking of (useremail, password) from info and `else:` elided)
1853 					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1854 			except (IOError, netrc.NetrcParseError), err:
1855 				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials at all: skip login entirely (anonymous access).
1858 		if useremail is None:
# (gap 1858->1867: `return`, login_form construction, and report_login elided)
1867 		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
# (gap 1867->1870: `try:` elided)
1870 			login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, authentication failed.
1871 			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1872 				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1874 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1875 			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1878 	def _real_extract(self, url):
1879 		mobj = re.match(self._VALID_URL, url)
# (gap 1879->1881: presumably `if mobj is None:` guard and `return`)
1881 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1883 		video_id = mobj.group('ID')
1886 		self.report_video_webpage_download(video_id)
1887 		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
# (gap 1887->1889: `try:` elided)
1889 			page = urllib2.urlopen(request)
1890 			video_webpage = page.read()
1891 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1892 			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1895 		# Start extracting information
1896 		self.report_information_extraction(video_id)
1898 		# Extract information
1899 		video_info = self._parse_page(video_webpage)
1902 		if 'owner' not in video_info:
1903 			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1905 		video_uploader = video_info['owner']
1908 		if 'title' not in video_info:
1909 			self._downloader.trouble(u'ERROR: unable to extract video title')
1911 		video_title = video_info['title']
1912 		video_title = video_title.decode('utf-8')
1915 		if 'thumbnail' not in video_info:
1916 			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1917 			video_thumbnail = ''
# (gap 1917->1919: `else:` elided)
1919 			video_thumbnail = video_info['thumbnail']
# Upload date arrives RFC-2822 formatted; normalize to YYYYMMDD.
1923 		if 'upload_date' in video_info:
1924 			upload_time = video_info['upload_date']
1925 			timetuple = email.utils.parsedate_tz(upload_time)
1926 			if timetuple is not None:
# (gap 1926->1928: `try:` elided — strftime can raise on out-of-range years)
1928 					upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1933 		video_description = video_info.get('description', 'No description available.')
1935 		url_map = video_info['video_urls']
1936 		if len(url_map.keys()) > 0:
1937 			# Decide which formats to download
1938 			req_format = self._downloader.params.get('format', None)
1939 			format_limit = self._downloader.params.get('format_limit', None)
1941 			if format_limit is not None and format_limit in self._available_formats:
1942 				format_list = self._available_formats[self._available_formats.index(format_limit):]
# (gap 1942->1944: `else:` elided)
1944 				format_list = self._available_formats
1945 			existing_formats = [x for x in format_list if x in url_map]
1946 			if len(existing_formats) == 0:
1947 				self._downloader.trouble(u'ERROR: no known formats available for video')
1949 			if req_format is None:
1950 				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1951 			elif req_format == 'worst':
1952 				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1953 			elif req_format == '-1':
1954 				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# (gap 1954->1957: `else:` elided — a specific format was requested)
1957 				if req_format not in url_map:
1958 					self._downloader.trouble(u'ERROR: requested format not available')
1960 				video_url_list = [(req_format, url_map[req_format])] # Specific format
1963 		for format_param, video_real_url in video_url_list:
1965 			video_extension = self._video_extensions.get(format_param, 'mp4')
1968 				'id': video_id.decode('utf-8'),
1969 				'url': video_real_url.decode('utf-8'),
1970 				'uploader': video_uploader.decode('utf-8'),
1971 				'upload_date': upload_date,
1972 				'title': video_title,
1973 				'ext': video_extension.decode('utf-8'),
1974 				'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1975 				'thumbnail': video_thumbnail.decode('utf-8'),
1976 				'description': video_description.decode('utf-8'),
1981 class BlipTVIE(InfoExtractor):
# Fetches blip.tv metadata through the site's JSON API (skin=json), unless the
# URL turns out to be a direct media link (Content-Type: video/*).
# NOTE(review): gaps in the embedded line numbering show that lines were elided
# from this listing (the `if mobj is None:` guards, `try:` lines and `return`
# statements, the definition of `cchar` and `info`, the result dict header).
1982 """Information extractor for blip.tv"""
1984 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Regex used to pull the filename extension off the media URL.
1985 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1986 IE_NAME = u'blip.tv'
1988 def report_extraction(self, file_id):
1989 """Report information extraction."""
1990 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1992 def report_direct_download(self, title):
1993 """Report information extraction."""
1994 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1996 def _real_extract(self, url):
1997 mobj = re.match(self._VALID_URL, url)
1999 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for a JSON description of the page. `cchar` is presumably '?'
# or '&' depending on whether the URL already carries a query string — its
# definition is on an elided line; confirm against the full file.
2006 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2007 request = urllib2.Request(json_url)
2008 self.report_extraction(mobj.group(1))
2011 urlh = urllib2.urlopen(request)
2012 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The URL pointed straight at a media file: derive title/ext from the
# URL's basename instead of parsing JSON.
2013 basename = url.split('/')[-1]
2014 title,ext = os.path.splitext(basename)
2015 title = title.decode('UTF-8')
2016 ext = ext.replace('.', '')
2017 self.report_direct_download(title)
2025 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2026 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2028 if info is None: # Regular URL
# Not a direct download: read and parse the JSON API response.
2030 json_code = urlh.read()
2031 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2032 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2036 json_data = json.loads(json_code)
2037 if 'Post' in json_data:
2038 data = json_data['Post']
# Reformat blip.tv's datestamp (e.g. '11-07-09 06:54AM') to YYYYMMDD.
# NOTE(review): '%H:%M%p' mixes the 24-hour code %H with %p; per the
# time.strptime docs %p only affects the hour when %I is used — confirm
# whether %I was intended here.
2042 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2043 video_url = data['media']['url']
# Derive the container extension from the media URL.
2044 umobj = re.match(self._URL_EXT, video_url)
2046 raise ValueError('Can not determine filename extension')
2047 ext = umobj.group(1)
# Result dict fields, all taken directly from the 'Post' JSON object.
2050 'id': data['item_id'],
2052 'uploader': data['display_name'],
2053 'upload_date': upload_date,
2054 'title': data['title'],
2056 'format': data['media']['mimeType'],
2057 'thumbnail': data['thumbnailUrl'],
2058 'description': data['description'],
2059 'player_url': data['embedUrl']
# Any missing/malformed JSON field surfaces here as ValueError/KeyError.
2061 except (ValueError,KeyError), err:
2062 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2068 class MyVideoIE(InfoExtractor):
2069 """Information Extractor for myvideo.de."""
2071 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2072 IE_NAME = u'myvideo'
2074 def __init__(self, downloader=None):
2075 InfoExtractor.__init__(self, downloader)
2077 def report_download_webpage(self, video_id):
2078 """Report webpage download."""
2079 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2081 def report_extraction(self, video_id):
2082 """Report information extraction."""
2083 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2085 def _real_extract(self,url):
2086 mobj = re.match(self._VALID_URL, url)
2088 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2091 video_id = mobj.group(1)
2094 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2096 self.report_download_webpage(video_id)
2097 webpage = urllib2.urlopen(request).read()
2098 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2099 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2102 self.report_extraction(video_id)
2103 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2106 self._downloader.trouble(u'ERROR: unable to extract media URL')
2108 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2110 mobj = re.search('<title>([^<]+)</title>', webpage)
2112 self._downloader.trouble(u'ERROR: unable to extract title')
2115 video_title = mobj.group(1)
2121 'upload_date': u'NA',
2122 'title': video_title,
2128 class ComedyCentralIE(InfoExtractor):
# Resolves Daily Show / Colbert Report episodes: shortname URLs redirect to the
# newest full episode, then the page's Flash params give an mtvnservices URI
# that is looked up in an MRSS index feed; each <item> gets a mediaGen config
# XML listing renditions, of which the last (highest bitrate) is downloaded.
# NOTE(review): gaps in the embedded numbering show elided lines (the
# `if mobj is None:` guards, `return` statements, `try:` lines, the per-item
# info dict header and the `results` initialisation).
2129 """Information extractor for The Daily Show and Colbert Report """
2131 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2132 IE_NAME = u'comedycentral'
2134 def report_extraction(self, episode_id):
2135 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2137 def report_config_download(self, episode_id):
2138 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2140 def report_index_download(self, episode_id):
2141 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2143 def report_player_url(self, episode_id):
2144 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2146 def _real_extract(self, url):
2147 mobj = re.match(self._VALID_URL, url)
2149 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# ':tds' / ':colbert' style shortcuts map to the show's full-episodes page.
2152 if mobj.group('shortname'):
2153 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2154 url = u'http://www.thedailyshow.com/full-episodes/'
2156 url = u'http://www.colbertnation.com/full-episodes/'
2157 mobj = re.match(self._VALID_URL, url)
2158 assert mobj is not None
# An empty 'episode' group means "download whatever the site redirects to".
2160 dlNewest = not mobj.group('episode')
2162 epTitle = mobj.group('showname')
2164 epTitle = mobj.group('episode')
2166 req = urllib2.Request(url)
2167 self.report_extraction(epTitle)
2169 htmlHandle = urllib2.urlopen(req)
2170 html = htmlHandle.read()
2171 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2172 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Re-match against the post-redirect URL to learn the concrete episode.
2175 url = htmlHandle.geturl()
2176 mobj = re.match(self._VALID_URL, url)
2178 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2180 if mobj.group('episode') == '':
2181 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2183 epTitle = mobj.group('episode')
# The Flash embed (or `var url = "..."`) carries the mtvnservices media URI.
2185 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2186 if len(mMovieParams) == 0:
2187 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirect chain.
2190 playerUrl_raw = mMovieParams[0][0]
2191 self.report_player_url(epTitle)
2193 urlHandle = urllib2.urlopen(playerUrl_raw)
2194 playerUrl = urlHandle.geturl()
2195 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2196 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Download the MRSS index feed listing the episode's video segments.
2199 uri = mMovieParams[0][1]
2200 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2201 self.report_index_download(epTitle)
2203 indexXml = urllib2.urlopen(indexUrl).read()
2204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2210 idoc = xml.etree.ElementTree.fromstring(indexXml)
2211 itemEls = idoc.findall('.//item')
# One mediaGen config download per segment; its <rendition> elements list
# the available bitrates/sources.
2212 for itemEl in itemEls:
2213 mediaId = itemEl.findall('./guid')[0].text
2214 shortMediaId = mediaId.split(':')[-1]
2215 showId = mediaId.split(':')[-2].replace('.com', '')
2216 officialTitle = itemEl.findall('./title')[0].text
2217 officialDate = itemEl.findall('./pubDate')[0].text
2219 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2220 urllib.urlencode({'uri': mediaId}))
2221 configReq = urllib2.Request(configUrl)
2222 self.report_config_download(epTitle)
2224 configXml = urllib2.urlopen(configReq).read()
2225 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2226 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2229 cdoc = xml.etree.ElementTree.fromstring(configXml)
# `turls` is presumably initialised on an elided line just above this loop.
2231 for rendition in cdoc.findall('.//rendition'):
2232 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2236 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2239 # For now, just pick the highest bitrate
2240 format,video_url = turls[-1]
2242 effTitle = showId + u'-' + epTitle
2247 'upload_date': officialDate,
2252 'description': officialTitle,
2253 'player_url': playerUrl
2256 results.append(info)
2261 class EscapistIE(InfoExtractor):
# Scrapes escapistmagazine.com video pages: description/thumbnail/player come
# from <meta> Open Graph tags, and the real media URL from the player's
# `config=` JSON (which is actually JavaScript with single quotes).
# NOTE(review): embedded numbering gaps indicate elided guard/`return` lines.
2262 """Information extractor for The Escapist """
2264 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2265 IE_NAME = u'escapist'
2267 def report_extraction(self, showName):
2268 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2270 def report_config_download(self, showName):
2271 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2273 def _real_extract(self, url):
2274 mobj = re.match(self._VALID_URL, url)
2276 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2278 showName = mobj.group('showname')
2279 videoId = mobj.group('episode')
2281 self.report_extraction(showName)
2283 webPageBytes = urllib2.urlopen(url).read()
2284 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2285 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Page metadata lives in standard <meta>/OpenGraph tags.
2288 webPage = webPageBytes.decode('utf-8')
2289 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2290 description = unescapeHTML(descMatch.group(1))
2291 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2292 imgUrl = unescapeHTML(imgMatch.group(1))
2293 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2294 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL embeds a percent-encoded `config=` parameter pointing at
# the playlist configuration.
2295 configUrlMatch = re.search('config=(.*)$', playerUrl)
2296 configUrl = urllib2.unquote(configUrlMatch.group(1))
2298 self.report_config_download(showName)
2300 configJSON = urllib2.urlopen(configUrl).read()
2301 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2302 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2305 # Technically, it's JavaScript, not JSON
# NOTE(review): this blanket quote swap corrupts the data if any string
# value contains an apostrophe or a double quote — known fragility.
2306 configJSON = configJSON.replace("'", '"')
2309 config = json.loads(configJSON)
2310 except (ValueError,), err:
2311 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The second playlist entry holds the actual video URL (entry 0 is
# presumably an ad or intro — confirm against a live config file).
2314 playlist = config['playlist']
2315 videoUrl = playlist[1]['url']
# Result dict (header and remaining fields are on elided lines).
2320 'uploader': showName,
2321 'upload_date': None,
2325 'thumbnail': imgUrl,
2326 'description': description,
2327 'player_url': playerUrl,
2333 class CollegeHumorIE(InfoExtractor):
# Two-step extraction: scrape the page for the internal numeric video id
# (`id="video:NNN"`), then fetch the moogaloop metadata XML that carries the
# title, description, file URL and thumbnail.
# NOTE(review): embedded numbering gaps indicate elided guard/`try:`/`return`
# lines and the `info` dict header.
2334 """Information extractor for collegehumor.com"""
2336 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2337 IE_NAME = u'collegehumor'
2339 def report_webpage(self, video_id):
2340 """Report information extraction."""
2341 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2343 def report_extraction(self, video_id):
2344 """Report information extraction."""
2345 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2347 def _real_extract(self, url):
2348 mobj = re.match(self._VALID_URL, url)
2350 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2352 video_id = mobj.group('videoid')
2354 self.report_webpage(video_id)
2355 request = urllib2.Request(url)
2357 webpage = urllib2.urlopen(request).read()
2358 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2359 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds its internal id as id="video:<number>".
2362 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2364 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2366 internal_video_id = m.group('internalvideoid')
2370 'internal_id': internal_video_id,
# Fetch the moogaloop player's metadata XML for this internal id.
2373 self.report_extraction(video_id)
2374 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2376 metaXml = urllib2.urlopen(xmlUrl).read()
2377 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2378 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2381 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Missing XML elements raise IndexError here, caught on an elided line
# (see the trouble() call below).
2383 videoNode = mdoc.findall('./video')[0]
2384 info['description'] = videoNode.findall('./description')[0].text
2385 info['title'] = videoNode.findall('./caption')[0].text
2386 info['url'] = videoNode.findall('./file')[0].text
2387 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension/format are derived from the media URL's suffix.
2388 info['ext'] = info['url'].rpartition('.')[2]
2389 info['format'] = info['ext']
2391 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2397 class XVideosIE(InfoExtractor):
# Scrapes xvideos.com watch pages: the FLV URL comes percent-encoded from the
# `flv_url=` player parameter, the title from <title>, the thumbnail from a
# CDN URL pattern in the page.
# NOTE(review): embedded numbering gaps indicate elided guard/`try:`/`return`
# lines and the result dict header.
2398 """Information extractor for xvideos.com"""
2400 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2401 IE_NAME = u'xvideos'
2403 def report_webpage(self, video_id):
2404 """Report information extraction."""
2405 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2407 def report_extraction(self, video_id):
2408 """Report information extraction."""
2409 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2411 def _real_extract(self, url):
2412 mobj = re.match(self._VALID_URL, url)
2414 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2416 video_id = mobj.group(1).decode('utf-8')
2418 self.report_webpage(video_id)
# Canonicalise the page URL from the captured numeric id.
2420 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2422 webpage = urllib2.urlopen(request).read()
2423 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2424 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2427 self.report_extraction(video_id)
# The player's flv_url parameter is percent-encoded; unquote it.
2431 mobj = re.search(r'flv_url=(.+?)&', webpage)
2433 self._downloader.trouble(u'ERROR: unable to extract video url')
2435 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> tag minus the trailing " - XVID..." suffix.
2439 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2441 self._downloader.trouble(u'ERROR: unable to extract video title')
2443 video_title = mobj.group(1).decode('utf-8')
2446 # Extract video thumbnail
2447 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2449 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2451 video_thumbnail = mobj.group(1).decode('utf-8')
# Result dict fields (header and remaining fields are on elided lines).
2457 'upload_date': None,
2458 'title': video_title,
2461 'thumbnail': video_thumbnail,
2462 'description': None,
2469 class SoundcloudIE(InfoExtractor):
# Scrapes soundcloud.com track pages for the media uid and stream token, then
# composes the media.soundcloud.com stream URL from them.
# NOTE(review): embedded numbering gaps indicate elided lines (guards,
# `return` statements, `upload_date` initialisation, and — presumably — the
# urlopen of the crossdomain.xml request built near the end).
2470 """Information extractor for soundcloud.com
2471 To access the media, the uid of the song and a stream token
2472 must be extracted from the page source and the script must make
2473 a request to media.soundcloud.com/crossdomain.xml. Then
2474 the media can be grabbed by requesting from an url composed
2475 of the stream token and uid
2478 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2479 IE_NAME = u'soundcloud'
2481 def __init__(self, downloader=None):
2482 InfoExtractor.__init__(self, downloader)
2484 def report_webpage(self, video_id):
2485 """Report information extraction."""
2486 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2488 def report_extraction(self, video_id):
2489 """Report information extraction."""
2490 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2492 def _real_extract(self, url):
2493 mobj = re.match(self._VALID_URL, url)
2495 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2498 # extract uploader (which is in the url)
2499 uploader = mobj.group(1).decode('utf-8')
2500 # extract simple title (uploader + slug of song title)
2501 slug_title = mobj.group(2).decode('utf-8')
2502 simple_title = uploader + u'-' + slug_title
2504 self.report_webpage('%s/%s' % (uploader, slug_title))
2506 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2508 webpage = urllib2.urlopen(request).read()
2509 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2510 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2513 self.report_extraction('%s/%s' % (uploader, slug_title))
2515 # extract uid and stream token that soundcloud hands out for access
2516 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2518 video_id = mobj.group(1)
2519 stream_token = mobj.group(2)
2521 # extract unsimplified title
2522 mobj = re.search('"title":"(.*?)",', webpage)
2524 title = mobj.group(1).decode('utf-8')
2526 title = simple_title
2528 # construct media url (with uid/token)
2529 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2530 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; default used when the markup is absent.
2533 description = u'No description available'
2534 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2536 description = mobj.group(1)
# Parse the human-readable date (e.g. 'November 8, 2010 14:30') into
# YYYYMMDD; parse failures are only logged, not fatal.
2540 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2543 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2544 except Exception, e:
2545 self._downloader.to_stderr(str(e))
2547 # for soundcloud, a request to a cross domain is required for cookies
2548 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
# Result dict fields (header and remaining fields are on elided lines).
2551 'id': video_id.decode('utf-8'),
2553 'uploader': uploader.decode('utf-8'),
2554 'upload_date': upload_date,
2559 'description': description.decode('utf-8')
2563 class InfoQIE(InfoExtractor):
# Scrapes infoq.com presentation pages: the media path is base64-encoded in a
# `jsclassref` attribute and is appended to an rtmpe:// base URL; title and
# description come from inline JS and a <meta> tag.
# NOTE(review): embedded numbering gaps indicate elided guard/`try:`/`return`
# lines, the IE_NAME assignment, and the result dict header.
2564 """Information extractor for infoq.com"""
2566 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2569 def report_webpage(self, video_id):
2570 """Report information extraction."""
2571 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2573 def report_extraction(self, video_id):
2574 """Report information extraction."""
2575 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2577 def _real_extract(self, url):
2578 mobj = re.match(self._VALID_URL, url)
2580 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2583 self.report_webpage(url)
2585 request = urllib2.Request(url)
2587 webpage = urllib2.urlopen(request).read()
2588 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2589 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2592 self.report_extraction(url)
# jsclassref holds the base64-encoded, percent-encoded media path.
2596 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2598 self._downloader.trouble(u'ERROR: unable to extract video url')
2600 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
# Title is assigned to a JS variable in the page source.
2604 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2606 self._downloader.trouble(u'ERROR: unable to extract video title')
2608 video_title = mobj.group(1).decode('utf-8')
2610 # Extract description
2611 video_description = u'No description available.'
2612 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2613 if mobj is not None:
2614 video_description = mobj.group(1).decode('utf-8')
# The media path's basename doubles as id and extension, e.g. 'foo.mp4'.
# NOTE(review): .split('.') breaks if the filename contains extra dots —
# rsplit/os.path.splitext would be safer; confirm before changing.
2616 video_filename = video_url.split('/')[-1]
2617 video_id, extension = video_filename.split('.')
# Result dict fields (header and remaining fields are on elided lines).
2623 'upload_date': None,
2624 'title': video_title,
2626 'format': extension, # Extension is always(?) mp4, but seems to be flv
2628 'description': video_description,
2634 class MixcloudIE(InfoExtractor):
# Uses mixcloud's JSON API (api/1/cloudcast/...) to list 'audio_formats';
# each format maps either to a plain url list or to {bitrate: url list}.
# Tries URLs until one answers, honouring the downloader's 'format' and
# 'listformats' params.
# NOTE(review): embedded numbering gaps indicate elided lines (guards,
# `try:` lines, `return` statements, and the loop break/else handling in
# _real_extract).
2635 """Information extractor for www.mixcloud.com"""
2636 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2637 IE_NAME = u'mixcloud'
2639 def __init__(self, downloader=None):
2640 InfoExtractor.__init__(self, downloader)
2642 def report_download_json(self, file_id):
2643 """Report JSON download."""
2644 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2646 def report_extraction(self, file_id):
2647 """Report information extraction."""
2648 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2650 def get_urls(self, jsonData, fmt, bitrate='best'):
2651 """Get urls from 'audio_formats' section in json"""
2654 bitrate_list = jsonData[fmt]
2655 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# NOTE(review): max() over the bitrate keys — if the keys are strings
# this picks the lexicographic, not numeric, maximum; confirm the API's
# key type before relying on "highest".
2656 bitrate = max(bitrate_list) # select highest
2658 url_list = jsonData[fmt][bitrate]
# Some formats carry a flat url list instead of a bitrate dict; indexing
# by bitrate then raises TypeError.
2659 except TypeError: # we have no bitrate info.
2660 url_list = jsonData[fmt]
2663 def check_urls(self, url_list):
2664 """Returns 1st active url from list"""
2665 for url in url_list:
2667 urllib2.urlopen(url)
# Dead URL: fall through to the next candidate (the success-`return`
# and failure paths are on elided lines).
2669 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2674 def _print_formats(self, formats):
2675 print 'Available formats:'
2676 for fmt in formats.keys():
2677 for b in formats[fmt]:
2679 ext = formats[fmt][b][0]
2680 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
# Flat url list (no bitrate dict): show '??' for the bitrate column.
2681 except TypeError: # we have no bitrate info
2682 ext = formats[fmt][0]
2683 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2686 def _real_extract(self, url):
2687 mobj = re.match(self._VALID_URL, url)
2689 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2691 # extract uploader & filename from url
2692 uploader = mobj.group(1).decode('utf-8')
2693 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2695 # construct API request
2696 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2697 # retrieve .json file with links to files
2698 request = urllib2.Request(file_url)
2700 self.report_download_json(file_url)
2701 jsonData = urllib2.urlopen(request).read()
2702 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2703 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2707 json_data = json.loads(jsonData)
2708 player_url = json_data['player_swf_url']
2709 formats = dict(json_data['audio_formats'])
2711 req_format = self._downloader.params.get('format', None)
# --list-formats: print the table and stop.
2714 if self._downloader.params.get('listformats', None):
2715 self._print_formats(formats)
# Default/best: probe each format until a live URL is found.
2718 if req_format is None or req_format == 'best':
2719 for format_param in formats.keys():
2720 url_list = self.get_urls(formats, format_param)
2722 file_url = self.check_urls(url_list)
2723 if file_url is not None:
# Specific format requested: validate it exists, then probe its URLs.
2726 if req_format not in formats.keys():
2727 self._downloader.trouble(u'ERROR: format is not available')
2730 url_list = self.get_urls(formats, req_format)
2731 file_url = self.check_urls(url_list)
2732 format_param = req_format
2735 'id': file_id.decode('utf-8'),
2736 'url': file_url.decode('utf-8'),
2737 'uploader': uploader.decode('utf-8'),
2738 'upload_date': u'NA',
2739 'title': json_data['name'],
2740 'ext': file_url.split('.')[-1].decode('utf-8'),
2741 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2742 'thumbnail': json_data['thumbnail_url'],
2743 'description': json_data['description'],
2744 'player_url': player_url.decode('utf-8'),
2747 class StanfordOpenClassroomIE(InfoExtractor):
# Three-level extractor: a VideoPage URL yields one video (metadata XML next
# to the course's video folder); a CoursePage yields references to all its
# VideoPages; the bare root yields references to all CoursePages. References
# are expanded recursively via self.extract().
# NOTE(review): embedded numbering gaps indicate elided lines (guards,
# `try:`/`return` statements, `info` dict headers, the list-append loops'
# surrounding statements).
2748 """Information extractor for Stanford's Open ClassRoom"""
2750 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2751 IE_NAME = u'stanfordoc'
2753 def report_download_webpage(self, objid):
2754 """Report information extraction."""
2755 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2757 def report_extraction(self, video_id):
2758 """Report information extraction."""
2759 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2761 def _real_extract(self, url):
2762 mobj = re.match(self._VALID_URL, url)
2764 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# --- Case 1: a concrete video within a course -------------------------
2767 if mobj.group('course') and mobj.group('video'): # A specific video
2768 course = mobj.group('course')
2769 video = mobj.group('video')
2771 'id': course + '_' + video,
2774 self.report_extraction(info['id'])
# Per-video metadata XML sits alongside the course's videos folder.
2775 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2776 xmlUrl = baseUrl + video + '.xml'
2778 metaXml = urllib2.urlopen(xmlUrl).read()
2779 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2780 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2782 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2784 info['title'] = mdoc.findall('./title')[0].text
2785 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2787 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2789 info['ext'] = info['url'].rpartition('.')[2]
2790 info['format'] = info['ext']
# --- Case 2: a course page — collect references to its videos ---------
2792 elif mobj.group('course'): # A course page
2793 course = mobj.group('course')
2799 self.report_download_webpage(info['id'])
2801 coursepage = urllib2.urlopen(url).read()
2802 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2803 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
# Course title from <h1>, falling back to the id.
2806 m = re.search('<h1>([^<]+)</h1>', coursepage)
2808 info['title'] = unescapeHTML(m.group(1))
2810 info['title'] = info['id']
2812 m = re.search('<description>([^<]+)</description>', coursepage)
2814 info['description'] = unescapeHTML(m.group(1))
# Deduplicated VideoPage links become 'reference' entries.
2816 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2819 'type': 'reference',
2820 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract each referenced video page.
2824 for entry in info['list']:
2825 assert entry['type'] == 'reference'
2826 results += self.extract(entry['url'])
# --- Case 3: the root page — collect references to all courses --------
2831 'id': 'Stanford OpenClassroom',
2835 self.report_download_webpage(info['id'])
2836 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2838 rootpage = urllib2.urlopen(rootURL).read()
2839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2840 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2843 info['title'] = info['id']
2845 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2848 'type': 'reference',
2849 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
# Recursively extract each referenced course page.
2854 for entry in info['list']:
2855 assert entry['type'] == 'reference'
2856 results += self.extract(entry['url'])
2859 class MTVIE(InfoExtractor):
2860 """Information extractor for MTV.com"""
2862 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2865 def report_webpage(self, video_id):
2866 """Report information extraction."""
2867 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2869 def report_extraction(self, video_id):
2870 """Report information extraction."""
2871 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2873 def _real_extract(self, url):
2874 mobj = re.match(self._VALID_URL, url)
2876 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2878 if not mobj.group('proto'):
2879 url = 'http://' + url
2880 video_id = mobj.group('videoid')
2881 self.report_webpage(video_id)
2883 request = urllib2.Request(url)
2885 webpage = urllib2.urlopen(request).read()
2886 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2887 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2890 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2892 self._downloader.trouble(u'ERROR: unable to extract song name')
2894 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2895 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2897 self._downloader.trouble(u'ERROR: unable to extract performer')
2899 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2900 video_title = performer + ' - ' + song_name
2902 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2904 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2906 mtvn_uri = mobj.group(1)
2908 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2910 self._downloader.trouble(u'ERROR: unable to extract content id')
2912 content_id = mobj.group(1)
2914 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2915 self.report_extraction(video_id)
2916 request = urllib2.Request(videogen_url)
2918 metadataXml = urllib2.urlopen(request).read()
2919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2920 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2923 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2924 renditions = mdoc.findall('.//rendition')
2926 # For now, always pick the highest quality.
2927 rendition = renditions[-1]
2930 _,_,ext = rendition.attrib['type'].partition('/')
2931 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2932 video_url = rendition.find('./src').text
2934 self._downloader.trouble('Invalid rendition field.')
2940 'uploader': performer,
2941 'title': video_title,