2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information from the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title and simplified
32 title, author and others. The information is stored in a dictionary
33 which is then passed to the FileDownloader. The FileDownloader
34 processes this information possibly downloading the video to the file
35 system, among other possible outcomes. The dictionaries must include
40 uploader: Nickname of the video uploader.
42 stitle: Simplified title.
43 ext: Video filename extension.
45 player_url: SWF Player URL (may be None).
47 The following fields are optional. Their primary purpose is to allow
48 youtube-dl to serve as the backend for a video search function, such
49 as the one in youtube2mp3. They are only used when their respective
50 forced printing functions are called:
52 thumbnail: Full URL to a video thumbnail image.
53 description: One-line video description.
55 Subclasses of this one should re-define the _real_initialize() and
56 _real_extract() methods and define a _VALID_URL regexp.
57 Probably, they should also be added to the list of extractors.
63 def __init__(self, downloader=None):
64 """Constructor. Receives an optional downloader."""
66 self.set_downloader(downloader)
68 def suitable(self, url):
69 """Receives a URL and returns True if suitable for this IE."""
# Subclasses must define _VALID_URL; re.match anchors at the start of url.
70 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the enclosing `def initialize(self):` header line is elided
# in this listing — the next two lines are its docstring and body.
73 """Initializes an instance (authentication, etc)."""
75 self._real_initialize()
78 def extract(self, url):
79 """Extracts URL information and returns it in list of dicts."""
81 return self._real_extract(url)
83 def set_downloader(self, downloader):
84 """Sets the downloader for this IE."""
85 self._downloader = downloader
# Template methods: subclasses override these two hooks.
87 def _real_initialize(self):
88 """Real initialization process. Redefine in subclasses."""
91 def _real_extract(self, url):
92 """Real extraction process. Redefine in subclasses."""
96 class YoutubeIE(InfoExtractor):
97 """Information extractor for youtube.com."""
# Group 1 of _VALID_URL matches the scheme/host/path prefix; the video id is
# captured by group 2 (see video_id = mobj.group(2) in _real_extract).
99 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
100 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
101 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
102 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
103 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
104 _NETRC_MACHINE = 'youtube'
105 # Listed in order of quality
106 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
107 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# Maps itag -> container extension (most entries elided in this listing).
108 _video_extensions = {
114 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# Maps itag -> "WxH" display string for _print_formats (entries elided in this listing).
120 _video_dimensions = {
138 def report_lang(self):
139 """Report attempt to set language."""
140 self._downloader.to_screen(u'[youtube] Setting language')
142 def report_login(self):
143 """Report attempt to log in."""
144 self._downloader.to_screen(u'[youtube] Logging in')
146 def report_age_confirmation(self):
147 """Report attempt to confirm age."""
148 self._downloader.to_screen(u'[youtube] Confirming age')
150 def report_video_webpage_download(self, video_id):
151 """Report attempt to download video webpage."""
152 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
154 def report_video_info_webpage_download(self, video_id):
155 """Report attempt to download video info webpage."""
156 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
158 def report_video_subtitles_download(self, video_id):
159 """Report attempt to download video subtitles."""
160 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
162 def report_information_extraction(self, video_id):
163 """Report attempt to extract video information."""
164 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
166 def report_unavailable_format(self, video_id, format):
167 """Report that the requested format is not available."""
168 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
170 def report_rtmp_download(self):
171 """Indicate the download will use the RTMP protocol."""
172 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Convert YouTube's closed-caption XML into SubRip (.srt) text.
174 def _closed_captions_xml_to_srt(self, xml_string):
# NOTE(review): this listing elides the srt accumulator initialization, the
# float() conversion of `start`, the per-caption index line, and the final
# return statement — confirm against the full file.
176 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
177 # TODO parse xml instead of regex
178 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# Default duration when the <text> tag omits dur.
179 if not dur: dur = '4'
181 end = start + float(dur)
# Format seconds as SRT timestamps: HH:MM:SS,mmm.
182 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
183 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
184 caption = unescapeHTML(caption)
185 caption = unescapeHTML(caption) # double cycle, intentional
187 srt += start + ' --> ' + end + '\n'
188 srt += caption + '\n\n'
# Print each format's itag, container extension and dimensions.
191 def _print_formats(self, formats):
192 print 'Available formats:'
# NOTE(review): the `for x in formats:` loop header is elided in this listing.
194 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Set language, then (optionally) log in and confirm age.
196 def _real_initialize(self):
197 if self._downloader is None:
202 downloader_params = self._downloader.params
204 # Attempt to use provided username and password or .netrc data
205 if downloader_params.get('username', None) is not None:
206 username = downloader_params['username']
207 password = downloader_params['password']
208 elif downloader_params.get('usenetrc', False):
# Look up the 'youtube' machine entry in ~/.netrc.
210 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
215 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
216 except (IOError, netrc.NetrcParseError), err:
217 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Request the hl=en/gl=US page — presumably to force English pages so later
# regexes match; failure is only a warning.
221 request = urllib2.Request(self._LANG_URL)
224 urllib2.urlopen(request).read()
225 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
226 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
229 # No authentication to be performed
# login_form dict (its opening line is elided in this listing):
235 'current_form': 'loginForm',
237 'action_login': 'Log In',
238 'username': username,
239 'password': password,
241 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
244 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, authentication failed.
245 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
246 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
248 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
249 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# age_form dict (its opening line is elided in this listing):
255 'action_confirm': 'Confirm',
257 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
259 self.report_age_confirmation()
260 age_results = urllib2.urlopen(request).read()
261 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
262 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
265 def _real_extract(self, url):
266 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
267 mobj = re.search(self._NEXT_URL_RE, url)
269 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
271 # Extract video id from URL
272 mobj = re.match(self._VALID_URL, url)
274 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
276 video_id = mobj.group(2)
# Get video webpage (has_verified=1 is passed along with gl/hl overrides).
279 self.report_video_webpage_download(video_id)
280 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
282 video_webpage = urllib2.urlopen(request).read()
283 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
284 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
287 # Attempt to extract SWF player URL
288 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Strip the JS backslash-escaping from the matched URL.
290 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' values until get_video_info returns a token.
295 self.report_video_info_webpage_download(video_id)
296 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
297 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
298 % (video_id, el_type))
299 request = urllib2.Request(video_info_url)
301 video_info_webpage = urllib2.urlopen(request).read()
# The info page is a URL-encoded query string: parse into dict of lists.
302 video_info = parse_qs(video_info_webpage)
303 if 'token' in video_info:
305 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
306 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
308 if 'token' not in video_info:
309 if 'reason' in video_info:
310 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
312 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
315 # Start extracting information
316 self.report_information_extraction(video_id)
# uploader
319 if 'author' not in video_info:
320 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
322 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
325 if 'title' not in video_info:
326 self._downloader.trouble(u'ERROR: unable to extract video title')
328 video_title = urllib.unquote_plus(video_info['title'][0])
329 video_title = video_title.decode('utf-8')
330 video_title = sanitize_title(video_title)
333 simple_title = simplify_title(video_title)
# thumbnail: missing thumbnail is only a warning, not fatal
336 if 'thumbnail_url' not in video_info:
337 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
339 else: # don't panic if we can't find it
340 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page, normalized to YYYYMMDD
344 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
346 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
347 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
348 for expression in format_expressions:
350 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description
355 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
356 if video_description: video_description = clean_html(video_description)
357 else: video_description = ''
# closed captions, converted to .srt when --write-subtitles is requested
360 video_subtitles = None
361 if self._downloader.params.get('writesubtitles', False):
363 self.report_video_subtitles_download(video_id)
364 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
366 srt_list = urllib2.urlopen(request).read()
367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
368 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
369 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
370 if not srt_lang_list:
371 raise Trouble(u'WARNING: video has no closed captions')
# Language preference: user-requested, then English, then the first listed.
372 if self._downloader.params.get('subtitleslang', False):
373 srt_lang = self._downloader.params.get('subtitleslang')
374 elif 'en' in srt_lang_list:
377 srt_lang = srt_lang_list[0]
378 if not srt_lang in srt_lang_list:
379 raise Trouble(u'WARNING: no closed captions found in the specified language')
380 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
382 srt_xml = urllib2.urlopen(request).read()
383 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
384 raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
385 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
# Subtitle failures are non-fatal: the Trouble message is reported and
# extraction continues with video_subtitles left as None.
386 except Trouble as trouble:
387 self._downloader.trouble(trouble[0])
# token
390 video_token = urllib.unquote_plus(video_info['token'][0])
392 # Decide which formats to download
393 req_format = self._downloader.params.get('format', None)
395 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
396 self.report_rtmp_download()
397 video_url_list = [(None, video_info['conn'][0])]
398 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Each comma-separated entry is itself a query string carrying itag and url.
399 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
400 url_data = [parse_qs(uds) for uds in url_data_strs]
401 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
402 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
404 format_limit = self._downloader.params.get('format_limit', None)
405 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
# format_limit caps quality: keep only formats at or below the limit.
406 if format_limit is not None and format_limit in available_formats:
407 format_list = available_formats[available_formats.index(format_limit):]
409 format_list = available_formats
410 existing_formats = [x for x in format_list if x in url_map]
411 if len(existing_formats) == 0:
412 self._downloader.trouble(u'ERROR: no known formats available for video')
414 if self._downloader.params.get('listformats', None):
415 self._print_formats(existing_formats)
417 if req_format is None or req_format == 'best':
418 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
419 elif req_format == 'worst':
420 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
421 elif req_format in ('-1', 'all'):
422 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
424 # Specific formats. We pick the first in a slash-delimited sequence.
425 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
426 req_formats = req_format.split('/')
427 video_url_list = None
428 for rf in req_formats:
430 video_url_list = [(rf, url_map[rf])]
432 if video_url_list is None:
433 self._downloader.trouble(u'ERROR: requested format not available')
436 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dictionary per selected format.
440 for format_param, video_real_url in video_url_list:
442 video_extension = self._video_extensions.get(format_param, 'flv')
445 'id': video_id.decode('utf-8'),
446 'url': video_real_url.decode('utf-8'),
447 'uploader': video_uploader.decode('utf-8'),
448 'upload_date': upload_date,
449 'title': video_title,
450 'stitle': simple_title,
451 'ext': video_extension.decode('utf-8'),
452 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
453 'thumbnail': video_thumbnail.decode('utf-8'),
454 'description': video_description,
455 'player_url': player_url,
456 'subtitles': video_subtitles
461 class MetacafeIE(InfoExtractor):
462 """Information Extractor for metacafe.com."""
# Group 1 of _VALID_URL is the video id, group 2 the URL title slug.
464 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
465 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
466 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
467 IE_NAME = u'metacafe'
469 def __init__(self, downloader=None):
470 InfoExtractor.__init__(self, downloader)
472 def report_disclaimer(self):
473 """Report disclaimer retrieval."""
474 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
476 def report_age_confirmation(self):
477 """Report attempt to confirm age."""
478 self._downloader.to_screen(u'[metacafe] Confirming age')
480 def report_download_webpage(self, video_id):
481 """Report webpage download."""
482 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
484 def report_extraction(self, video_id):
485 """Report information extraction."""
486 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the disclaimer page, then POST the family-filter form.
488 def _real_initialize(self):
489 # Retrieve disclaimer
490 request = urllib2.Request(self._DISCLAIMER)
492 self.report_disclaimer()
493 disclaimer = urllib2.urlopen(request).read()
494 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
495 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# disclaimer_form dict (other fields elided in this listing):
501 'submit': "Continue - I'm over 18",
503 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
505 self.report_age_confirmation()
506 disclaimer = urllib2.urlopen(request).read()
507 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
508 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
511 def _real_extract(self, url):
512 # Extract id and simplified title from URL
513 mobj = re.match(self._VALID_URL, url)
515 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
518 video_id = mobj.group(1)
520 # Check if video comes from YouTube
521 mobj2 = re.match(r'^yt-(.*)$', video_id)
522 if mobj2 is not None:
# Delegate yt-prefixed ids back to the downloader (YouTube extractor).
523 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
526 simple_title = mobj.group(2).decode('utf-8')
528 # Retrieve video webpage to extract further information
529 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
531 self.report_download_webpage(video_id)
532 webpage = urllib2.urlopen(request).read()
533 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
534 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
537 # Extract URL, uploader and title from webpage
538 self.report_extraction(video_id)
539 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
541 mediaURL = urllib.unquote(mobj.group(1))
# The extension is taken from the last three characters of the media URL.
542 video_extension = mediaURL[-3:]
544 # Extract gdaKey if available
545 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
549 gdaKey = mobj.group(1)
550 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: pull mediaURL and key out of the flashvars query string.
552 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
554 self._downloader.trouble(u'ERROR: unable to extract media URL')
556 vardict = parse_qs(mobj.group(1))
557 if 'mediaData' not in vardict:
558 self._downloader.trouble(u'ERROR: unable to extract media URL')
560 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
562 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Undo the JSON-style escaped slashes.
564 mediaURL = mobj.group(1).replace('\\/', '/')
565 video_extension = mediaURL[-3:]
566 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
568 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
570 self._downloader.trouble(u'ERROR: unable to extract title')
572 video_title = mobj.group(1).decode('utf-8')
573 video_title = sanitize_title(video_title)
575 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
577 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
579 video_uploader = mobj.group(1)
# Info dictionary (surrounding return-list lines elided in this listing).
582 'id': video_id.decode('utf-8'),
583 'url': video_url.decode('utf-8'),
584 'uploader': video_uploader.decode('utf-8'),
585 'upload_date': u'NA',
586 'title': video_title,
587 'stitle': simple_title,
588 'ext': video_extension.decode('utf-8'),
594 class DailymotionIE(InfoExtractor):
595 """Information Extractor for Dailymotion"""
# Group 1 of _VALID_URL is the video id (up to the first underscore),
# group 2 the title slug.
597 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
598 IE_NAME = u'dailymotion'
600 def __init__(self, downloader=None):
601 InfoExtractor.__init__(self, downloader)
603 def report_download_webpage(self, video_id):
604 """Report webpage download."""
605 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
607 def report_extraction(self, video_id):
608 """Report information extraction."""
609 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
611 def _real_extract(self, url):
612 # Extract id and simplified title from URL
613 mobj = re.match(self._VALID_URL, url)
615 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
618 video_id = mobj.group(1)
620 video_extension = 'flv'
622 # Retrieve video webpage to extract further information
623 request = urllib2.Request(url)
# Send the family_filter=off cookie so filtered videos are served.
624 request.add_header('Cookie', 'family_filter=off')
626 self.report_download_webpage(video_id)
627 webpage = urllib2.urlopen(request).read()
628 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
629 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
632 # Extract URL, uploader and title from webpage
633 self.report_extraction(video_id)
634 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
636 self._downloader.trouble(u'ERROR: unable to extract media URL')
638 sequence = urllib.unquote(mobj.group(1))
# The SD media URL lives inside the "sequence" flash variable.
639 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
641 self._downloader.trouble(u'ERROR: unable to extract media URL')
643 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
645 # if needed add http://www.dailymotion.com/ if relative URL
# NOTE(review): the line assigning video_url from mediaURL is elided in
# this listing — confirm against the full file.
649 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
651 self._downloader.trouble(u'ERROR: unable to extract title')
653 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
654 video_title = sanitize_title(video_title)
655 simple_title = simplify_title(video_title)
657 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
659 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
661 video_uploader = mobj.group(1)
# Info dictionary (surrounding return-list lines elided in this listing).
664 'id': video_id.decode('utf-8'),
665 'url': video_url.decode('utf-8'),
666 'uploader': video_uploader.decode('utf-8'),
667 'upload_date': u'NA',
668 'title': video_title,
669 'stitle': simple_title,
670 'ext': video_extension.decode('utf-8'),
676 class GoogleIE(InfoExtractor):
677 """Information extractor for video.google.com."""
679 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
680 IE_NAME = u'video.google'
682 def __init__(self, downloader=None):
683 InfoExtractor.__init__(self, downloader)
685 def report_download_webpage(self, video_id):
686 """Report webpage download."""
687 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
689 def report_extraction(self, video_id):
690 """Report information extraction."""
691 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
693 def _real_extract(self, url):
694 # Extract id from URL
695 mobj = re.match(self._VALID_URL, url)
697 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
700 video_id = mobj.group(1)
702 video_extension = 'mp4'
704 # Retrieve video webpage to extract further information
705 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
707 self.report_download_webpage(video_id)
708 webpage = urllib2.urlopen(request).read()
709 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
710 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
713 # Extract URL, uploader, and title from webpage
714 self.report_extraction(video_id)
# First try the mp4 download_url; otherwise fall back to the escaped
# flv videoUrl and switch the extension accordingly.
715 mobj = re.search(r"download_url:'([^']+)'", webpage)
717 video_extension = 'flv'
718 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
720 self._downloader.trouble(u'ERROR: unable to extract media URL')
722 mediaURL = urllib.unquote(mobj.group(1))
# Replace the JS hex escapes with the literal '=' and '&' characters.
723 mediaURL = mediaURL.replace('\\x3d', '\x3d')
724 mediaURL = mediaURL.replace('\\x26', '\x26')
# NOTE(review): the line assigning video_url from mediaURL is elided in
# this listing — confirm against the full file.
728 mobj = re.search(r'<title>(.*)</title>', webpage)
730 self._downloader.trouble(u'ERROR: unable to extract title')
732 video_title = mobj.group(1).decode('utf-8')
733 video_title = sanitize_title(video_title)
734 simple_title = simplify_title(video_title)
736 # Extract video description
737 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
739 self._downloader.trouble(u'ERROR: unable to extract video description')
741 video_description = mobj.group(1).decode('utf-8')
742 if not video_description:
743 video_description = 'No description available.'
745 # Extract video thumbnail
746 if self._downloader.params.get('forcethumbnail', False):
# Presumably the thumbnail is scraped from the search-results page for
# this id — TODO confirm against the full file.
747 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
749 webpage = urllib2.urlopen(request).read()
750 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
751 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
753 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
755 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
757 video_thumbnail = mobj.group(1)
758 else: # we need something to pass to process_info
# Info dictionary (surrounding return-list lines elided in this listing).
762 'id': video_id.decode('utf-8'),
763 'url': video_url.decode('utf-8'),
765 'upload_date': u'NA',
766 'title': video_title,
767 'stitle': simple_title,
768 'ext': video_extension.decode('utf-8'),
774 class PhotobucketIE(InfoExtractor):
775 """Information extractor for photobucket.com."""
# Group 1 of _VALID_URL is the .flv filename from the 'current' parameter.
777 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
778 IE_NAME = u'photobucket'
780 def __init__(self, downloader=None):
781 InfoExtractor.__init__(self, downloader)
783 def report_download_webpage(self, video_id):
784 """Report webpage download."""
785 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
787 def report_extraction(self, video_id):
788 """Report information extraction."""
789 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
791 def _real_extract(self, url):
792 # Extract id from URL
793 mobj = re.match(self._VALID_URL, url)
795 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
798 video_id = mobj.group(1)
800 video_extension = 'flv'
802 # Retrieve video webpage to extract further information
803 request = urllib2.Request(url)
805 self.report_download_webpage(video_id)
806 webpage = urllib2.urlopen(request).read()
807 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
808 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
811 # Extract URL, uploader, and title from webpage
812 self.report_extraction(video_id)
813 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
815 self._downloader.trouble(u'ERROR: unable to extract media URL')
817 mediaURL = urllib.unquote(mobj.group(1))
# NOTE(review): the line assigning video_url from mediaURL is elided in
# this listing — confirm against the full file.
# Title and uploader come from the same <title> regex (groups 1 and 2).
821 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
823 self._downloader.trouble(u'ERROR: unable to extract title')
825 video_title = mobj.group(1).decode('utf-8')
826 video_title = sanitize_title(video_title)
827 simple_title = simplify_title(video_title)
829 video_uploader = mobj.group(2).decode('utf-8')
# Info dictionary (surrounding return-list lines elided in this listing).
832 'id': video_id.decode('utf-8'),
833 'url': video_url.decode('utf-8'),
834 'uploader': video_uploader,
835 'upload_date': u'NA',
836 'title': video_title,
837 'stitle': simple_title,
838 'ext': video_extension.decode('utf-8'),
844 class YahooIE(InfoExtractor):
845 """Information extractor for video.yahoo.com."""
847 # _VALID_URL matches all Yahoo! Video URLs
848 # _VPAGE_URL matches only the extractable '/watch/' URLs
849 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
850 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
851 IE_NAME = u'video.yahoo'
853 def __init__(self, downloader=None):
854 InfoExtractor.__init__(self, downloader)
856 def report_download_webpage(self, video_id):
857 """Report webpage download."""
858 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
860 def report_extraction(self, video_id):
861 """Report information extraction."""
862 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# new_video=False marks the single recursive retry made after rewriting a
# non-extractable URL into a /watch/ URL (prevents unbounded recursion only
# by convention — the flag is not re-checked in the visible lines).
864 def _real_extract(self, url, new_video=True):
865 # Extract ID from URL
866 mobj = re.match(self._VALID_URL, url)
868 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
871 video_id = mobj.group(2)
872 video_extension = 'flv'
874 # Rewrite valid but non-extractable URLs as
875 # extractable English language /watch/ URLs
876 if re.match(self._VPAGE_URL, url) is None:
877 request = urllib2.Request(url)
879 webpage = urllib2.urlopen(request).read()
880 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
881 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
884 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
886 self._downloader.trouble(u'ERROR: Unable to extract id field')
888 yahoo_id = mobj.group(1)
890 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
892 self._downloader.trouble(u'ERROR: Unable to extract vid field')
894 yahoo_vid = mobj.group(1)
896 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
897 return self._real_extract(url, new_video=False)
899 # Retrieve video webpage to extract further information
900 request = urllib2.Request(url)
902 self.report_download_webpage(video_id)
903 webpage = urllib2.urlopen(request).read()
904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
905 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
908 # Extract uploader and title from webpage
909 self.report_extraction(video_id)
910 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
912 self._downloader.trouble(u'ERROR: unable to extract video title')
914 video_title = mobj.group(1).decode('utf-8')
915 simple_title = simplify_title(video_title)
917 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
919 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) captures 'people' or 'profile' from the href, while
# the uploader name is in group(2) — this looks like a bug; confirm intent.
921 video_uploader = mobj.group(1).decode('utf-8')
923 # Extract video thumbnail
924 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
926 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
928 video_thumbnail = mobj.group(1).decode('utf-8')
930 # Extract video description
931 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
933 self._downloader.trouble(u'ERROR: unable to extract video description')
935 video_description = mobj.group(1).decode('utf-8')
936 if not video_description:
937 video_description = 'No description available.'
939 # Extract video height and width
940 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
942 self._downloader.trouble(u'ERROR: unable to extract video height')
944 yv_video_height = mobj.group(1)
946 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
948 self._downloader.trouble(u'ERROR: unable to extract video width')
950 yv_video_width = mobj.group(1)
952 # Retrieve video playlist to extract media URL
953 # I'm not completely sure what all these options are, but we
954 # seem to need most of them, otherwise the server sends a 401.
955 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
956 yv_bitrate = '700' # according to Wikipedia this is hard-coded
957 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
958 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
959 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
961 self.report_download_webpage(video_id)
962 webpage = urllib2.urlopen(request).read()
963 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
964 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
967 # Extract media URL from playlist XML
968 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
970 self._downloader.trouble(u'ERROR: Unable to extract media URL')
972 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
973 video_url = unescapeHTML(video_url)
# Info dictionary (surrounding return-list lines elided in this listing).
976 'id': video_id.decode('utf-8'),
978 'uploader': video_uploader,
979 'upload_date': u'NA',
980 'title': video_title,
981 'stitle': simple_title,
982 'ext': video_extension.decode('utf-8'),
983 'thumbnail': video_thumbnail.decode('utf-8'),
984 'description': video_description,
# NOTE(review): duplicate 'thumbnail' key — this second, undecoded entry
# silently overrides the decoded one above; one of the two should go.
985 'thumbnail': video_thumbnail,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Extracts the title, uploader, thumbnail, description, upload date
    and a play_redirect media URL from a Vimeo video page.
    """

    # _VALID_URL matches Vimeo URLs; group(1) is the numeric video id.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Vimeo URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player setup script.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
            self._downloader.trouble(u'ERROR: unable to extract info section')

        # Title and its simplified (filename-safe) form.
        video_title = config["video"]["title"]
        simple_title = simplify_title(video_title)

        # Uploader nickname from the config JSON.
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description from the HTML page (not the JSON).
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (optional; stays u'NA' when the span is absent).
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information.
        # Codecs are probed in preference order; the matching one also
        # fixes the file extension.
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                # Prefer the HD variant when the chosen codec offers one.
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
            self._downloader.trouble(u'ERROR: no known codec found')

        # Build the play_redirect URL from the signature/timestamp pair.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, quality, video_codec.upper())

            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific extractor matched.  Follows URL-shortener
    redirects, then scrapes the page for an obvious media URL (JW Player
    flashvars or a file=/source= query parameter).
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn first: reaching this extractor means every specific IE failed.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # Request subclass that issues HEAD instead of GET (cheap probe).
        class HeadRequest(urllib2.Request):
            def get_method(self):

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Escape spaces so the redirected URL stays well-formed.
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers: the HEAD retry carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                                        origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener wired with the HEAD-aware handlers above.
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect: let normal extraction continue.
        if url == new_url: return False

        self.report_following_redirect(new_url)
        # Restart the download chain with the resolved URL.
        self._downloader.download([new_url])

    def _real_extract(self, url):
        # Redirect targets are re-dispatched through the downloader.
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_uploader = mobj.group(1).decode('utf-8')

        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' pseudo-URLs and
    hands each found watch URL back to the downloader.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData v2 JSON-C search endpoint; page size is fixed at 50.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch the right result count."""
        mobj = re.match(self._VALID_URL, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            # Bare 'ytsearch:' — first result only.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                # Clamp oversized requests to the API's practical maximum.
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        # Pages of 50 ids are fetched until the limit is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
                data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exists.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles 'gvsearch:', 'gvsearchN:' and 'gvsearchall:' pseudo-URLs by
    scraping the HTML result pages.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Regex that pulls each result's docid out of the result page HTML.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next page" link marker.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch the right result count."""
        mobj = re.match(self._VALID_URL, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            # Bare 'gvsearch:' — first result only.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                # Clamp oversized requests to the supported maximum.
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No "next page" link: flush whatever was collected and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles 'yvsearch:', 'yvsearchN:' and 'yvsearchall:' pseudo-URLs by
    scraping the HTML result pages.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Regex that pulls each result's watch path (id/subid) from the page.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch the right result count."""
        mobj = re.match(self._VALID_URL, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            # Bare 'yvsearch:' — first result only.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                # Clamp oversized requests to the supported maximum.
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        # Deduplication set: result pages can repeat ids.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No "Next" link: flush whatever was collected and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks playlist pages, collects watch ids, applies the
    playliststart/playlistend window, and re-dispatches each watch URL.
    """

    # group(1): playlist type char (p/a/list); group(2): playlist id;
    # group(3): an optional direct video id embedded in the URL.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # URL carried a single video id: hand it straight to the downloader.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging when the "Next" link disappears.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply the user's playlist window (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through a user's uploads feed via the GData API, collects
    video ids, applies the playliststart/playlistend window, and
    re-dispatches each watch URL.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Page size of the uploads feed (GData caps results per request).
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply the user's playlist window (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the 'Free download' button and scrapes the real
    file URL (or the site's restriction message) from the response.
    """

    # (?#locale) is a regex comment: the '../' part matches a locale path.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 emulates the button).
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction message to the user.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': file_title,
        'stitle': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Logs in (credentials or .netrc) during initialization, then scrapes
    the video page's JavaScript for title, owner, thumbnail and the
    per-format source URLs.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Formats in descending quality order (index order matters below).
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes keyed by the info field they capture from the page's JS.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Collect one source URL per available format.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log into Facebook with explicit credentials or .netrc data."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: extraction may still work anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # The login form being echoed back means the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = simplify_title(video_title)

        # thumbnail image (missing thumbnail is only a warning, not fatal)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        # upload date
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # Normalize to YYYYMMDD.
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # Extension defaults to mp4 when the format is unknown.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Queries the site's JSON API (skin=json); if the URL already serves a
    video Content-Type it is treated as a direct download instead.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask the same URL for its JSON representation.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))

            urlh = urllib2.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL itself is the media file; derive metadata from its name.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                    'stitle': simplify_title(title),
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

        if info is None: # Regular URL
                json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']

                # blip.tv uses a m-d-y h:m(am|pm) datestamp; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'stitle': simplify_title(data['title']),
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
            except (ValueError,KeyError), err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2009 class MyVideoIE(InfoExtractor):
2010 """Information Extractor for myvideo.de."""
2012 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2013 IE_NAME = u'myvideo'
2015 def __init__(self, downloader=None):
2016 InfoExtractor.__init__(self, downloader)
2018 def report_download_webpage(self, video_id):
2019 """Report webpage download."""
2020 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2022 def report_extraction(self, video_id):
2023 """Report information extraction."""
2024 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2026 def _real_extract(self,url):
2027 mobj = re.match(self._VALID_URL, url)
2029 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2032 video_id = mobj.group(1)
2035 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2037 self.report_download_webpage(video_id)
2038 webpage = urllib2.urlopen(request).read()
2039 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2040 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2043 self.report_extraction(video_id)
2044 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2047 self._downloader.trouble(u'ERROR: unable to extract media URL')
2049 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2051 mobj = re.search('<title>([^<]+)</title>', webpage)
2053 self._downloader.trouble(u'ERROR: unable to extract title')
2056 video_title = mobj.group(1)
2057 video_title = sanitize_title(video_title)
2059 simple_title = simplify_title(video_title)
2065 'upload_date': u'NA',
2066 'title': video_title,
2067 'stitle': simple_title,
# NOTE(review): elided capture — guard lines ('if mobj is None:', 'try:',
# 'return', 'else:', parts of the per-item info dict and the turls/results
# list setup) are missing from this dump; comments document visible logic only.
#
# Extractor for thedailyshow.com / colbertnation.com full episodes: resolves
# the ':tds'/':cr' shortcuts to the newest-episode URL, finds the Flash
# player URL in the page, downloads the MRSS show index, then fetches a
# mediaGen config per item and picks the highest-bitrate rendition.
2073 class ComedyCentralIE(InfoExtractor):
2074 """Information extractor for The Daily Show and Colbert Report """
2076 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2077 IE_NAME = u'comedycentral'
2079 def report_extraction(self, episode_id):
2080 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2082 def report_config_download(self, episode_id):
2083 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2085 def report_index_download(self, episode_id):
2086 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2088 def report_player_url(self, episode_id):
2089 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2091 def _real_extract(self, url):
2092 mobj = re.match(self._VALID_URL, url)
2094 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# shortname form (':tds', ':colbert', ...) → rewrite to the show's
# full-episodes landing page and re-match so the named groups exist
2097 if mobj.group('shortname'):
2098 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2099 url = u'http://www.thedailyshow.com/full-episodes/'
2101 url = u'http://www.colbertnation.com/full-episodes/'
2102 mobj = re.match(self._VALID_URL, url)
2103 assert mobj is not None
# no episode in the URL means "download the newest episode"
2105 dlNewest = not mobj.group('episode')
2107 epTitle = mobj.group('showname')
2109 epTitle = mobj.group('episode')
2111 req = urllib2.Request(url)
2112 self.report_extraction(epTitle)
2114 htmlHandle = urllib2.urlopen(req)
2115 html = htmlHandle.read()
2116 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2117 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# the landing page redirects to a concrete episode; re-match the final URL
2120 url = htmlHandle.geturl()
2121 mobj = re.match(self._VALID_URL, url)
2123 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2125 if mobj.group('episode') == '':
2126 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2128 epTitle = mobj.group('episode')
# group 0 = full player URL, group 1 = the mtvnservices uri used below
2130 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2131 if len(mMovieParams) == 0:
2132 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2135 playerUrl_raw = mMovieParams[0][0]
2136 self.report_player_url(epTitle)
# follow redirects to get the canonical SWF player URL
2138 urlHandle = urllib2.urlopen(playerUrl_raw)
2139 playerUrl = urlHandle.geturl()
2140 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2141 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2144 uri = mMovieParams[0][1]
2145 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2146 self.report_index_download(epTitle)
2148 indexXml = urllib2.urlopen(indexUrl).read()
2149 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2150 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# one <item> per act/segment of the episode
2155 idoc = xml.etree.ElementTree.fromstring(indexXml)
2156 itemEls = idoc.findall('.//item')
2157 for itemEl in itemEls:
2158 mediaId = itemEl.findall('./guid')[0].text
2159 shortMediaId = mediaId.split(':')[-1]
2160 showId = mediaId.split(':')[-2].replace('.com', '')
2161 officialTitle = itemEl.findall('./title')[0].text
2162 officialDate = itemEl.findall('./pubDate')[0].text
2164 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2165 urllib.urlencode({'uri': mediaId}))
2166 configReq = urllib2.Request(configUrl)
2167 self.report_config_download(epTitle)
2169 configXml = urllib2.urlopen(configReq).read()
2170 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2171 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# collect (bitrate, src) pairs; the append into turls is elided here
2174 cdoc = xml.etree.ElementTree.fromstring(configXml)
2176 for rendition in cdoc.findall('.//rendition'):
2177 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2181 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2184 # For now, just pick the highest bitrate
2185 format,video_url = turls[-1]
2187 effTitle = showId + u'-' + epTitle
# per-item info dict (remaining keys elided in this capture)
2192 'upload_date': officialDate,
2194 'stitle': simplify_title(effTitle),
2198 'description': officialTitle,
2199 'player_url': playerUrl
2202 results.append(info)
# NOTE(review): elided capture — 'if ... is None:' guards, 'try:' openers and
# 'return' statements are missing from this dump.
#
# Extractor for escapistmagazine.com: scrapes description/thumbnail/player
# URL from og: meta tags, pulls the 'config=' query off the player URL,
# fetches that JS config, coerces it into JSON, and takes playlist[1].
2207 class EscapistIE(InfoExtractor):
2208 """Information extractor for The Escapist """
2210 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2211 IE_NAME = u'escapist'
2213 def report_extraction(self, showName):
2214 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2216 def report_config_download(self, showName):
2217 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2219 def _real_extract(self, url):
2220 mobj = re.match(self._VALID_URL, url)
2222 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2224 showName = mobj.group('showname')
2225 videoId = mobj.group('episode')
2227 self.report_extraction(showName)
2229 webPage = urllib2.urlopen(url).read()
2230 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2231 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# NOTE(review): no None checks on these matches — a page-layout change
# would raise AttributeError on .group(1) rather than call trouble().
2234 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2235 description = unescapeHTML(descMatch.group(1))
2236 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2237 imgUrl = unescapeHTML(imgMatch.group(1))
2238 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2239 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# the player URL carries the config location in its 'config=' parameter
2240 configUrlMatch = re.search('config=(.*)$', playerUrl)
2241 configUrl = urllib2.unquote(configUrlMatch.group(1))
2243 self.report_config_download(showName)
2245 configJSON = urllib2.urlopen(configUrl).read()
2246 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2247 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2250 # Technically, it's JavaScript, not JSON
2251 configJSON = configJSON.replace("'", '"')
2254 config = json.loads(configJSON)
2255 except (ValueError,), err:
2256 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# second playlist entry holds the actual video URL
2259 playlist = config['playlist']
2260 videoUrl = playlist[1]['url']
# info dict (remaining keys elided in this capture)
2265 'uploader': showName,
2266 'upload_date': None,
2268 'stitle': simplify_title(showName),
2271 'thumbnail': imgUrl,
2272 'description': description,
2273 'player_url': playerUrl,
# NOTE(review): elided capture — guards, 'try:' openers, 'return' statements
# and parts of the info dict are missing from this dump.
#
# Extractor for collegehumor.com: maps the public video id to the internal
# id embedded in the page, then reads the moogaloop metadata XML for the
# title, description, file URL and thumbnail.
2280 class CollegeHumorIE(InfoExtractor):
2281 """Information extractor for collegehumor.com"""
2282 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2283 IE_NAME = u'collegehumor'
2285 def report_webpage(self, video_id):
2286 """Report information extraction."""
2287 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2289 def report_extraction(self, video_id):
2290 """Report information extraction."""
2291 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2293 def _real_extract(self, url):
2294 mobj = re.match(self._VALID_URL, url)
2296 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2298 video_id = mobj.group('videoid')
2300 self.report_webpage(video_id)
2301 request = urllib2.Request(url)
2303 webpage = urllib2.urlopen(request).read()
2304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2305 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# the page embeds an internal id as id="video:NNN"
2308 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2310 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2312 internal_video_id = m.group('internalvideoid')
# info dict started here (other initial keys elided in this capture)
2316 'internal_id': internal_video_id,
2319 self.report_extraction(video_id)
2320 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2322 metaXml = urllib2.urlopen(xmlUrl).read()
2323 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2324 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# indexing [0] on findall() raises IndexError on missing nodes, which is
# what the Invalid-metadata handler below reports
2327 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2329 videoNode = mdoc.findall('./video')[0]
2330 info['description'] = videoNode.findall('./description')[0].text
2331 info['title'] = videoNode.findall('./caption')[0].text
2332 info['stitle'] = simplify_title(info['title'])
2333 info['url'] = videoNode.findall('./file')[0].text
2334 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# extension = everything after the last dot of the file URL
2335 info['ext'] = info['url'].rpartition('.')[2]
2336 info['format'] = info['ext']
2338 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): elided capture — guards, 'try:' openers, 'return' statements
# and parts of the info dict are missing from this dump.
#
# Extractor for xvideos.com: scrapes the percent-encoded flv_url, the page
# title and the thumbnail URL straight out of the watch page's HTML.
2344 class XVideosIE(InfoExtractor):
2345 """Information extractor for xvideos.com"""
2347 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2348 IE_NAME = u'xvideos'
2350 def report_webpage(self, video_id):
2351 """Report information extraction."""
2352 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2354 def report_extraction(self, video_id):
2355 """Report information extraction."""
2356 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2358 def _real_extract(self, url):
2359 mobj = re.match(self._VALID_URL, url)
2361 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2363 video_id = mobj.group(1).decode('utf-8')
2365 self.report_webpage(video_id)
# always re-fetch via the canonical URL form
2367 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2369 webpage = urllib2.urlopen(request).read()
2370 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2371 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2374 self.report_extraction(video_id)
# media URL is percent-encoded in a 'flv_url=' query fragment
2378 mobj = re.search(r'flv_url=(.+?)&', webpage)
2380 self._downloader.trouble(u'ERROR: unable to extract video url')
2382 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# title is the <title> text up to the " - XVID" suffix
2386 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2388 self._downloader.trouble(u'ERROR: unable to extract video title')
2390 video_title = mobj.group(1).decode('utf-8')
2393 # Extract video thumbnail
2394 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2396 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2398 video_thumbnail = mobj.group(1).decode('utf-8')
# info dict (remaining keys elided in this capture)
2404 'upload_date': None,
2405 'title': video_title,
2406 'stitle': simplify_title(video_title),
2409 'thumbnail': video_thumbnail,
2410 'description': None,
# NOTE(review): elided capture — 'if mobj is None:' guards, 'try:' openers,
# 'return' statements and parts of the info dict are missing from this dump.
#
# Extractor for soundcloud.com: parses uploader and track slug from the URL,
# scrapes the track uid + stream_token from the page, and composes the
# media.soundcloud.com stream URL from them.
2417 class SoundcloudIE(InfoExtractor):
2418 """Information extractor for soundcloud.com
2419 To access the media, the uid of the song and a stream token
2420 must be extracted from the page source and the script must make
2421 a request to media.soundcloud.com/crossdomain.xml. Then
2422 the media can be grabbed by requesting from an url composed
2423 of the stream token and uid
2426 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2427 IE_NAME = u'soundcloud'
2429 def __init__(self, downloader=None):
2430 InfoExtractor.__init__(self, downloader)
2432 def report_webpage(self, video_id):
2433 """Report information extraction."""
2434 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2436 def report_extraction(self, video_id):
2437 """Report information extraction."""
2438 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2440 def _real_extract(self, url):
2441 mobj = re.match(self._VALID_URL, url)
2443 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2446 # extract uploader (which is in the url)
2447 uploader = mobj.group(1).decode('utf-8')
2448 # extract simple title (uploader + slug of song title)
2449 slug_title = mobj.group(2).decode('utf-8')
2450 simple_title = uploader + '-' + slug_title
2452 self.report_webpage('%s/%s' % (uploader, slug_title))
2454 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2456 webpage = urllib2.urlopen(request).read()
2457 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2458 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2461 self.report_extraction('%s/%s' % (uploader, slug_title))
2463 # extract uid and stream token that soundcloud hands out for access
2464 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2466 video_id = mobj.group(1)
2467 stream_token = mobj.group(2)
2469 # extract unsimplified title
2470 mobj = re.search('"title":"(.*?)",', webpage)
2472 title = mobj.group(1)
2474 # construct media url (with uid/token)
2475 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2476 mediaURL = mediaURL % (video_id, stream_token)
# default used when the pretty-date scrape below fails (elided assignment path)
2479 description = u'No description available'
2480 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2482 description = mobj.group(1)
# upload date, e.g. "on November 4, 2010 14:15" -> YYYYMMDD
2486 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2489 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2490 except Exception, e:
2493 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): std_headers is passed as the second positional argument of
# urllib2.Request, which is 'data' (POST body), not 'headers' — this looks
# like it was meant to be headers=std_headers; confirm before relying on it.
2494 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
# info dict (remaining keys, e.g. 'url'/'ext', elided in this capture)
2497 'id': video_id.decode('utf-8'),
2499 'uploader': uploader.decode('utf-8'),
2500 'upload_date': upload_date,
2501 'title': simple_title.decode('utf-8'),
2502 'stitle': simple_title.decode('utf-8'),
2506 'description': description.decode('utf-8')
# NOTE(review): elided capture — guards, 'try:' openers, 'return' statements
# and parts of the info dict are missing from this dump.
#
# Extractor for infoq.com presentations: the page embeds a base64-encoded
# path in jsclassref='...'; decoding it and prefixing the rtmpe host yields
# the streaming URL.
2510 class InfoQIE(InfoExtractor):
2511 """Information extractor for infoq.com"""
2513 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2516 def report_webpage(self, video_id):
2517 """Report information extraction."""
2518 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2520 def report_extraction(self, video_id):
2521 """Report information extraction."""
2522 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2524 def _real_extract(self, url):
2525 mobj = re.match(self._VALID_URL, url)
2527 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2530 self.report_webpage(url)
2532 request = urllib2.Request(url)
2534 webpage = urllib2.urlopen(request).read()
2535 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2536 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2539 self.report_extraction(url)
# jsclassref holds a base64-encoded, percent-encoded media path
2543 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2545 self._downloader.trouble(u'ERROR: unable to extract video url')
2547 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2551 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2553 self._downloader.trouble(u'ERROR: unable to extract video title')
2555 video_title = mobj.group(1).decode('utf-8')
2557 # Extract description
2558 video_description = u'No description available.'
2559 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2560 if mobj is not None:
2561 video_description = mobj.group(1).decode('utf-8')
# derive id/extension from the media filename
# NOTE(review): two-value unpack assumes exactly one dot in the filename —
# a name like 'a.b.mp4' would raise ValueError here; confirm acceptable.
2563 video_filename = video_url.split('/')[-1]
2564 video_id, extension = video_filename.split('.')
# info dict (remaining keys elided in this capture)
2570 'upload_date': None,
2571 'title': video_title,
2572 'stitle': simplify_title(video_title),
2574 'format': extension, # Extension is always(?) mp4, but seems to be flv
2576 'description': video_description,
# NOTE(review): elided capture — 'try:' openers, 'return' statements, guards
# and loop-exit lines are missing from this dump.
#
# Extractor for mixcloud.com: fetches the cloudcast's JSON API document,
# then picks a download URL from its 'audio_formats' map (format -> either
# a url list or a bitrate -> url-list map).
2582 class MixcloudIE(InfoExtractor):
2583 """Information extractor for www.mixcloud.com"""
2584 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2585 IE_NAME = u'mixcloud'
2587 def __init__(self, downloader=None):
2588 InfoExtractor.__init__(self, downloader)
2590 def report_download_json(self, file_id):
2591 """Report JSON download."""
2592 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2594 def report_extraction(self, file_id):
2595 """Report information extraction."""
2596 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2598 def get_urls(self, jsonData, fmt, bitrate='best'):
2599 """Get urls from 'audio_formats' section in json"""
# bitrate selection: 'best'/unknown -> highest key available; a flat
# url list (no bitrate map) raises TypeError and is used directly
2602 bitrate_list = jsonData[fmt]
2603 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2604 bitrate = max(bitrate_list) # select highest
2606 url_list = jsonData[fmt][bitrate]
2607 except TypeError: # we have no bitrate info.
2608 url_list = jsonData[fmt]
2611 def check_urls(self, url_list):
2612 """Returns 1st active url from list"""
# probe each candidate with a GET; network errors skip to the next one
2613 for url in url_list:
2615 urllib2.urlopen(url)
2617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2622 def _print_formats(self, formats):
2623 print 'Available formats:'
2624 for fmt in formats.keys():
2625 for b in formats[fmt]:
2627 ext = formats[fmt][b][0]
2628 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2629 except TypeError: # we have no bitrate info
# flat url list: print '??' for the unknown bitrate and stop iterating
2630 ext = formats[fmt][0]
2631 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2634 def _real_extract(self, url):
2635 mobj = re.match(self._VALID_URL, url)
2637 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2639 # extract uploader & filename from url
2640 uploader = mobj.group(1).decode('utf-8')
2641 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2643 # construct API request
2644 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2645 # retrieve .json file with links to files
2646 request = urllib2.Request(file_url)
2648 self.report_download_json(file_url)
2649 jsonData = urllib2.urlopen(request).read()
2650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2651 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2655 json_data = json.loads(jsonData)
2656 player_url = json_data['player_swf_url']
2657 formats = dict(json_data['audio_formats'])
2659 req_format = self._downloader.params.get('format', None)
# -F / --list-formats: print the table and stop
2662 if self._downloader.params.get('listformats', None):
2663 self._print_formats(formats)
# no explicit format: first format whose URL list yields a live URL wins
2666 if req_format is None or req_format == 'best':
2667 for format_param in formats.keys():
2668 url_list = self.get_urls(formats, format_param)
2670 file_url = self.check_urls(url_list)
2671 if file_url is not None:
2674 if req_format not in formats.keys():
2675 self._downloader.trouble(u'ERROR: format is not available')
2678 url_list = self.get_urls(formats, req_format)
2679 file_url = self.check_urls(url_list)
2680 format_param = req_format
# final info dict (surrounding return/bracket lines elided in this capture)
2683 'id': file_id.decode('utf-8'),
2684 'url': file_url.decode('utf-8'),
2685 'uploader': uploader.decode('utf-8'),
2686 'upload_date': u'NA',
2687 'title': json_data['name'],
2688 'stitle': simplify_title(json_data['name']),
2689 'ext': file_url.split('.')[-1].decode('utf-8'),
2690 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2691 'thumbnail': json_data['thumbnail_url'],
2692 'description': json_data['description'],
2693 'player_url': player_url.decode('utf-8'),
# NOTE(review): elided capture — 'try:' openers, 'return' statements, list
# setup lines and some loop bodies are missing from this dump.
#
# Extractor for Stanford Open Classroom. Three URL shapes are handled:
#   course+video -> a single playlist entry (metadata from a per-video XML),
#   course only  -> a 'playlist' info dict whose entries reference each
#                   VideoPage and are recursively resolved via self.extract,
#   root         -> a playlist of all CoursePage links, likewise recursed.
2696 class StanfordOpenClassroomIE(InfoExtractor):
2697 """Information extractor for Stanford's Open ClassRoom"""
2699 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2700 IE_NAME = u'stanfordoc'
2702 def report_download_webpage(self, objid):
2703 """Report information extraction."""
2704 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2706 def report_extraction(self, video_id):
2707 """Report information extraction."""
2708 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2710 def _real_extract(self, url):
2711 mobj = re.match(self._VALID_URL, url)
2713 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2716 if mobj.group('course') and mobj.group('video'): # A specific video
2717 course = mobj.group('course')
2718 video = mobj.group('video')
# info dict started here; other initial keys elided in this capture
2720 'id': simplify_title(course + '_' + video),
2723 self.report_extraction(info['id'])
2724 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2725 xmlUrl = baseUrl + video + '.xml'
2727 metaXml = urllib2.urlopen(xmlUrl).read()
2728 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2729 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
# missing <title>/<videoFile> nodes surface as IndexError -> handler below
2731 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2733 info['title'] = mdoc.findall('./title')[0].text
2734 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2736 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2738 info['stitle'] = simplify_title(info['title'])
2739 info['ext'] = info['url'].rpartition('.')[2]
2740 info['format'] = info['ext']
2742 elif mobj.group('course'): # A course page
2743 course = mobj.group('course')
2745 'id': simplify_title(course),
2749 self.report_download_webpage(info['id'])
2751 coursepage = urllib2.urlopen(url).read()
2752 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2753 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
# title falls back to the id when the <h1> heading is absent
2756 m = re.search('<h1>([^<]+)</h1>', coursepage)
2758 info['title'] = unescapeHTML(m.group(1))
2760 info['title'] = info['id']
2761 info['stitle'] = simplify_title(info['title'])
2763 m = re.search('<description>([^<]+)</description>', coursepage)
2765 info['description'] = unescapeHTML(m.group(1))
# build de-duplicated reference entries, one per VideoPage link
2767 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2770 'type': 'reference',
2771 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# recursively resolve each reference through the normal extract() path
2775 for entry in info['list']:
2776 assert entry['type'] == 'reference'
2777 results += self.extract(entry['url'])
# root page: enumerate every CoursePage and recurse the same way
2782 'id': 'Stanford OpenClassroom',
2786 self.report_download_webpage(info['id'])
2787 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2789 rootpage = urllib2.urlopen(rootURL).read()
2790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2791 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2794 info['title'] = info['id']
2795 info['stitle'] = simplify_title(info['title'])
2797 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2800 'type': 'reference',
2801 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2806 for entry in info['list']:
2807 assert entry['type'] == 'reference'
2808 results += self.extract(entry['url'])
2811 class MTVIE(InfoExtractor):
2812 """Information extractor for MTV.com"""
2814 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2817 def report_webpage(self, video_id):
2818 """Report information extraction."""
2819 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2821 def report_extraction(self, video_id):
2822 """Report information extraction."""
2823 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2825 def _real_extract(self, url):
2826 mobj = re.match(self._VALID_URL, url)
2828 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2830 if not mobj.group('proto'):
2831 url = 'http://' + url
2832 video_id = mobj.group('videoid')
2833 self.report_webpage(video_id)
2835 request = urllib2.Request(url)
2837 webpage = urllib2.urlopen(request).read()
2838 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2839 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2842 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2844 self._downloader.trouble(u'ERROR: unable to extract song name')
2846 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2847 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2849 self._downloader.trouble(u'ERROR: unable to extract performer')
2851 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2852 video_title = performer + ' - ' + song_name
2854 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2856 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2858 mtvn_uri = mobj.group(1)
2860 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2862 self._downloader.trouble(u'ERROR: unable to extract content id')
2864 content_id = mobj.group(1)
2866 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2867 self.report_extraction(video_id)
2868 request = urllib2.Request(videogen_url)
2870 metadataXml = urllib2.urlopen(request).read()
2871 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2872 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2875 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2876 renditions = mdoc.findall('.//rendition')
2878 # For now, always pick the highest quality.
2879 rendition = renditions[-1]
2882 _,_,ext = rendition.attrib['type'].partition('/')
2883 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2884 video_url = rendition.find('./src').text
2886 self._downloader.trouble('Invalid rendition field.')
2892 'uploader': performer,
2893 'title': video_title,
2894 'stitle': simplify_title(video_title),