2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader:    Nickname of the video uploader.
    ext:         Video filename extension.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): this excerpt elides a line between the docstring and
        # this call (likely internal state setup) — confirm against the full file.
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the 'def initialize(self):' method header is elided in
    # this excerpt; the docstring and call below belong to that public
    # template method, which delegates to _real_initialize().
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Template method: subclasses implement _real_extract().
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt elides many interior lines (try/except
    # openers, early returns, dict entries). Comments below document only
    # what is visible; gaps are flagged where they affect reading.

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # hl=en/gl=US force an English UI so scraped text (dates, etc.) is predictable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension map (remaining entries and the closing
    # brace are elided in this excerpt).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display-string map (entries elided in this excerpt).
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (SRT) text.

        Accumulates into 'srt' (its initialization and the final return are
        elided in this excerpt).
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'   # default caption length when no dur attribute
            # NOTE(review): a str->float conversion of 'start' appears to be
            # elided here; as shown, 'start + float(dur)' would be a str+float
            # TypeError — confirm against the full file.
            end = start + float(dur)
            # Format as HH:MM:SS,mmm timestamps.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print a human-readable table of available itags."""
        print 'Available formats:'
        # NOTE(review): the 'for x in formats:' loop header is elided in this
        # excerpt; 'x' below is that loop variable.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set the UI language, then (optionally) log in and confirm age."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            # .netrc problems are non-fatal: warn and continue unauthenticated.
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (best-effort; failure only warns)
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed

        # Log in (form construction partially elided in this excerpt)
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # If the login form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age (failure here is fatal, unlike the warnings above)
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the real media URL(s) and metadata for a YouTube URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (has_verified=1 skips the age interstitial)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JSON-escaped URL (e.g. http:\/\/ -> http://).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page, normalized to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions: Trouble is raised/caught locally so subtitle
        # failures only warn instead of aborting the extraction
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
            if not srt_lang_list:
                raise Trouble(u'WARNING: video has no closed captions')
            # Language choice: explicit option > English > first available
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                srt_lang = srt_lang_list[0]
            if not srt_lang in srt_lang_list:
                raise Trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
            srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
        except Trouble as trouble:
            # trouble[0] is the message passed to the Trouble constructor.
            self._downloader.trouble(trouble[0])

        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at format_limit (lists are ordered best-first).
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format (list construction
        # partially elided in this excerpt).
        for format_param, video_real_url in video_url_list:
            video_extension = self._video_extensions.get(format_param, 'flv')

            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            # py2 and/or idiom: u'NA' for RTMP (format_param is None), else the itag.
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this excerpt elides interior lines (try/except openers,
    # returns, form-dict delimiters); gaps are flagged where they matter.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then disable the filter."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age (disclaimer_form construction partially elided)
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the media URL and metadata for a Metacafe watch URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # 'yt-' prefixed ids are YouTube mirrors: delegate to the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = urllib.unquote(mobj.group(1))
        # Extension taken from the last three chars of the media URL.
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob instead
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Un-escape JSON forward slashes.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict (surrounding return/list construction elided):
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): interior lines (guards, try/except openers, returns)
    # are elided in this excerpt.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL and metadata for a Dailymotion URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Cookie disables Dailymotion's family filter for this request.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict (surrounding return/list construction elided):
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): interior lines (guards, try/except openers, returns)
    # are elided in this excerpt.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL and metadata for a Google Video URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback branch: no mp4 download_url, use the flv videoUrl instead.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the \xNN escapes embedded in the page ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info

        # Result dict (surrounding return/list construction elided):
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): interior lines (guards, try/except openers, returns)
    # are elided in this excerpt.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a Photobucket URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title and uploader come from one <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result dict (surrounding return/list construction elided):
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): interior lines (guards, try/except openers, returns)
    # are elided in this excerpt.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata; recurses once for non-/watch/ URLs."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # new_video=False prevents a second rewrite pass.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) captures 'people'/'profile' from the regex;
        # the uploader name is group(2) — looks like a bug, verify.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result dict (surrounding return/list construction elided):
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): duplicate 'thumbnail' key — in a dict literal the
            # later entry wins, so the non-decoded value below overrides the
            # decoded one above. Likely unintended; verify against the full file.
            'thumbnail': video_thumbnail,
# VimeoIE: extracts video metadata from vimeo.com pages by slicing the
# player config JSON out of the page HTML and reading title/owner/thumbnail/
# codec info from it.
# NOTE(review): this block is a line-numbered paste with indentation lost and
# some intermediate lines elided (gaps in the embedded numbering, e.g. the
# `if mobj is None:` guards, `try:` headers and `return` lines are missing).
# Code is left byte-identical; only comments are added.
974 class VimeoIE(InfoExtractor):
975 """Information extractor for vimeo.com."""
977 # _VALID_URL matches Vimeo URLs
978 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
981 def __init__(self, downloader=None):
982 InfoExtractor.__init__(self, downloader)
984 def report_download_webpage(self, video_id):
985 """Report webpage download."""
986 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
988 def report_extraction(self, video_id):
989 """Report information extraction."""
990 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
992 def _real_extract(self, url, new_video=True):
993 # Extract ID from URL
994 mobj = re.match(self._VALID_URL, url)
996 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
999 video_id = mobj.group(1)
1001 # Retrieve video webpage to extract further information
1002 request = urllib2.Request(url, None, std_headers)
1004 self.report_download_webpage(video_id)
1005 webpage = urllib2.urlopen(request).read()
1006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1007 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1010 # Now we begin extracting as much information as we can from what we
1011 # retrieved. First we extract the information common to all extractors,
1012 # and latter we extract those that are Vimeo specific.
1013 self.report_extraction(video_id)
1015 # Extract the config JSON
# The config JSON is cut out of the raw page between ' = {config:' and
# ',assets:' instead of being parsed from markup — brittle, but it tracks
# the page structure Vimeo served at the time.
1016 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1018 config = json.loads(config)
1020 self._downloader.trouble(u'ERROR: unable to extract info section')
1024 video_title = config["video"]["title"]
1027 video_uploader = config["video"]["owner"]["name"]
1029 # Extract video thumbnail
1030 video_thumbnail = config["video"]["thumbnail"]
1032 # Extract video description
1033 video_description = get_element_by_id("description", webpage.decode('utf8'))
1034 if video_description: video_description = clean_html(video_description)
1035 else: video_description = ''
1037 # Extract upload date
1038 video_upload_date = u'NA'
1039 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1040 if mobj is not None:
1041 video_upload_date = mobj.group(1)
1043 # Vimeo specific: extract request signature and timestamp
1044 sig = config['request']['signature']
1045 timestamp = config['request']['timestamp']
1047 # Vimeo specific: extract video codec and quality information
1048 # TODO bind to format param
# Preference order: h264/mp4 first, then vp8 and vp6 flv fallbacks.
1049 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1050 for codec in codecs:
1051 if codec[0] in config["video"]["files"]:
1052 video_codec = codec[0]
1053 video_extension = codec[1]
1054 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1055 else: quality = 'sd'
1058 self._downloader.trouble(u'ERROR: no known codec found')
# Final media URL is built from the signature/timestamp pair extracted above.
1061 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1062 %(video_id, sig, timestamp, quality, video_codec.upper())
1067 'uploader': video_uploader,
1068 'upload_date': video_upload_date,
1069 'title': video_title,
1070 'ext': video_extension,
1071 'thumbnail': video_thumbnail,
1072 'description': video_description,
1077 class GenericIE(InfoExtractor):
1078 """Generic last-resort information extractor."""
1081 IE_NAME = u'generic'
1083 def __init__(self, downloader=None):
1084 InfoExtractor.__init__(self, downloader)
1086 def report_download_webpage(self, video_id):
1087 """Report webpage download."""
1088 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1089 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1091 def report_extraction(self, video_id):
1092 """Report information extraction."""
1093 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1095 def report_following_redirect(self, new_url):
1096 """Report information extraction."""
1097 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1099 def _test_redirect(self, url):
1100 """Check if it is a redirect, like url shorteners, in case restart chain."""
1101 class HeadRequest(urllib2.Request):
1102 def get_method(self):
1105 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1107 Subclass the HTTPRedirectHandler to make it use our
1108 HeadRequest also on the redirected URL
1110 def redirect_request(self, req, fp, code, msg, headers, newurl):
1111 if code in (301, 302, 303, 307):
1112 newurl = newurl.replace(' ', '%20')
1113 newheaders = dict((k,v) for k,v in req.headers.items()
1114 if k.lower() not in ("content-length", "content-type"))
1115 return HeadRequest(newurl,
1117 origin_req_host=req.get_origin_req_host(),
1120 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1122 class HTTPMethodFallback(urllib2.BaseHandler):
1124 Fallback to GET if HEAD is not allowed (405 HTTP error)
1126 def http_error_405(self, req, fp, code, msg, headers):
1130 newheaders = dict((k,v) for k,v in req.headers.items()
1131 if k.lower() not in ("content-length", "content-type"))
1132 return self.parent.open(urllib2.Request(req.get_full_url(),
1134 origin_req_host=req.get_origin_req_host(),
1138 opener = urllib2.OpenerDirector()
1139 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1140 HTTPMethodFallback, HEADRedirectHandler,
1141 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1142 opener.add_handler(handler())
1144 response = opener.open(HeadRequest(url))
1145 new_url = response.geturl()
1147 if url == new_url: return False
1149 self.report_following_redirect(new_url)
1150 self._downloader.download([new_url])
1153 def _real_extract(self, url):
1154 if self._test_redirect(url): return
1156 video_id = url.split('/')[-1]
1157 request = urllib2.Request(url)
1159 self.report_download_webpage(video_id)
1160 webpage = urllib2.urlopen(request).read()
1161 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1162 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1164 except ValueError, err:
1165 # since this is the last-resort InfoExtractor, if
1166 # this error is thrown, it'll be thrown here
1167 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1170 self.report_extraction(video_id)
1171 # Start with something easy: JW Player in SWFObject
1172 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1174 # Broaden the search a little bit
1175 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1177 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1180 # It's possible that one of the regexes
1181 # matched, but returned an empty group:
1182 if mobj.group(1) is None:
1183 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1186 video_url = urllib.unquote(mobj.group(1))
1187 video_id = os.path.basename(video_url)
1189 # here's a fun little line of code for you:
1190 video_extension = os.path.splitext(video_id)[1][1:]
1191 video_id = os.path.splitext(video_id)[0]
1193 # it's tempting to parse this further, but you would
1194 # have to take into account all the variations like
1195 # Video Title - Site Name
1196 # Site Name | Video Title
1197 # Video Title - Tagline | Site Name
1198 # and so on and so forth; it's just not practical
1199 mobj = re.search(r'<title>(.*)</title>', webpage)
1201 self._downloader.trouble(u'ERROR: unable to extract title')
1203 video_title = mobj.group(1).decode('utf-8')
1205 # video uploader is domain name
1206 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1208 self._downloader.trouble(u'ERROR: unable to extract title')
1210 video_uploader = mobj.group(1).decode('utf-8')
1213 'id': video_id.decode('utf-8'),
1214 'url': video_url.decode('utf-8'),
1215 'uploader': video_uploader,
1216 'upload_date': u'NA',
1217 'title': video_title,
1218 'ext': video_extension.decode('utf-8'),
# YoutubeSearchIE: handles `ytsearch[N|all]:query` pseudo-URLs by paging the
# YouTube GData API (50 results per page, capped at 1000) and handing each
# found video id back to the downloader as a watch URL.
# NOTE(review): line-numbered paste; indentation and some intermediate lines
# (prefix parsing, `try:` headers, loop init) were lost in extraction. Code
# kept byte-identical; comments only.
1224 class YoutubeSearchIE(InfoExtractor):
1225 """Information Extractor for YouTube search queries."""
1226 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1227 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1228 _max_youtube_results = 1000
1229 IE_NAME = u'youtube:search'
1231 def __init__(self, downloader=None):
1232 InfoExtractor.__init__(self, downloader)
1234 def report_download_page(self, query, pagenum):
1235 """Report attempt to download playlist page with given number."""
1236 query = query.decode(preferredencoding())
1237 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1239 def _real_extract(self, query):
1240 mobj = re.match(self._VALID_URL, query)
1242 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'ytsearchN' prefix from the actual query text.
1245 prefix, query = query.split(':')
1247 query = query.encode('utf-8')
1249 self._download_n_results(query, 1)
1251 elif prefix == 'all':
1252 self._download_n_results(query, self._max_youtube_results)
1258 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1260 elif n > self._max_youtube_results:
1261 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1262 n = self._max_youtube_results
1263 self._download_n_results(query, n)
1265 except ValueError: # parsing prefix as integer fails
1266 self._download_n_results(query, 1)
1269 def _download_n_results(self, query, n):
1270 """Downloads a specified number of results for a query"""
# Page through the API 50 ids at a time until `limit` (min of requested n
# and the API's reported totalItems) is covered.
1276 while (50 * pagenum) < limit:
1277 self.report_download_page(query, pagenum+1)
1278 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1279 request = urllib2.Request(result_url)
1281 data = urllib2.urlopen(request).read()
1282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1285 api_response = json.loads(data)['data']
1287 new_ids = list(video['id'] for video in api_response['items'])
1288 video_ids += new_ids
1290 limit = min(n, api_response['totalItems'])
1293 if len(video_ids) > n:
1294 video_ids = video_ids[:n]
1295 for id in video_ids:
1296 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# GoogleSearchIE: handles `gvsearch[N|all]:query` pseudo-URLs by scraping
# Google Video search result pages for videoplay docids and enqueueing them.
# NOTE(review): line-numbered paste; indentation and some intermediate lines
# were lost in extraction. Code kept byte-identical; comments only.
1300 class GoogleSearchIE(InfoExtractor):
1301 """Information Extractor for Google Video search queries."""
1302 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1303 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1304 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1305 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1306 _max_google_results = 1000
1307 IE_NAME = u'video.google:search'
1309 def __init__(self, downloader=None):
1310 InfoExtractor.__init__(self, downloader)
1312 def report_download_page(self, query, pagenum):
1313 """Report attempt to download playlist page with given number."""
1314 query = query.decode(preferredencoding())
1315 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1317 def _real_extract(self, query):
1318 mobj = re.match(self._VALID_URL, query)
1320 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'gvsearchN' prefix from the actual query text.
1323 prefix, query = query.split(':')
1325 query = query.encode('utf-8')
1327 self._download_n_results(query, 1)
1329 elif prefix == 'all':
1330 self._download_n_results(query, self._max_google_results)
1336 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1338 elif n > self._max_google_results:
1339 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1340 n = self._max_google_results
1341 self._download_n_results(query, n)
1343 except ValueError: # parsing prefix as integer fails
1344 self._download_n_results(query, 1)
1347 def _download_n_results(self, query, n):
1348 """Downloads a specified number of results for a query"""
1354 self.report_download_page(query, pagenum)
1355 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1356 request = urllib2.Request(result_url)
1358 page = urllib2.urlopen(request).read()
1359 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1360 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1363 # Extract video identifiers
1364 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1365 video_id = mobj.group(1)
1366 if video_id not in video_ids:
1367 video_ids.append(video_id)
1368 if len(video_ids) == n:
1369 # Specified n videos reached
1370 for id in video_ids:
1371 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next page" link means the result set is exhausted: download what we have.
1374 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1375 for id in video_ids:
1376 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1379 pagenum = pagenum + 1
# YahooSearchIE: handles `yvsearch[N|all]:query` pseudo-URLs by scraping
# Yahoo! Video search result pages for watch ids; uses an `already_seen` set
# for de-duplication across pages.
# NOTE(review): line-numbered paste; indentation and some intermediate lines
# were lost in extraction. Code kept byte-identical; comments only.
1382 class YahooSearchIE(InfoExtractor):
1383 """Information Extractor for Yahoo! Video search queries."""
1384 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1385 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1386 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1387 _MORE_PAGES_INDICATOR = r'\s*Next'
1388 _max_yahoo_results = 1000
1389 IE_NAME = u'video.yahoo:search'
1391 def __init__(self, downloader=None):
1392 InfoExtractor.__init__(self, downloader)
1394 def report_download_page(self, query, pagenum):
1395 """Report attempt to download playlist page with given number."""
1396 query = query.decode(preferredencoding())
1397 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1399 def _real_extract(self, query):
1400 mobj = re.match(self._VALID_URL, query)
1402 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'yvsearchN' prefix from the actual query text.
1405 prefix, query = query.split(':')
1407 query = query.encode('utf-8')
1409 self._download_n_results(query, 1)
1411 elif prefix == 'all':
1412 self._download_n_results(query, self._max_yahoo_results)
1418 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1420 elif n > self._max_yahoo_results:
1421 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1422 n = self._max_yahoo_results
1423 self._download_n_results(query, n)
1425 except ValueError: # parsing prefix as integer fails
1426 self._download_n_results(query, 1)
1429 def _download_n_results(self, query, n):
1430 """Downloads a specified number of results for a query"""
1433 already_seen = set()
1437 self.report_download_page(query, pagenum)
1438 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1439 request = urllib2.Request(result_url)
1441 page = urllib2.urlopen(request).read()
1442 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1443 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1446 # Extract video identifiers
1447 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1448 video_id = mobj.group(1)
1449 if video_id not in already_seen:
1450 video_ids.append(video_id)
1451 already_seen.add(video_id)
1452 if len(video_ids) == n:
1453 # Specified n videos reached
1454 for id in video_ids:
1455 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "next page" link means the result set is exhausted: download what we have.
1458 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1459 for id in video_ids:
1460 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1463 pagenum = pagenum + 1
# YoutubePlaylistIE: expands a YouTube playlist/artist/course URL into
# individual watch URLs by scraping the paginated playlist pages, honoring
# the downloader's playliststart/playlistend slicing parameters.
# NOTE(review): line-numbered paste; indentation and some intermediate lines
# (e.g. `try:` headers, `return` lines, loop init) were lost in extraction.
# Code kept byte-identical; comments only.
1466 class YoutubePlaylistIE(InfoExtractor):
1467 """Information Extractor for YouTube playlists."""
1469 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1470 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1471 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
1472 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1473 IE_NAME = u'youtube:playlist'
1475 def __init__(self, downloader=None):
1476 InfoExtractor.__init__(self, downloader)
1478 def report_download_page(self, playlist_id, pagenum):
1479 """Report attempt to download playlist page with given number."""
1480 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1482 def _real_extract(self, url):
1483 # Extract playlist id
1484 mobj = re.match(self._VALID_URL, url)
1486 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 captures a single-video component of the URL: delegate directly.
1490 if mobj.group(3) is not None:
1491 self._downloader.download([mobj.group(3)])
1494 # Download playlist pages
1495 # prefix is 'p' as default for playlists but there are other types that need extra care
1496 playlist_prefix = mobj.group(1)
1497 if playlist_prefix == 'a':
1498 playlist_access = 'artist'
1500 playlist_prefix = 'p'
1501 playlist_access = 'view_play_list'
1502 playlist_id = mobj.group(2)
1507 self.report_download_page(playlist_id, pagenum)
1508 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1509 request = urllib2.Request(url)
1511 page = urllib2.urlopen(request).read()
1512 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1513 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1516 # Extract video identifiers
1518 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1519 if mobj.group(1) not in ids_in_page:
1520 ids_in_page.append(mobj.group(1))
1521 video_ids.extend(ids_in_page)
1523 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1525 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end slicing (1-based start, -1 = open end).
1527 playliststart = self._downloader.params.get('playliststart', 1) - 1
1528 playlistend = self._downloader.params.get('playlistend', -1)
1529 if playlistend == -1:
1530 video_ids = video_ids[playliststart:]
1532 video_ids = video_ids[playliststart:playlistend]
1534 for id in video_ids:
1535 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# YoutubeUserIE: expands a YouTube user/channel URL (or `ytuser:` prefix)
# into watch URLs by paging the GData uploads feed 50 ids at a time, then
# applies playliststart/playlistend slicing.
# NOTE(review): line-numbered paste; indentation and some intermediate lines
# were lost in extraction. Code kept byte-identical; comments only.
1539 class YoutubeUserIE(InfoExtractor):
1540 """Information Extractor for YouTube users."""
1542 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1543 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1544 _GDATA_PAGE_SIZE = 50
1545 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1546 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1547 IE_NAME = u'youtube:user'
1549 def __init__(self, downloader=None):
1550 InfoExtractor.__init__(self, downloader)
1552 def report_download_page(self, username, start_index):
1553 """Report attempt to download user page."""
1554 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1555 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1557 def _real_extract(self, url):
1559 mobj = re.match(self._VALID_URL, url)
1561 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1564 username = mobj.group(1)
1566 # Download video ids using YouTube Data API. Result size per
1567 # query is limited (currently to 50 videos) so we need to query
1568 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1 below.
1575 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1576 self.report_download_page(username, start_index)
1578 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1581 page = urllib2.urlopen(request).read()
1582 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1583 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1586 # Extract video identifiers
1589 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1590 if mobj.group(1) not in ids_in_page:
1591 ids_in_page.append(mobj.group(1))
1593 video_ids.extend(ids_in_page)
1595 # A little optimization - if current page is not
1596 # "full", ie. does not contain PAGE_SIZE video ids then
1597 # we can assume that this page is the last one - there
1598 # are no more ids on further pages - no need to query
1601 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1606 all_ids_count = len(video_ids)
# Apply --playlist-start/--playlist-end slicing (1-based start, -1 = open end).
1607 playliststart = self._downloader.params.get('playliststart', 1) - 1
1608 playlistend = self._downloader.params.get('playlistend', -1)
1610 if playlistend == -1:
1611 video_ids = video_ids[playliststart:]
1613 video_ids = video_ids[playliststart:playlistend]
1615 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1616 (username, all_ids_count, len(video_ids)))
1618 for video_id in video_ids:
1619 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# DepositFilesIE: extracts a direct file URL from depositfiles.com by POSTing
# the 'Free download' form and scraping the fileshare form action; on failure
# it tries to surface the site's restriction message.
# NOTE(review): line-numbered paste; indentation and some intermediate lines
# (`try:` headers, `return` lines) were lost in extraction. Code kept
# byte-identical; comments only.
1622 class DepositFilesIE(InfoExtractor):
1623 """Information extractor for depositfiles.com"""
1625 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1626 IE_NAME = u'DepositFiles'
1628 def __init__(self, downloader=None):
1629 InfoExtractor.__init__(self, downloader)
1631 def report_download_webpage(self, file_id):
1632 """Report webpage download."""
1633 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1635 def report_extraction(self, file_id):
1636 """Report information extraction."""
1637 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1639 def _real_extract(self, url):
1640 file_id = url.split('/')[-1]
1641 # Rebuild url in english locale
1642 url = 'http://depositfiles.com/en/files/' + file_id
1644 # Retrieve file webpage with 'Free download' button pressed
1645 free_download_indication = { 'gateway_result' : '1' }
1646 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1648 self.report_download_webpage(file_id)
1649 webpage = urllib2.urlopen(request).read()
1650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1651 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1654 # Search for the real file URL
1655 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1656 if (mobj is None) or (mobj.group(1) is None):
1657 # Try to figure out reason of the error.
1658 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1659 if (mobj is not None) and (mobj.group(1) is not None):
1660 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1661 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1663 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1666 file_url = mobj.group(1)
1667 file_extension = os.path.splitext(file_url)[1][1:]
1669 # Search for file title
1670 mobj = re.search(r'<b title="(.*?)">', webpage)
1672 self._downloader.trouble(u'ERROR: unable to extract title')
1674 file_title = mobj.group(1).decode('utf-8')
1677 'id': file_id.decode('utf-8'),
1678 'url': file_url.decode('utf-8'),
1680 'upload_date': u'NA',
1681 'title': file_title,
1682 'ext': file_extension.decode('utf-8'),
# FacebookIE: logs in (credentials or .netrc), downloads the video page, and
# parses title/owner/thumbnail plus per-format media URLs out of escaped
# JavaScript segments; format selection mirrors the YouTube extractor
# (format/format_limit params, 'worst', '-1' = all).
# NOTE(review): line-numbered paste; indentation and many intermediate lines
# (`if`/`try:` headers, `return` lines, literal dict bodies) were lost in
# extraction. Code kept byte-identical; comments only.
1688 class FacebookIE(InfoExtractor):
1689 """Information Extractor for Facebook"""
1691 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1692 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1693 _NETRC_MACHINE = 'facebook'
1694 _available_formats = ['video', 'highqual', 'lowqual']
1695 _video_extensions = {
1700 IE_NAME = u'facebook'
1702 def __init__(self, downloader=None):
1703 InfoExtractor.__init__(self, downloader)
1705 def _reporter(self, message):
1706 """Add header and report message."""
1707 self._downloader.to_screen(u'[facebook] %s' % message)
1709 def report_login(self):
1710 """Report attempt to log in."""
1711 self._reporter(u'Logging in')
1713 def report_video_webpage_download(self, video_id):
1714 """Report attempt to download video webpage."""
1715 self._reporter(u'%s: Downloading video webpage' % video_id)
1717 def report_information_extraction(self, video_id):
1718 """Report attempt to extract video information."""
1719 self._reporter(u'%s: Extracting video information' % video_id)
1721 def _parse_page(self, video_webpage):
1722 """Extract video information from page"""
# Each field is pulled from an escaped JS call in the page source.
1724 data = {'title': r'\("video_title", "(.*?)"\)',
1725 'description': r'<div class="datawrap">(.*?)</div>',
1726 'owner': r'\("video_owner_name", "(.*?)"\)',
1727 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1730 for piece in data.keys():
1731 mobj = re.search(data[piece], video_webpage)
1732 if mobj is not None:
1733 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1737 for fmt in self._available_formats:
1738 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1739 if mobj is not None:
1740 # URL is in a Javascript segment inside an escaped Unicode format within
1741 # the generally utf-8 page
1742 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1743 video_info['video_urls'] = video_urls
1747 def _real_initialize(self):
1748 if self._downloader is None:
1753 downloader_params = self._downloader.params
1755 # Attempt to use provided username and password or .netrc data
1756 if downloader_params.get('username', None) is not None:
1757 useremail = downloader_params['username']
1758 password = downloader_params['password']
1759 elif downloader_params.get('usenetrc', False):
1761 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1762 if info is not None:
1766 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1767 except (IOError, netrc.NetrcParseError), err:
1768 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1771 if useremail is None:
1780 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1783 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication did not succeed.
1784 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1785 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1787 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1788 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1791 def _real_extract(self, url):
1792 mobj = re.match(self._VALID_URL, url)
1794 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1796 video_id = mobj.group('ID')
1799 self.report_video_webpage_download(video_id)
1800 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1802 page = urllib2.urlopen(request)
1803 video_webpage = page.read()
1804 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1805 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1808 # Start extracting information
1809 self.report_information_extraction(video_id)
1811 # Extract information
1812 video_info = self._parse_page(video_webpage)
1815 if 'owner' not in video_info:
1816 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1818 video_uploader = video_info['owner']
1821 if 'title' not in video_info:
1822 self._downloader.trouble(u'ERROR: unable to extract video title')
1824 video_title = video_info['title']
1825 video_title = video_title.decode('utf-8')
1828 if 'thumbnail' not in video_info:
1829 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1830 video_thumbnail = ''
1832 video_thumbnail = video_info['thumbnail']
1836 if 'upload_date' in video_info:
1837 upload_time = video_info['upload_date']
1838 timetuple = email.utils.parsedate_tz(upload_time)
1839 if timetuple is not None:
1841 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1846 video_description = video_info.get('description', 'No description available.')
1848 url_map = video_info['video_urls']
1849 if len(url_map.keys()) > 0:
1850 # Decide which formats to download
1851 req_format = self._downloader.params.get('format', None)
1852 format_limit = self._downloader.params.get('format_limit', None)
1854 if format_limit is not None and format_limit in self._available_formats:
1855 format_list = self._available_formats[self._available_formats.index(format_limit):]
1857 format_list = self._available_formats
1858 existing_formats = [x for x in format_list if x in url_map]
1859 if len(existing_formats) == 0:
1860 self._downloader.trouble(u'ERROR: no known formats available for video')
1862 if req_format is None:
1863 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1864 elif req_format == 'worst':
1865 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1866 elif req_format == '-1':
1867 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1870 if req_format not in url_map:
1871 self._downloader.trouble(u'ERROR: requested format not available')
1873 video_url_list = [(req_format, url_map[req_format])] # Specific format
1876 for format_param, video_real_url in video_url_list:
1878 video_extension = self._video_extensions.get(format_param, 'mp4')
1881 'id': video_id.decode('utf-8'),
1882 'url': video_real_url.decode('utf-8'),
1883 'uploader': video_uploader.decode('utf-8'),
1884 'upload_date': upload_date,
1885 'title': video_title,
1886 'ext': video_extension.decode('utf-8'),
1887 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1888 'thumbnail': video_thumbnail.decode('utf-8'),
1889 'description': video_description.decode('utf-8'),
# BlipTVIE: fetches blip.tv metadata via the site's JSON API
# (skin=json&version=2&no_wrap=1); if the URL turns out to serve the media
# file directly (Content-Type video/*), it short-circuits to a direct
# download instead of parsing JSON.
# NOTE(review): line-numbered paste; indentation and some intermediate lines
# (`try:` headers, the direct-download `info` dict body, `return` lines)
# were lost in extraction. Code kept byte-identical; comments only.
1894 class BlipTVIE(InfoExtractor):
1895 """Information extractor for blip.tv"""
1897 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1898 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1899 IE_NAME = u'blip.tv'
1901 def report_extraction(self, file_id):
1902 """Report information extraction."""
1903 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1905 def report_direct_download(self, title):
1906 """Report information extraction."""
1907 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1909 def _real_extract(self, url):
1910 mobj = re.match(self._VALID_URL, url)
1912 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar (set on an elided line) joins the JSON-API query onto the URL.
1919 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1920 request = urllib2.Request(json_url)
1921 self.report_extraction(mobj.group(1))
1924 urlh = urllib2.urlopen(request)
1925 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1926 basename = url.split('/')[-1]
1927 title,ext = os.path.splitext(basename)
1928 title = title.decode('UTF-8')
1929 ext = ext.replace('.', '')
1930 self.report_direct_download(title)
1938 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1939 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1941 if info is None: # Regular URL
1943 json_code = urlh.read()
1944 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1945 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1949 json_data = json.loads(json_code)
1950 if 'Post' in json_data:
1951 data = json_data['Post']
# blip.tv datestamps look like '10-17-11 04:30PM'; normalized to YYYYMMDD.
1955 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1956 video_url = data['media']['url']
1957 umobj = re.match(self._URL_EXT, video_url)
1959 raise ValueError('Can not determine filename extension')
1960 ext = umobj.group(1)
1963 'id': data['item_id'],
1965 'uploader': data['display_name'],
1966 'upload_date': upload_date,
1967 'title': data['title'],
1969 'format': data['media']['mimeType'],
1970 'thumbnail': data['thumbnailUrl'],
1971 'description': data['description'],
1972 'player_url': data['embedUrl']
1974 except (ValueError,KeyError), err:
1975 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
1981 class MyVideoIE(InfoExtractor):
1982 """Information Extractor for myvideo.de."""
1984 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1985 IE_NAME = u'myvideo'
1987 def __init__(self, downloader=None):
1988 InfoExtractor.__init__(self, downloader)
1990 def report_download_webpage(self, video_id):
1991 """Report webpage download."""
1992 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1994 def report_extraction(self, video_id):
1995 """Report information extraction."""
1996 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
1998 def _real_extract(self,url):
1999 mobj = re.match(self._VALID_URL, url)
2001 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2004 video_id = mobj.group(1)
2007 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2009 self.report_download_webpage(video_id)
2010 webpage = urllib2.urlopen(request).read()
2011 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2012 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2015 self.report_extraction(video_id)
2016 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2019 self._downloader.trouble(u'ERROR: unable to extract media URL')
2021 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2023 mobj = re.search('<title>([^<]+)</title>', webpage)
2025 self._downloader.trouble(u'ERROR: unable to extract title')
2028 video_title = mobj.group(1)
2034 'upload_date': u'NA',
2035 'title': video_title,
# Extractor for The Daily Show / Colbert Report full episodes. Accepts either
# a ':shortname' form (expanded to the show's full-episodes page) or a direct
# thedailyshow.com / colbertnation.com URL, follows the redirect to a specific
# episode, then walks an MRSS index feed and a per-media mediaGen config XML
# to collect renditions.
# NOTE(review): this excerpt is elided (gaps in the embedded numbering);
# 'try:' headers, 'if mobj is None:' guards, 'return' statements and parts of
# the result dict are not visible here.
2041 class ComedyCentralIE(InfoExtractor):
2042 """Information extractor for The Daily Show and Colbert Report """
2044 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2045 IE_NAME = u'comedycentral'
2047 def report_extraction(self, episode_id):
2048 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2050 def report_config_download(self, episode_id):
2051 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2053 def report_index_download(self, episode_id):
2054 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2056 def report_player_url(self, episode_id):
2057 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2059 def _real_extract(self, url):
2060 mobj = re.match(self._VALID_URL, url)
2062 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname forms (':tds', ':colbert', ...) are rewritten to the show's
# full-episodes landing page and re-matched against _VALID_URL.
2065 if mobj.group('shortname'):
2066 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2067 url = u'http://www.thedailyshow.com/full-episodes/'
2069 url = u'http://www.colbertnation.com/full-episodes/'
2070 mobj = re.match(self._VALID_URL, url)
2071 assert mobj is not None
# No episode component means "download the newest episode": fetch the
# landing page and rely on the server redirect to a concrete episode URL.
2073 dlNewest = not mobj.group('episode')
2075 epTitle = mobj.group('showname')
2077 epTitle = mobj.group('episode')
2079 req = urllib2.Request(url)
2080 self.report_extraction(epTitle)
2082 htmlHandle = urllib2.urlopen(req)
2083 html = htmlHandle.read()
2084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# geturl() yields the post-redirect URL; it must now name a specific episode.
2088 url = htmlHandle.geturl()
2089 mobj = re.match(self._VALID_URL, url)
2091 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2093 if mobj.group('episode') == '':
2094 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2096 epTitle = mobj.group('episode')
# Locate the Flash player URL and the mtvnservices media URI inside the page.
2098 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2099 if len(mMovieParams) == 0:
2100 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2103 playerUrl_raw = mMovieParams[0][0]
2104 self.report_player_url(epTitle)
2106 urlHandle = urllib2.urlopen(playerUrl_raw)
2107 playerUrl = urlHandle.geturl()
2108 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2109 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2112 uri = mMovieParams[0][1]
2113 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2114 self.report_index_download(epTitle)
2116 indexXml = urllib2.urlopen(indexUrl).read()
2117 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2118 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One MRSS <item> per media segment; the guid is a ':'-separated media id.
2123 idoc = xml.etree.ElementTree.fromstring(indexXml)
2124 itemEls = idoc.findall('.//item')
2125 for itemEl in itemEls:
2126 mediaId = itemEl.findall('./guid')[0].text
2127 shortMediaId = mediaId.split(':')[-1]
2128 showId = mediaId.split(':')[-2].replace('.com', '')
2129 officialTitle = itemEl.findall('./title')[0].text
2130 officialDate = itemEl.findall('./pubDate')[0].text
2132 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2133 urllib.urlencode({'uri': mediaId}))
2134 configReq = urllib2.Request(configUrl)
2135 self.report_config_download(epTitle)
2137 configXml = urllib2.urlopen(configReq).read()
2138 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2139 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Collect (bitrate, src) pairs from the mediaGen config; 'turls' is built in
# elided lines -- presumably sorted so the last entry has the highest bitrate.
2142 cdoc = xml.etree.ElementTree.fromstring(configXml)
2144 for rendition in cdoc.findall('.//rendition'):
2145 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2149 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2152 # For now, just pick the highest bitrate
2153 format,video_url = turls[-1]
2155 effTitle = showId + u'-' + epTitle
2160 'upload_date': officialDate,
2165 'description': officialTitle,
2166 'player_url': playerUrl
2169 results.append(info)
# Extractor for escapistmagazine.com videos: scrapes description/thumbnail/
# player URL from og: meta tags, pulls the 'config=' query from the player
# URL, downloads that JS-style config, normalizes it into JSON, and reads the
# media URL from its playlist.
# NOTE(review): this excerpt is elided (gaps in the embedded numbering);
# 'try:' headers, 'if mobj is None:' guards, 'return' statements and part of
# the result dict are not visible here.
2174 class EscapistIE(InfoExtractor):
2175 """Information extractor for The Escapist """
2177 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2178 IE_NAME = u'escapist'
2180 def report_extraction(self, showName):
2181 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2183 def report_config_download(self, showName):
2184 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2186 def _real_extract(self, url):
2187 mobj = re.match(self._VALID_URL, url)
2189 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2191 showName = mobj.group('showname')
2192 videoId = mobj.group('episode')
2194 self.report_extraction(showName)
2196 webPageBytes = urllib2.urlopen(url).read()
2197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2198 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2201 webPage = webPageBytes.decode('utf-8')
# Page metadata comes from <meta> tags; HTML entities are unescaped.
2202 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2203 description = unescapeHTML(descMatch.group(1))
2204 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2205 imgUrl = unescapeHTML(imgMatch.group(1))
2206 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2207 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries the (percent-encoded) config URL in its query string.
2208 configUrlMatch = re.search('config=(.*)$', playerUrl)
2209 configUrl = urllib2.unquote(configUrlMatch.group(1))
2211 self.report_config_download(showName)
2213 configJSON = urllib2.urlopen(configUrl).read()
2214 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2215 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2218 # Technically, it's JavaScript, not JSON
# Crude JS->JSON normalization: swap single quotes for double quotes so the
# strict JSON parser accepts the config.
2219 configJSON = configJSON.replace("'", '"')
2222 config = json.loads(configJSON)
2223 except (ValueError,), err:
2224 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# Playlist entry 1 (not 0) holds the actual media URL.
2227 playlist = config['playlist']
2228 videoUrl = playlist[1]['url']
2233 'uploader': showName,
2234 'upload_date': None,
2238 'thumbnail': imgUrl,
2239 'description': description,
2240 'player_url': playerUrl,
# Extractor for collegehumor.com: reads the internal video id from the watch
# page, then fetches the 'moogaloop' metadata XML that carries title,
# description, media URL and thumbnail.
# NOTE(review): this excerpt is elided (gaps in the embedded numbering);
# 'try:' headers, 'if mobj is None:' / 'if m is None:' guards, the initial
# 'info = {...}' literal and 'return' statements are not visible here.
2246 class CollegeHumorIE(InfoExtractor):
2247 """Information extractor for collegehumor.com"""
2249 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2250 IE_NAME = u'collegehumor'
2252 def report_webpage(self, video_id):
2253 """Report information extraction."""
2254 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2256 def report_extraction(self, video_id):
2257 """Report information extraction."""
2258 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2260 def _real_extract(self, url):
2261 mobj = re.match(self._VALID_URL, url)
2263 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2265 video_id = mobj.group('videoid')
2267 self.report_webpage(video_id)
2268 request = urllib2.Request(url)
2270 webpage = urllib2.urlopen(request).read()
2271 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2272 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page exposes a second, internal id used by the metadata endpoint.
2275 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2277 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2279 internal_video_id = m.group('internalvideoid')
2283 'internal_id': internal_video_id,
2286 self.report_extraction(video_id)
2287 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2289 metaXml = urllib2.urlopen(xmlUrl).read()
2290 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2291 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Parse the metadata XML; a missing element raises (IndexError from [0])
# and is reported as invalid metadata in the elided except clause.
2294 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2296 videoNode = mdoc.findall('./video')[0]
2297 info['description'] = videoNode.findall('./description')[0].text
2298 info['title'] = videoNode.findall('./caption')[0].text
2299 info['url'] = videoNode.findall('./file')[0].text
2300 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# File extension is whatever follows the last '.' of the media URL.
2301 info['ext'] = info['url'].rpartition('.')[2]
2302 info['format'] = info['ext']
2304 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Extractor for xvideos.com: scrapes the percent-encoded flv_url, the <title>
# text and the thumbnail URL directly out of the watch page.
# NOTE(review): this excerpt is elided (gaps in the embedded numbering);
# 'try:' headers, 'if mobj is None:' guards, 'return' statements and parts of
# the result dict are not visible here.
2310 class XVideosIE(InfoExtractor):
2311 """Information extractor for xvideos.com"""
2313 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2314 IE_NAME = u'xvideos'
2316 def report_webpage(self, video_id):
2317 """Report information extraction."""
2318 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2320 def report_extraction(self, video_id):
2321 """Report information extraction."""
2322 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2324 def _real_extract(self, url):
2325 mobj = re.match(self._VALID_URL, url)
2327 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Python 2 str.decode: normalize the captured byte-string id to unicode.
2329 video_id = mobj.group(1).decode('utf-8')
2331 self.report_webpage(video_id)
2333 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2335 webpage = urllib2.urlopen(request).read()
2336 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2337 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2340 self.report_extraction(video_id)
# The media URL is percent-encoded inside a 'flv_url=' page parameter.
2344 mobj = re.search(r'flv_url=(.+?)&', webpage)
2346 self._downloader.trouble(u'ERROR: unable to extract video url')
2348 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2352 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2354 self._downloader.trouble(u'ERROR: unable to extract video title')
2356 video_title = mobj.group(1).decode('utf-8')
2359 # Extract video thumbnail
2360 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2362 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2364 video_thumbnail = mobj.group(1).decode('utf-8')
2370 'upload_date': None,
2371 'title': video_title,
2374 'thumbnail': video_thumbnail,
2375 'description': None,
# Extractor for soundcloud.com tracks: scrapes the track uid and stream token
# from the page source, builds the media.soundcloud.com stream URL from them,
# and scrapes title/description/upload date from the page markup.
# NOTE(review): this excerpt is elided (gaps in the embedded numbering);
# 'try:' headers, 'if mobj is None:' guards, 'return' statements and part of
# the result dict are not visible here.
2382 class SoundcloudIE(InfoExtractor):
2383 """Information extractor for soundcloud.com
2384 To access the media, the uid of the song and a stream token
2385 must be extracted from the page source and the script must make
2386 a request to media.soundcloud.com/crossdomain.xml. Then
2387 the media can be grabbed by requesting from an url composed
2388 of the stream token and uid
2391 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2392 IE_NAME = u'soundcloud'
2394 def __init__(self, downloader=None):
2395 InfoExtractor.__init__(self, downloader)
2397 def report_webpage(self, video_id):
2398 """Report information extraction."""
2399 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2401 def report_extraction(self, video_id):
2402 """Report information extraction."""
2403 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2405 def _real_extract(self, url):
2406 mobj = re.match(self._VALID_URL, url)
2408 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2411 # extract uploader (which is in the url)
2412 uploader = mobj.group(1).decode('utf-8')
2413 # extract simple title (uploader + slug of song title)
2414 slug_title = mobj.group(2).decode('utf-8')
2415 simple_title = uploader + u'-' + slug_title
2417 self.report_webpage('%s/%s' % (uploader, slug_title))
2419 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2421 webpage = urllib2.urlopen(request).read()
2422 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2423 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2426 self.report_extraction('%s/%s' % (uploader, slug_title))
2428 # extract uid and stream token that soundcloud hands out for access
2429 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2431 video_id = mobj.group(1)
2432 stream_token = mobj.group(2)
2434 # extract unsimplified title
2435 mobj = re.search('"title":"(.*?)",', webpage)
2437 title = mobj.group(1).decode('utf-8')
# Fall back to the URL-derived 'uploader-slug' title when none is found.
2439 title = simple_title
2441 # construct media url (with uid/token)
2442 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2443 mediaURL = mediaURL % (video_id, stream_token)
2446 description = u'No description available'
2447 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2449 description = mobj.group(1)
# Upload date is scraped as e.g. 'November 3, 2010 12:34' and reformatted
# to YYYYMMDD; any parse failure is only logged to stderr, not fatal.
2453 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2456 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2457 except Exception, e:
2458 self._downloader.to_stderr(str(e))
2460 # for soundcloud, a request to a cross domain is required for cookies
2461 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2464 'id': video_id.decode('utf-8'),
2466 'uploader': uploader.decode('utf-8'),
2467 'upload_date': upload_date,
2472 'description': description.decode('utf-8')
# Extractor for infoq.com presentations: decodes the base64 'jsclassref' page
# attribute into an RTMPE stream path and scrapes title/description from the
# page source.
# NOTE(review): this excerpt is elided (gaps in the embedded numbering);
# 'try:' headers, 'if mobj is None:' guards, 'return' statements, the IE_NAME
# assignment and parts of the result dict are not visible here.
2476 class InfoQIE(InfoExtractor):
2477 """Information extractor for infoq.com"""
2479 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2482 def report_webpage(self, video_id):
2483 """Report information extraction."""
2484 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2486 def report_extraction(self, video_id):
2487 """Report information extraction."""
2488 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2490 def _real_extract(self, url):
2491 mobj = re.match(self._VALID_URL, url)
2493 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2496 self.report_webpage(url)
2498 request = urllib2.Request(url)
2500 webpage = urllib2.urlopen(request).read()
2501 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2502 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2505 self.report_extraction(url)
2509 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2511 self._downloader.trouble(u'ERROR: unable to extract video url')
# Python 2 idiom: .decode('base64') base64-decodes the captured byte string,
# then the result is percent-unquoted into the RTMPE stream path.
2513 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2517 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2519 self._downloader.trouble(u'ERROR: unable to extract video title')
2521 video_title = mobj.group(1).decode('utf-8')
2523 # Extract description
2524 video_description = u'No description available.'
2525 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2526 if mobj is not None:
2527 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the stream path's final 'name.ext' component;
# a filename with more than one '.' would make this split raise ValueError.
2529 video_filename = video_url.split('/')[-1]
2530 video_id, extension = video_filename.split('.')
2536 'upload_date': None,
2537 'title': video_title,
2539 'format': extension, # Extension is always(?) mp4, but seems to be flv
2541 'description': video_description,
# Extractor for mixcloud.com cloudcasts: fetches the JSON API record for a
# cloudcast, picks a format/bitrate from its 'audio_formats' section (or lists
# formats on request), probes candidate URLs for the first live one, and
# returns the file info.
# NOTE(review): this excerpt is elided (gaps in the embedded numbering);
# 'try:' headers, 'if mobj is None:' guards, 'return' statements and loop
# break/continue lines are not visible here.
2547 class MixcloudIE(InfoExtractor):
2548 """Information extractor for www.mixcloud.com"""
2549 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2550 IE_NAME = u'mixcloud'
2552 def __init__(self, downloader=None):
2553 InfoExtractor.__init__(self, downloader)
2555 def report_download_json(self, file_id):
2556 """Report JSON download."""
2557 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2559 def report_extraction(self, file_id):
2560 """Report information extraction."""
2561 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2563 def get_urls(self, jsonData, fmt, bitrate='best'):
2564 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: urls} mapping or a plain url list;
# the TypeError fallback handles the latter (no bitrate info).
2567 bitrate_list = jsonData[fmt]
2568 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2569 bitrate = max(bitrate_list) # select highest
2571 url_list = jsonData[fmt][bitrate]
2572 except TypeError: # we have no bitrate info.
2573 url_list = jsonData[fmt]
2576 def check_urls(self, url_list):
2577 """Returns 1st active url from list"""
# Probe each candidate with a GET; network errors mean "try the next one".
2578 for url in url_list:
2580 urllib2.urlopen(url)
2582 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2587 def _print_formats(self, formats):
2588 print 'Available formats:'
2589 for fmt in formats.keys():
2590 for b in formats[fmt]:
2592 ext = formats[fmt][b][0]
2593 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2594 except TypeError: # we have no bitrate info
2595 ext = formats[fmt][0]
2596 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2599 def _real_extract(self, url):
2600 mobj = re.match(self._VALID_URL, url)
2602 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2604 # extract uploader & filename from url
2605 uploader = mobj.group(1).decode('utf-8')
2606 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2608 # construct API request
# The API path reuses the last two URL path components: /api/1/cloudcast/<uploader>/<slug>.json
2609 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2610 # retrieve .json file with links to files
2611 request = urllib2.Request(file_url)
2613 self.report_download_json(file_url)
2614 jsonData = urllib2.urlopen(request).read()
2615 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2616 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2620 json_data = json.loads(jsonData)
2621 player_url = json_data['player_swf_url']
2622 formats = dict(json_data['audio_formats'])
2624 req_format = self._downloader.params.get('format', None)
2627 if self._downloader.params.get('listformats', None):
2628 self._print_formats(formats)
# 'best' (or no preference): scan all formats for the first live URL;
# otherwise require the exact requested format to exist.
2631 if req_format is None or req_format == 'best':
2632 for format_param in formats.keys():
2633 url_list = self.get_urls(formats, format_param)
2635 file_url = self.check_urls(url_list)
2636 if file_url is not None:
2639 if req_format not in formats.keys():
2640 self._downloader.trouble(u'ERROR: format is not available')
2643 url_list = self.get_urls(formats, req_format)
2644 file_url = self.check_urls(url_list)
2645 format_param = req_format
2648 'id': file_id.decode('utf-8'),
2649 'url': file_url.decode('utf-8'),
2650 'uploader': uploader.decode('utf-8'),
2651 'upload_date': u'NA',
2652 'title': json_data['name'],
2653 'ext': file_url.split('.')[-1].decode('utf-8'),
2654 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2655 'thumbnail': json_data['thumbnail_url'],
2656 'description': json_data['description'],
2657 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open Classroom. Three URL shapes, handled in order:
# a specific video (course+video -> per-video metadata XML), a course page
# (scrape links to its VideoPage entries), or the site root (scrape links to
# all CoursePage entries). Course/root pages recurse via self.extract() on
# 'reference' entries.
# NOTE(review): this excerpt is elided (gaps in the embedded numbering);
# 'try:' headers, several dict literals ('info = {...}'), 'return' statements
# and loop headers over 'links' are not visible here.
2660 class StanfordOpenClassroomIE(InfoExtractor):
2661 """Information extractor for Stanford's Open ClassRoom"""
2663 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2664 IE_NAME = u'stanfordoc'
2666 def report_download_webpage(self, objid):
2667 """Report information extraction."""
2668 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2670 def report_extraction(self, video_id):
2671 """Report information extraction."""
2672 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2674 def _real_extract(self, url):
2675 mobj = re.match(self._VALID_URL, url)
2677 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2680 if mobj.group('course') and mobj.group('video'): # A specific video
2681 course = mobj.group('course')
2682 video = mobj.group('video')
2684 'id': course + '_' + video,
2687 self.report_extraction(info['id'])
# Per-video metadata lives at .../courses/<course>/videos/<video>.xml
2688 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2689 xmlUrl = baseUrl + video + '.xml'
2691 metaXml = urllib2.urlopen(xmlUrl).read()
2692 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2693 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2695 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2697 info['title'] = mdoc.findall('./title')[0].text
2698 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2700 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2702 info['ext'] = info['url'].rpartition('.')[2]
2703 info['format'] = info['ext']
2705 elif mobj.group('course'): # A course page
2706 course = mobj.group('course')
2712 self.report_download_webpage(info['id'])
2714 coursepage = urllib2.urlopen(url).read()
2715 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2716 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2719 m = re.search('<h1>([^<]+)</h1>', coursepage)
2721 info['title'] = unescapeHTML(m.group(1))
2723 info['title'] = info['id']
2725 m = re.search('<description>([^<]+)</description>', coursepage)
2727 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps first occurrence of each VideoPage link, in page order.
2729 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2732 'type': 'reference',
2733 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse into each referenced video page and accumulate its results.
2737 for entry in info['list']:
2738 assert entry['type'] == 'reference'
2739 results += self.extract(entry['url'])
2744 'id': 'Stanford OpenClassroom',
2748 self.report_download_webpage(info['id'])
2749 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2751 rootpage = urllib2.urlopen(rootURL).read()
2752 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2753 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2756 info['title'] = info['id']
2758 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2761 'type': 'reference',
2762 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2767 for entry in info['list']:
2768 assert entry['type'] == 'reference'
2769 results += self.extract(entry['url'])
2772 class MTVIE(InfoExtractor):
2773 """Information extractor for MTV.com"""
2775 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2778 def report_webpage(self, video_id):
2779 """Report information extraction."""
2780 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2782 def report_extraction(self, video_id):
2783 """Report information extraction."""
2784 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2786 def _real_extract(self, url):
2787 mobj = re.match(self._VALID_URL, url)
2789 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2791 if not mobj.group('proto'):
2792 url = 'http://' + url
2793 video_id = mobj.group('videoid')
2794 self.report_webpage(video_id)
2796 request = urllib2.Request(url)
2798 webpage = urllib2.urlopen(request).read()
2799 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2800 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2803 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2805 self._downloader.trouble(u'ERROR: unable to extract song name')
2807 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2808 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2810 self._downloader.trouble(u'ERROR: unable to extract performer')
2812 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2813 video_title = performer + ' - ' + song_name
2815 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2817 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2819 mtvn_uri = mobj.group(1)
2821 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2823 self._downloader.trouble(u'ERROR: unable to extract content id')
2825 content_id = mobj.group(1)
2827 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2828 self.report_extraction(video_id)
2829 request = urllib2.Request(videogen_url)
2831 metadataXml = urllib2.urlopen(request).read()
2832 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2833 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2836 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2837 renditions = mdoc.findall('.//rendition')
2839 # For now, always pick the highest quality.
2840 rendition = renditions[-1]
2843 _,_,ext = rendition.attrib['type'].partition('/')
2844 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2845 video_url = rendition.find('./src').text
2847 self._downloader.trouble('Invalid rendition field.')
2853 'uploader': performer,
2854 'title': video_title,