2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    (several required fields are elided in this excerpt)

    uploader: Nickname of the video uploader.
    ext: Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """
    # NOTE(review): this excerpt elides a number of lines from the class
    # (parts of the docstring, blank lines, and at least one 'def' header);
    # the layout below is reconstructed.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the enclosing 'def initialize(self):' header for the
    # next two lines is elided from this excerpt.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # (one line elided here in this excerpt)
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # (method body elided in this excerpt)

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # (method body elided in this excerpt)
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): this excerpt elides many lines from this class (try
    # headers, 'return' statements after trouble() calls, blank lines,
    # dict entries); the layout below is reconstructed.

    # Group 1 captures the scheme/host/path prefix; group 2 captures the
    # 11-ish character video id. The conditional (?(1).+)? makes trailing
    # text mandatory only when a prefix was matched.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (most entries and the closing brace are
    # elided in this excerpt)
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display dimensions (entries and closing brace elided in this
    # excerpt)
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the given format is not available for a video."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (.srt) text."""
        # (initialization of the 'srt' accumulator elided in this excerpt)
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            # NOTE(review): a conversion of 'start' to float appears to be
            # elided here (orig line 179); as shown, 'start' is still a str
            # when added to float(dur) below — verify against full source.
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        # (return of the assembled srt text elided in this excerpt)

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # (the 'for x in formats:' loop header is elided in this excerpt)
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set interface language, optionally log in, and confirm age.

        Credentials come from the downloader params ('username'/'password')
        or, with 'usenetrc', from the 'youtube' machine entry in ~/.netrc.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (best-effort: failure only warns)
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # (login form construction partially elided in this excerpt)
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # The login form reappearing in the response means login failed
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age (form construction partially elided in this excerpt)
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract id, metadata and real media URL(s) for a YouTube URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' variants of get_video_info until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # Thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the page, normalized to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Closed captions (converted to .srt), only when requested
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
            if not srt_lang_list:
                raise Trouble(u'WARNING: video has no closed captions')
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                # (assignment for the 'en' case and the 'else:' header are
                # elided in this excerpt)
                srt_lang = srt_lang_list[0]
            if not srt_lang in srt_lang_list:
                raise Trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
                srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
        except Trouble as trouble:
            self._downloader.trouble(trouble[0])

        # Token (required for the download URLs)
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # (the statement collecting this dict into the results is
            # elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""
    # NOTE(review): this excerpt elides a number of lines (try headers,
    # 'return' statements, blank lines); the layout below is reconstructed.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Delegate to the base class, which stores the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST past it."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age (form construction partially elided in this excerpt)
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to YoutubeIE
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = urllib.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: media URL embedded in the flashvars JSON
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (the statement returning this dict is elided in this excerpt)
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""
    # NOTE(review): this excerpt elides several lines (try headers,
    # 'return' statements, blanks); the layout below is reconstructed.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Delegate to the base class, which stores the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information;
        # the cookie disables Dailymotion's family filter
        request = urllib2.Request(url)
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL
        # NOTE(review): the assignment of 'video_url' (used below) is
        # elided in this excerpt.

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (the statement returning this dict is elided in this excerpt)
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""
    # NOTE(review): this excerpt elides several lines (try headers,
    # 'return' statements, blanks); the layout below is reconstructed.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        """Delegate to the base class, which stores the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title and description for a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback: no direct mp4 download URL; fish the flv URL out of the
        # escaped player config instead
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (only when forced; needs a search page)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info

        # (the statement returning this dict is elided in this excerpt)
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""
    # NOTE(review): this excerpt elides several lines (try headers,
    # 'return' statements, blanks); the layout below is reconstructed.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Delegate to the base class, which stores the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the .flv media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # (the statement returning this dict is elided in this excerpt)
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""
    # NOTE(review): this excerpt elides several lines (try headers,
    # 'return' statements, blanks); the layout below is reconstructed.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        """Delegate to the base class, which stores the downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata.

        Non-/watch/ URLs are rewritten to a /watch/ URL and this method
        recurses once with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the '(people|profile)' alternative, not
        # the anchor text (group(2)); this looks like a latent bug — verify
        # against the full source before relying on 'uploader'.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # (the statement returning this dict is elided in this excerpt)
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
        # NOTE(review): duplicate 'thumbnail' key — in a dict literal the
        # later entry wins, so the decoded value above is discarded.
        'thumbnail': video_thumbnail,
974 class VimeoIE(InfoExtractor):
975 """Information extractor for vimeo.com."""
977 # _VALID_URL matches Vimeo URLs
978 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
    def __init__(self, downloader=None):
        """Constructor. Delegates to the base class, which stores the downloader."""
        InfoExtractor.__init__(self, downloader)
984 def report_download_webpage(self, video_id):
985 """Report webpage download."""
986 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
988 def report_extraction(self, video_id):
989 """Report information extraction."""
990 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
992 def _real_extract(self, url, new_video=True):
993 # Extract ID from URL
994 mobj = re.match(self._VALID_URL, url)
996 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
999 video_id = mobj.group(1)
1001 # Retrieve video webpage to extract further information
1002 request = urllib2.Request(url, None, std_headers)
1004 self.report_download_webpage(video_id)
1005 webpage = urllib2.urlopen(request).read()
1006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1007 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1010 # Now we begin extracting as much information as we can from what we
1011 # retrieved. First we extract the information common to all extractors,
1012 # and latter we extract those that are Vimeo specific.
1013 self.report_extraction(video_id)
1015 # Extract the config JSON
1016 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1018 config = json.loads(config)
1020 self._downloader.trouble(u'ERROR: unable to extract info section')
1024 video_title = config["video"]["title"]
1027 video_uploader = config["video"]["owner"]["name"]
1029 # Extract video thumbnail
1030 video_thumbnail = config["video"]["thumbnail"]
1032 # Extract video description
1033 video_description = get_element_by_id("description", webpage.decode('utf8'))
1034 if video_description: video_description = clean_html(video_description)
1035 else: video_description = ''
1037 # Extract upload date
1038 video_upload_date = u'NA'
1039 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1040 if mobj is not None:
1041 video_upload_date = mobj.group(1)
1043 # Vimeo specific: extract request signature and timestamp
1044 sig = config['request']['signature']
1045 timestamp = config['request']['timestamp']
1047 # Vimeo specific: extract video codec and quality information
1048 # TODO bind to format param
1049 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1050 for codec in codecs:
1051 if codec[0] in config["video"]["files"]:
1052 video_codec = codec[0]
1053 video_extension = codec[1]
1054 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1055 else: quality = 'sd'
1058 self._downloader.trouble(u'ERROR: no known codec found')
1061 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1062 %(video_id, sig, timestamp, quality, video_codec.upper())
1067 'uploader': video_uploader,
1068 'upload_date': video_upload_date,
1069 'title': video_title,
1070 'ext': video_extension,
1071 'thumbnail': video_thumbnail,
1072 'description': video_description,
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific extractor matched: follows URL-shortener
    redirects, then scrapes the page for a JW-Player-style file= URL.
    NOTE(review): this listing is missing lines (try/except headers,
    `if mobj is None:` guards, returns); code is reproduced as-is.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn first: reaching this extractor means everything else failed.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # Issue a HEAD request so redirects can be detected without
        # downloading the body.
        class HeadRequest(urllib2.Request):
            def get_method(self):

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a private opener with the HEAD-aware handlers above.
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened.
        if url == new_url: return False

        self.report_following_redirect(new_url)
        # Restart the extraction chain on the redirect target.
        self._downloader.download([new_url])

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)

        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Result dictionary (opening of the literal missing in this listing).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles `ytsearch:`, `ytsearchN:` and `ytsearchall:` pseudo-URLs by
    querying the GData API and queueing the resulting watch URLs.
    NOTE(review): this listing is missing lines (guards, try headers,
    returns, loop initialization); code is reproduced as-is.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData JSON-C endpoint; page size is fixed at 50 results per request.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "ytsearchN" prefix from the query text itself.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            # Clamp to the API's effective maximum with a warning.
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the API, 50 ids at a time, until the limit is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API reports as available.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles `gvsearch:` pseudo-URLs by scraping result pages for
    videoplay docids and queueing them for download.
    NOTE(review): this listing is missing lines (guards, try headers,
    returns, loop initialization); code is reproduced as-is.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Captures the docid from result-page anchors.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present only while a "next page" link exists.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "gvsearchN" prefix from the query text itself.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # Clamp to the supported maximum with a warning.
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No "next page" link: queue whatever was collected and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles `yvsearch:` pseudo-URLs by scraping result pages for watch
    ids and queueing them for download.
    NOTE(review): this listing is missing lines (guards, try headers,
    returns, loop initialization); code is reproduced as-is.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Captures "<uploader_id>/<video_id>" from result-page anchors.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "yvsearchN" prefix from the query text itself.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # Clamp to the supported maximum with a warning.
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # already_seen deduplicates ids across result pages.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No "next page" link: queue whatever was collected and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through a playlist (or artist/course page), collects watch
    ids, applies --playlist-start/--playlist-end, and queues each video.
    NOTE(review): this listing is missing lines (guards, try headers,
    returns, loop initialization); code is reproduced as-is.
    """

    # group(1): URL kind ('p', 'a' or 'list'); group(2): playlist id;
    # group(3): optional single-video id embedded in the URL.
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
    # Present only while a pager "next" control exists.
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # URL points at a single video inside the playlist: hand it off.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        playlist_prefix = 'p'
        playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # No further pager control: stop paging.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:

        pagenum = pagenum + 1

        # Apply --playlist-start / --playlist-end (1-based start).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Collects all upload ids for a user via the GData API (paged by
    _GDATA_PAGE_SIZE), applies playlist start/end, and queues each video.
    NOTE(review): this listing is missing lines (guards, try headers,
    loop initialization, break statements); code is reproduced as-is.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request; drives the paging arithmetic below.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        # Apply --playlist-start / --playlist-end (1-based start).
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the "Free download" button and scrapes the real
    fileshare URL from the resulting page.
    NOTE(review): this listing is missing lines (try headers, guards,
    returns, dict opening); code is reproduced as-is.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Site showed a restriction banner; surface it verbatim.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Result dictionary (opening of the literal missing in this listing).
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Logs in with username/password or .netrc credentials, then parses
    video metadata and per-format URLs out of the video page's inline
    JavaScript.
    NOTE(review): this listing is missing lines (try headers, guards,
    returns, some assignments); code is reproduced as-is.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; used for format selection below.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes targeting the inline JS calls that carry the metadata.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Collect one URL per known format, when present on the page.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # No credentials available: skip login entirely.
        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # A login <form> in the response means authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image: missing thumbnail is a warning, not fatal.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        video_thumbnail = video_info['thumbnail']

        # upload date: parsed from an RFC-2822 style date when present.
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Result dictionary (opening of the literal missing here).
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Asks the site for a JSON description of the post (skin=json); if the
    URL turns out to be a direct media link (video/* Content-Type), uses
    the URL itself instead.
    NOTE(review): this listing is missing lines (guards, try headers,
    some assignments); code is reproduced as-is.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Request the JSON representation of the post.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))

        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

        if info is None: # Regular URL
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']

            # blip.tv datestamps look like '12-31-11 11:59PM'.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # Result dictionary (opening of the literal missing here).
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
        except (ValueError,KeyError), err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
1981 class MyVideoIE(InfoExtractor):
1982 """Information Extractor for myvideo.de."""
1984 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1985 IE_NAME = u'myvideo'
1987 def __init__(self, downloader=None):
1988 InfoExtractor.__init__(self, downloader)
1990 def report_download_webpage(self, video_id):
1991 """Report webpage download."""
1992 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1994 def report_extraction(self, video_id):
1995 """Report information extraction."""
1996 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
1998 def _real_extract(self,url):
1999 mobj = re.match(self._VALID_URL, url)
2001 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2004 video_id = mobj.group(1)
2007 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2009 self.report_download_webpage(video_id)
2010 webpage = urllib2.urlopen(request).read()
2011 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2012 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2015 self.report_extraction(video_id)
2016 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2019 self._downloader.trouble(u'ERROR: unable to extract media URL')
2021 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2023 mobj = re.search('<title>([^<]+)</title>', webpage)
2025 self._downloader.trouble(u'ERROR: unable to extract title')
2028 video_title = mobj.group(1)
2034 'upload_date': u'NA',
2035 'title': video_title,
2041 class ComedyCentralIE(InfoExtractor):
2042 """Information extractor for The Daily Show and Colbert Report """
2044 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2045 IE_NAME = u'comedycentral'
2047 def report_extraction(self, episode_id):
2048 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2050 def report_config_download(self, episode_id):
2051 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2053 def report_index_download(self, episode_id):
2054 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2056 def report_player_url(self, episode_id):
2057 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2059 def _real_extract(self, url):
2060 mobj = re.match(self._VALID_URL, url)
2062 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2065 if mobj.group('shortname'):
2066 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2067 url = u'http://www.thedailyshow.com/full-episodes/'
2069 url = u'http://www.colbertnation.com/full-episodes/'
2070 mobj = re.match(self._VALID_URL, url)
2071 assert mobj is not None
2073 dlNewest = not mobj.group('episode')
2075 epTitle = mobj.group('showname')
2077 epTitle = mobj.group('episode')
2079 req = urllib2.Request(url)
2080 self.report_extraction(epTitle)
2082 htmlHandle = urllib2.urlopen(req)
2083 html = htmlHandle.read()
2084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2088 url = htmlHandle.geturl()
2089 mobj = re.match(self._VALID_URL, url)
2091 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2093 if mobj.group('episode') == '':
2094 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2096 epTitle = mobj.group('episode')
2098 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2099 if len(mMovieParams) == 0:
2100 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2103 playerUrl_raw = mMovieParams[0][0]
2104 self.report_player_url(epTitle)
2106 urlHandle = urllib2.urlopen(playerUrl_raw)
2107 playerUrl = urlHandle.geturl()
2108 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2109 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2112 uri = mMovieParams[0][1]
2113 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2114 self.report_index_download(epTitle)
2116 indexXml = urllib2.urlopen(indexUrl).read()
2117 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2118 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2123 idoc = xml.etree.ElementTree.fromstring(indexXml)
2124 itemEls = idoc.findall('.//item')
2125 for itemEl in itemEls:
2126 mediaId = itemEl.findall('./guid')[0].text
2127 shortMediaId = mediaId.split(':')[-1]
2128 showId = mediaId.split(':')[-2].replace('.com', '')
2129 officialTitle = itemEl.findall('./title')[0].text
2130 officialDate = itemEl.findall('./pubDate')[0].text
2132 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2133 urllib.urlencode({'uri': mediaId}))
2134 configReq = urllib2.Request(configUrl)
2135 self.report_config_download(epTitle)
2137 configXml = urllib2.urlopen(configReq).read()
2138 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2139 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2142 cdoc = xml.etree.ElementTree.fromstring(configXml)
2144 for rendition in cdoc.findall('.//rendition'):
2145 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2149 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2152 # For now, just pick the highest bitrate
2153 format,video_url = turls[-1]
2155 effTitle = showId + u'-' + epTitle
2160 'upload_date': officialDate,
2165 'description': officialTitle,
2166 'player_url': playerUrl
2169 results.append(info)
2174 class EscapistIE(InfoExtractor):
2175 """Information extractor for The Escapist """
2177 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2178 IE_NAME = u'escapist'
2180 def report_extraction(self, showName):
2181 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2183 def report_config_download(self, showName):
2184 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2186 def _real_extract(self, url):
2187 mobj = re.match(self._VALID_URL, url)
2189 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2191 showName = mobj.group('showname')
2192 videoId = mobj.group('episode')
2194 self.report_extraction(showName)
2196 webPage = urllib2.urlopen(url)
2197 webPageBytes = webPage.read()
2198 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2199 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2200 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2201 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2204 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2205 description = unescapeHTML(descMatch.group(1))
2206 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2207 imgUrl = unescapeHTML(imgMatch.group(1))
2208 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2209 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2210 configUrlMatch = re.search('config=(.*)$', playerUrl)
2211 configUrl = urllib2.unquote(configUrlMatch.group(1))
2213 self.report_config_download(showName)
2215 configJSON = urllib2.urlopen(configUrl).read()
2216 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2217 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2220 # Technically, it's JavaScript, not JSON
2221 configJSON = configJSON.replace("'", '"')
2224 config = json.loads(configJSON)
2225 except (ValueError,), err:
2226 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2229 playlist = config['playlist']
2230 videoUrl = playlist[1]['url']
2235 'uploader': showName,
2236 'upload_date': None,
2240 'thumbnail': imgUrl,
2241 'description': description,
2242 'player_url': playerUrl,
2248 class CollegeHumorIE(InfoExtractor):
2249 """Information extractor for collegehumor.com"""
2251 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2252 IE_NAME = u'collegehumor'
2254 def report_webpage(self, video_id):
2255 """Report information extraction."""
2256 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2258 def report_extraction(self, video_id):
2259 """Report information extraction."""
2260 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2262 def _real_extract(self, url):
2263 mobj = re.match(self._VALID_URL, url)
2265 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2267 video_id = mobj.group('videoid')
2269 self.report_webpage(video_id)
2270 request = urllib2.Request(url)
2272 webpage = urllib2.urlopen(request).read()
2273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2274 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2277 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2279 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2281 internal_video_id = m.group('internalvideoid')
2285 'internal_id': internal_video_id,
2288 self.report_extraction(video_id)
2289 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2291 metaXml = urllib2.urlopen(xmlUrl).read()
2292 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2293 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2296 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2298 videoNode = mdoc.findall('./video')[0]
2299 info['description'] = videoNode.findall('./description')[0].text
2300 info['title'] = videoNode.findall('./caption')[0].text
2301 info['url'] = videoNode.findall('./file')[0].text
2302 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2303 info['ext'] = info['url'].rpartition('.')[2]
2304 info['format'] = info['ext']
2306 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2312 class XVideosIE(InfoExtractor):
2313 """Information extractor for xvideos.com"""
2315 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2316 IE_NAME = u'xvideos'
2318 def report_webpage(self, video_id):
2319 """Report information extraction."""
2320 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2322 def report_extraction(self, video_id):
2323 """Report information extraction."""
2324 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2326 def _real_extract(self, url):
2327 mobj = re.match(self._VALID_URL, url)
2329 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2331 video_id = mobj.group(1).decode('utf-8')
2333 self.report_webpage(video_id)
2335 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2337 webpage = urllib2.urlopen(request).read()
2338 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2339 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2342 self.report_extraction(video_id)
2346 mobj = re.search(r'flv_url=(.+?)&', webpage)
2348 self._downloader.trouble(u'ERROR: unable to extract video url')
2350 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2354 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2356 self._downloader.trouble(u'ERROR: unable to extract video title')
2358 video_title = mobj.group(1).decode('utf-8')
2361 # Extract video thumbnail
2362 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2364 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2366 video_thumbnail = mobj.group(1).decode('utf-8')
2372 'upload_date': None,
2373 'title': video_title,
2376 'thumbnail': video_thumbnail,
2377 'description': None,
2384 class SoundcloudIE(InfoExtractor):
2385 """Information extractor for soundcloud.com
2386 To access the media, the uid of the song and a stream token
2387 must be extracted from the page source and the script must make
2388 a request to media.soundcloud.com/crossdomain.xml. Then
2389 the media can be grabbed by requesting from an url composed
2390 of the stream token and uid
2393 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2394 IE_NAME = u'soundcloud'
2396 def __init__(self, downloader=None):
2397 InfoExtractor.__init__(self, downloader)
2399 def report_webpage(self, video_id):
2400 """Report information extraction."""
2401 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2403 def report_extraction(self, video_id):
2404 """Report information extraction."""
2405 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2407 def _real_extract(self, url):
2408 mobj = re.match(self._VALID_URL, url)
2410 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2413 # extract uploader (which is in the url)
2414 uploader = mobj.group(1).decode('utf-8')
2415 # extract simple title (uploader + slug of song title)
2416 slug_title = mobj.group(2).decode('utf-8')
2417 simple_title = uploader + u'-' + slug_title
2419 self.report_webpage('%s/%s' % (uploader, slug_title))
2421 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2423 webpage = urllib2.urlopen(request).read()
2424 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2425 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2428 self.report_extraction('%s/%s' % (uploader, slug_title))
2430 # extract uid and stream token that soundcloud hands out for access
2431 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2433 video_id = mobj.group(1)
2434 stream_token = mobj.group(2)
2436 # extract unsimplified title
2437 mobj = re.search('"title":"(.*?)",', webpage)
2439 title = mobj.group(1).decode('utf-8')
2441 title = simple_title
2443 # construct media url (with uid/token)
2444 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2445 mediaURL = mediaURL % (video_id, stream_token)
2448 description = u'No description available'
2449 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2451 description = mobj.group(1)
2455 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2458 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2459 except Exception, e:
2460 self._downloader.to_stderr(str(e))
2462 # for soundcloud, a request to a cross domain is required for cookies
2463 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2466 'id': video_id.decode('utf-8'),
2468 'uploader': uploader.decode('utf-8'),
2469 'upload_date': upload_date,
2474 'description': description.decode('utf-8')
2478 class InfoQIE(InfoExtractor):
2479 """Information extractor for infoq.com"""
2481 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2484 def report_webpage(self, video_id):
2485 """Report information extraction."""
2486 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2488 def report_extraction(self, video_id):
2489 """Report information extraction."""
2490 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2492 def _real_extract(self, url):
2493 mobj = re.match(self._VALID_URL, url)
2495 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2498 self.report_webpage(url)
2500 request = urllib2.Request(url)
2502 webpage = urllib2.urlopen(request).read()
2503 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2504 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2507 self.report_extraction(url)
2511 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2513 self._downloader.trouble(u'ERROR: unable to extract video url')
2515 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2519 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2521 self._downloader.trouble(u'ERROR: unable to extract video title')
2523 video_title = mobj.group(1).decode('utf-8')
2525 # Extract description
2526 video_description = u'No description available.'
2527 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2528 if mobj is not None:
2529 video_description = mobj.group(1).decode('utf-8')
2531 video_filename = video_url.split('/')[-1]
2532 video_id, extension = video_filename.split('.')
2538 'upload_date': None,
2539 'title': video_title,
2541 'format': extension, # Extension is always(?) mp4, but seems to be flv
2543 'description': video_description,
2549 class MixcloudIE(InfoExtractor):
2550 """Information extractor for www.mixcloud.com"""
2551 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2552 IE_NAME = u'mixcloud'
2554 def __init__(self, downloader=None):
2555 InfoExtractor.__init__(self, downloader)
2557 def report_download_json(self, file_id):
2558 """Report JSON download."""
2559 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2561 def report_extraction(self, file_id):
2562 """Report information extraction."""
2563 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2565 def get_urls(self, jsonData, fmt, bitrate='best'):
2566 """Get urls from 'audio_formats' section in json"""
2569 bitrate_list = jsonData[fmt]
2570 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2571 bitrate = max(bitrate_list) # select highest
2573 url_list = jsonData[fmt][bitrate]
2574 except TypeError: # we have no bitrate info.
2575 url_list = jsonData[fmt]
2578 def check_urls(self, url_list):
2579 """Returns 1st active url from list"""
2580 for url in url_list:
2582 urllib2.urlopen(url)
2584 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2589 def _print_formats(self, formats):
2590 print 'Available formats:'
2591 for fmt in formats.keys():
2592 for b in formats[fmt]:
2594 ext = formats[fmt][b][0]
2595 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2596 except TypeError: # we have no bitrate info
2597 ext = formats[fmt][0]
2598 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2601 def _real_extract(self, url):
2602 mobj = re.match(self._VALID_URL, url)
2604 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2606 # extract uploader & filename from url
2607 uploader = mobj.group(1).decode('utf-8')
2608 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2610 # construct API request
2611 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2612 # retrieve .json file with links to files
2613 request = urllib2.Request(file_url)
2615 self.report_download_json(file_url)
2616 jsonData = urllib2.urlopen(request).read()
2617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2622 json_data = json.loads(jsonData)
2623 player_url = json_data['player_swf_url']
2624 formats = dict(json_data['audio_formats'])
2626 req_format = self._downloader.params.get('format', None)
2629 if self._downloader.params.get('listformats', None):
2630 self._print_formats(formats)
2633 if req_format is None or req_format == 'best':
2634 for format_param in formats.keys():
2635 url_list = self.get_urls(formats, format_param)
2637 file_url = self.check_urls(url_list)
2638 if file_url is not None:
2641 if req_format not in formats.keys():
2642 self._downloader.trouble(u'ERROR: format is not available')
2645 url_list = self.get_urls(formats, req_format)
2646 file_url = self.check_urls(url_list)
2647 format_param = req_format
2650 'id': file_id.decode('utf-8'),
2651 'url': file_url.decode('utf-8'),
2652 'uploader': uploader.decode('utf-8'),
2653 'upload_date': u'NA',
2654 'title': json_data['name'],
2655 'ext': file_url.split('.')[-1].decode('utf-8'),
2656 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2657 'thumbnail': json_data['thumbnail_url'],
2658 'description': json_data['description'],
2659 'player_url': player_url.decode('utf-8'),
2662 class StanfordOpenClassroomIE(InfoExtractor):
2663 """Information extractor for Stanford's Open ClassRoom"""
2665 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2666 IE_NAME = u'stanfordoc'
2668 def report_download_webpage(self, objid):
2669 """Report information extraction."""
2670 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2672 def report_extraction(self, video_id):
2673 """Report information extraction."""
2674 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2676 def _real_extract(self, url):
2677 mobj = re.match(self._VALID_URL, url)
2679 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2682 if mobj.group('course') and mobj.group('video'): # A specific video
2683 course = mobj.group('course')
2684 video = mobj.group('video')
2686 'id': course + '_' + video,
2689 self.report_extraction(info['id'])
2690 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2691 xmlUrl = baseUrl + video + '.xml'
2693 metaXml = urllib2.urlopen(xmlUrl).read()
2694 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2695 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2697 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2699 info['title'] = mdoc.findall('./title')[0].text
2700 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2702 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2704 info['ext'] = info['url'].rpartition('.')[2]
2705 info['format'] = info['ext']
2707 elif mobj.group('course'): # A course page
2708 course = mobj.group('course')
2714 self.report_download_webpage(info['id'])
2716 coursepage = urllib2.urlopen(url).read()
2717 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2718 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2721 m = re.search('<h1>([^<]+)</h1>', coursepage)
2723 info['title'] = unescapeHTML(m.group(1))
2725 info['title'] = info['id']
2727 m = re.search('<description>([^<]+)</description>', coursepage)
2729 info['description'] = unescapeHTML(m.group(1))
2731 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2734 'type': 'reference',
2735 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2739 for entry in info['list']:
2740 assert entry['type'] == 'reference'
2741 results += self.extract(entry['url'])
2746 'id': 'Stanford OpenClassroom',
2750 self.report_download_webpage(info['id'])
2751 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2753 rootpage = urllib2.urlopen(rootURL).read()
2754 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2755 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2758 info['title'] = info['id']
2760 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2763 'type': 'reference',
2764 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2769 for entry in info['list']:
2770 assert entry['type'] == 'reference'
2771 results += self.extract(entry['url'])
2774 class MTVIE(InfoExtractor):
2775 """Information extractor for MTV.com"""
2777 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2780 def report_webpage(self, video_id):
2781 """Report information extraction."""
2782 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2784 def report_extraction(self, video_id):
2785 """Report information extraction."""
2786 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2788 def _real_extract(self, url):
2789 mobj = re.match(self._VALID_URL, url)
2791 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2793 if not mobj.group('proto'):
2794 url = 'http://' + url
2795 video_id = mobj.group('videoid')
2796 self.report_webpage(video_id)
2798 request = urllib2.Request(url)
2800 webpage = urllib2.urlopen(request).read()
2801 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2802 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2805 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2807 self._downloader.trouble(u'ERROR: unable to extract song name')
2809 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2810 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2812 self._downloader.trouble(u'ERROR: unable to extract performer')
2814 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2815 video_title = performer + ' - ' + song_name
2817 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2819 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2821 mtvn_uri = mobj.group(1)
2823 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2825 self._downloader.trouble(u'ERROR: unable to extract content id')
2827 content_id = mobj.group(1)
2829 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2830 self.report_extraction(video_id)
2831 request = urllib2.Request(videogen_url)
2833 metadataXml = urllib2.urlopen(request).read()
2834 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2835 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2838 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2839 renditions = mdoc.findall('.//rendition')
2841 # For now, always pick the highest quality.
2842 rendition = renditions[-1]
2845 _,_,ext = rendition.attrib['type'].partition('/')
2846 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2847 video_url = rendition.find('./src').text
2849 self._downloader.trouble('Invalid rendition field.')
2855 'uploader': performer,
2856 'title': video_title,