2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    uploader: Nickname of the video uploader.
    ext: Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def initialize(self):` header for the next two lines
    # is not visible in this excerpt; they are the body of the public template
    # method that delegates to _real_initialize().
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension. NOTE(review): the remaining entries and the
    # closing braces of this dict and of _video_dimensions below are elided in
    # this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> human-readable dimension string (entries elided in this excerpt).
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SRT-formatted text.

        NOTE(review): the accumulator initialization (`srt = ...`), the
        float() conversion of `start`, the sequence-number line and the final
        `return` are elided in this excerpt.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration in seconds
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print the itag, extension and dimensions of each available format."""
        print 'Available formats:'
        # NOTE(review): the `for x in formats:` loop header is elided in this
        # excerpt; the next line is the loop body.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set the interface language, then optionally log in and confirm age.

        Credentials come from downloader params or from ~/.netrc; every
        network failure is reported as a warning (or error for age
        confirmation) rather than raised.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force the English interface so regexes below match reliably.
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # The login form being present in the response means login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Download the watch page and get_video_info data, choose formats
        per downloader params, and build one result dict per chosen format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JS backslash escapes in the matched URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' variants until one response carries a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # Thumbnail (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date, normalized to YYYYMMDD when one of the known
        # textual formats parses.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description (cleaned HTML, empty string when absent)
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Closed captions: list available languages, pick the requested/
        # default one, download and convert to SRT. Trouble is raised and
        # caught locally so caption failures stay non-fatal warnings.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    # NOTE(review): the `srt_lang = 'en'` line and the `else:`
                    # header appear elided; this fallback picks the first
                    # advertised language.
                    srt_lang = srt_lang_list[0]
                if not srt_lang in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Map the itag to a file extension, defaulting to flv.
            video_extension = self._video_extensions.get(format_param, 'flv')

                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Delegate downloader storage to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST past it."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Metacafe page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Hand 'yt-' prefixed IDs off to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            # Direct mediaURL branch
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback branch: dig the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Delegate downloader storage to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the SD media URL, title and uploader from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-gated pages are served.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        """Delegate downloader storage to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract media URL, title, description and optional thumbnail from
        a Google Video play page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # No mp4 download link: fall back to the obfuscated flv URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Decode the \xNN escapes ('=' and '&') left in the URL.
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on a search-results page, so run a
            # site-restricted search for this docid.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Delegate downloader storage to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern:
        # group(1) is the title, group(2) the uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        """Delegate downloader storage to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata from a Yahoo! Video page, rewriting non-/watch/
        URLs into /watch/ form first (one level of recursion via new_video)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) of this regex is the 'people|profile'
        # alternation, not the name inside the <a> tag — group(2) looks like
        # the intended capture; verify against live pages.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): duplicate 'thumbnail' key — this entry silently
            # overrides the decoded value a few lines above; one of the two
            # should be removed.
            'thumbnail': video_thumbnail,
969 class VimeoIE(InfoExtractor):
970 """Information extractor for vimeo.com."""
972 # _VALID_URL matches Vimeo URLs
973 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
    def __init__(self, downloader=None):
        """Constructor. Delegates downloader storage to InfoExtractor."""
        InfoExtractor.__init__(self, downloader)
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
983 def report_extraction(self, video_id):
984 """Report information extraction."""
985 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
987 def _real_extract(self, url, new_video=True):
988 # Extract ID from URL
989 mobj = re.match(self._VALID_URL, url)
991 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
994 video_id = mobj.group(1)
996 # Retrieve video webpage to extract further information
997 request = urllib2.Request(url, None, std_headers)
999 self.report_download_webpage(video_id)
1000 webpage = urllib2.urlopen(request).read()
1001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1002 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1005 # Now we begin extracting as much information as we can from what we
1006 # retrieved. First we extract the information common to all extractors,
1007 # and latter we extract those that are Vimeo specific.
1008 self.report_extraction(video_id)
1010 # Extract the config JSON
1011 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1013 config = json.loads(config)
1015 self._downloader.trouble(u'ERROR: unable to extract info section')
1019 video_title = config["video"]["title"]
1022 video_uploader = config["video"]["owner"]["name"]
1024 # Extract video thumbnail
1025 video_thumbnail = config["video"]["thumbnail"]
1027 # Extract video description
1028 video_description = get_element_by_id("description", webpage.decode('utf8'))
1029 if video_description: video_description = clean_html(video_description)
1030 else: video_description = ''
1032 # Extract upload date
1033 video_upload_date = u'NA'
1034 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1035 if mobj is not None:
1036 video_upload_date = mobj.group(1)
1038 # Vimeo specific: extract request signature and timestamp
1039 sig = config['request']['signature']
1040 timestamp = config['request']['timestamp']
1042 # Vimeo specific: extract video codec and quality information
1043 # TODO bind to format param
1044 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1045 for codec in codecs:
1046 if codec[0] in config["video"]["files"]:
1047 video_codec = codec[0]
1048 video_extension = codec[1]
1049 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1050 else: quality = 'sd'
1053 self._downloader.trouble(u'ERROR: no known codec found')
1056 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1057 %(video_id, sig, timestamp, quality, video_codec.upper())
1062 'uploader': video_uploader,
1063 'upload_date': video_upload_date,
1064 'title': video_title,
1065 'ext': video_extension,
1066 'thumbnail': video_thumbnail,
1067 'description': video_description,
1072 class GenericIE(InfoExtractor):
1073 """Generic last-resort information extractor."""
1076 IE_NAME = u'generic'
1078 def __init__(self, downloader=None):
1079 InfoExtractor.__init__(self, downloader)
1081 def report_download_webpage(self, video_id):
1082 """Report webpage download."""
1083 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1084 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1086 def report_extraction(self, video_id):
1087 """Report information extraction."""
1088 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1090 def report_following_redirect(self, new_url):
1091 """Report information extraction."""
1092 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1094 def _test_redirect(self, url):
1095 """Check if it is a redirect, like url shorteners, in case restart chain."""
1096 class HeadRequest(urllib2.Request):
1097 def get_method(self):
1100 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1102 Subclass the HTTPRedirectHandler to make it use our
1103 HeadRequest also on the redirected URL
1105 def redirect_request(self, req, fp, code, msg, headers, newurl):
1106 if code in (301, 302, 303, 307):
1107 newurl = newurl.replace(' ', '%20')
1108 newheaders = dict((k,v) for k,v in req.headers.items()
1109 if k.lower() not in ("content-length", "content-type"))
1110 return HeadRequest(newurl,
1112 origin_req_host=req.get_origin_req_host(),
1115 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1117 class HTTPMethodFallback(urllib2.BaseHandler):
1119 Fallback to GET if HEAD is not allowed (405 HTTP error)
1121 def http_error_405(self, req, fp, code, msg, headers):
1125 newheaders = dict((k,v) for k,v in req.headers.items()
1126 if k.lower() not in ("content-length", "content-type"))
1127 return self.parent.open(urllib2.Request(req.get_full_url(),
1129 origin_req_host=req.get_origin_req_host(),
1133 opener = urllib2.OpenerDirector()
1134 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1135 HTTPMethodFallback, HEADRedirectHandler,
1136 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1137 opener.add_handler(handler())
1139 response = opener.open(HeadRequest(url))
1140 new_url = response.geturl()
1142 if url == new_url: return False
1144 self.report_following_redirect(new_url)
1145 self._downloader.download([new_url])
1148 def _real_extract(self, url):
1149 if self._test_redirect(url): return
1151 video_id = url.split('/')[-1]
1152 request = urllib2.Request(url)
1154 self.report_download_webpage(video_id)
1155 webpage = urllib2.urlopen(request).read()
1156 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1157 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1159 except ValueError, err:
1160 # since this is the last-resort InfoExtractor, if
1161 # this error is thrown, it'll be thrown here
1162 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1165 self.report_extraction(video_id)
1166 # Start with something easy: JW Player in SWFObject
1167 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1169 # Broaden the search a little bit
1170 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1172 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1175 # It's possible that one of the regexes
1176 # matched, but returned an empty group:
1177 if mobj.group(1) is None:
1178 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1181 video_url = urllib.unquote(mobj.group(1))
1182 video_id = os.path.basename(video_url)
1184 # here's a fun little line of code for you:
1185 video_extension = os.path.splitext(video_id)[1][1:]
1186 video_id = os.path.splitext(video_id)[0]
1188 # it's tempting to parse this further, but you would
1189 # have to take into account all the variations like
1190 # Video Title - Site Name
1191 # Site Name | Video Title
1192 # Video Title - Tagline | Site Name
1193 # and so on and so forth; it's just not practical
1194 mobj = re.search(r'<title>(.*)</title>', webpage)
1196 self._downloader.trouble(u'ERROR: unable to extract title')
1198 video_title = mobj.group(1).decode('utf-8')
1200 # video uploader is domain name
1201 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1203 self._downloader.trouble(u'ERROR: unable to extract title')
1205 video_uploader = mobj.group(1).decode('utf-8')
1208 'id': video_id.decode('utf-8'),
1209 'url': video_url.decode('utf-8'),
1210 'uploader': video_uploader,
1211 'upload_date': u'NA',
1212 'title': video_title,
1213 'ext': video_extension.decode('utf-8'),
1219 class YoutubeSearchIE(InfoExtractor):
1220 """Information Extractor for YouTube search queries."""
1221 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1222 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1223 _max_youtube_results = 1000
1224 IE_NAME = u'youtube:search'
1226 def __init__(self, downloader=None):
1227 InfoExtractor.__init__(self, downloader)
1229 def report_download_page(self, query, pagenum):
1230 """Report attempt to download playlist page with given number."""
1231 query = query.decode(preferredencoding())
1232 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1234 def _real_extract(self, query):
1235 mobj = re.match(self._VALID_URL, query)
1237 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1240 prefix, query = query.split(':')
1242 query = query.encode('utf-8')
1244 self._download_n_results(query, 1)
1246 elif prefix == 'all':
1247 self._download_n_results(query, self._max_youtube_results)
1253 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1255 elif n > self._max_youtube_results:
1256 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1257 n = self._max_youtube_results
1258 self._download_n_results(query, n)
1260 except ValueError: # parsing prefix as integer fails
1261 self._download_n_results(query, 1)
1264 def _download_n_results(self, query, n):
1265 """Downloads a specified number of results for a query"""
1271 while (50 * pagenum) < limit:
1272 self.report_download_page(query, pagenum+1)
1273 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1274 request = urllib2.Request(result_url)
1276 data = urllib2.urlopen(request).read()
1277 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1278 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1280 api_response = json.loads(data)['data']
1282 new_ids = list(video['id'] for video in api_response['items'])
1283 video_ids += new_ids
1285 limit = min(n, api_response['totalItems'])
1288 if len(video_ids) > n:
1289 video_ids = video_ids[:n]
1290 for id in video_ids:
1291 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1295 class GoogleSearchIE(InfoExtractor):
1296 """Information Extractor for Google Video search queries."""
1297 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1298 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1299 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1300 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1301 _max_google_results = 1000
1302 IE_NAME = u'video.google:search'
1304 def __init__(self, downloader=None):
1305 InfoExtractor.__init__(self, downloader)
1307 def report_download_page(self, query, pagenum):
1308 """Report attempt to download playlist page with given number."""
1309 query = query.decode(preferredencoding())
1310 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1312 def _real_extract(self, query):
1313 mobj = re.match(self._VALID_URL, query)
1315 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1318 prefix, query = query.split(':')
1320 query = query.encode('utf-8')
1322 self._download_n_results(query, 1)
1324 elif prefix == 'all':
1325 self._download_n_results(query, self._max_google_results)
1331 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1333 elif n > self._max_google_results:
1334 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1335 n = self._max_google_results
1336 self._download_n_results(query, n)
1338 except ValueError: # parsing prefix as integer fails
1339 self._download_n_results(query, 1)
1342 def _download_n_results(self, query, n):
1343 """Downloads a specified number of results for a query"""
1349 self.report_download_page(query, pagenum)
1350 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1351 request = urllib2.Request(result_url)
1353 page = urllib2.urlopen(request).read()
1354 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1355 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1358 # Extract video identifiers
1359 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1360 video_id = mobj.group(1)
1361 if video_id not in video_ids:
1362 video_ids.append(video_id)
1363 if len(video_ids) == n:
1364 # Specified n videos reached
1365 for id in video_ids:
1366 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1369 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1370 for id in video_ids:
1371 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1374 pagenum = pagenum + 1
1377 class YahooSearchIE(InfoExtractor):
1378 """Information Extractor for Yahoo! Video search queries."""
1379 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1380 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1381 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1382 _MORE_PAGES_INDICATOR = r'\s*Next'
1383 _max_yahoo_results = 1000
1384 IE_NAME = u'video.yahoo:search'
1386 def __init__(self, downloader=None):
1387 InfoExtractor.__init__(self, downloader)
1389 def report_download_page(self, query, pagenum):
1390 """Report attempt to download playlist page with given number."""
1391 query = query.decode(preferredencoding())
1392 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1394 def _real_extract(self, query):
1395 mobj = re.match(self._VALID_URL, query)
1397 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1400 prefix, query = query.split(':')
1402 query = query.encode('utf-8')
1404 self._download_n_results(query, 1)
1406 elif prefix == 'all':
1407 self._download_n_results(query, self._max_yahoo_results)
1413 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1415 elif n > self._max_yahoo_results:
1416 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1417 n = self._max_yahoo_results
1418 self._download_n_results(query, n)
1420 except ValueError: # parsing prefix as integer fails
1421 self._download_n_results(query, 1)
1424 def _download_n_results(self, query, n):
1425 """Downloads a specified number of results for a query"""
1428 already_seen = set()
1432 self.report_download_page(query, pagenum)
1433 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1434 request = urllib2.Request(result_url)
1436 page = urllib2.urlopen(request).read()
1437 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1438 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1441 # Extract video identifiers
1442 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1443 video_id = mobj.group(1)
1444 if video_id not in already_seen:
1445 video_ids.append(video_id)
1446 already_seen.add(video_id)
1447 if len(video_ids) == n:
1448 # Specified n videos reached
1449 for id in video_ids:
1450 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1453 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1454 for id in video_ids:
1455 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1458 pagenum = pagenum + 1
1461 class YoutubePlaylistIE(InfoExtractor):
1462 """Information Extractor for YouTube playlists."""
1464 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1465 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1466 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
1467 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1468 IE_NAME = u'youtube:playlist'
1470 def __init__(self, downloader=None):
1471 InfoExtractor.__init__(self, downloader)
1473 def report_download_page(self, playlist_id, pagenum):
1474 """Report attempt to download playlist page with given number."""
1475 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1477 def _real_extract(self, url):
1478 # Extract playlist id
1479 mobj = re.match(self._VALID_URL, url)
1481 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1485 if mobj.group(3) is not None:
1486 self._downloader.download([mobj.group(3)])
1489 # Download playlist pages
1490 # prefix is 'p' as default for playlists but there are other types that need extra care
1491 playlist_prefix = mobj.group(1)
1492 if playlist_prefix == 'a':
1493 playlist_access = 'artist'
1495 playlist_prefix = 'p'
1496 playlist_access = 'view_play_list'
1497 playlist_id = mobj.group(2)
1502 self.report_download_page(playlist_id, pagenum)
1503 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1504 request = urllib2.Request(url)
1506 page = urllib2.urlopen(request).read()
1507 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1508 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1511 # Extract video identifiers
1513 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1514 if mobj.group(1) not in ids_in_page:
1515 ids_in_page.append(mobj.group(1))
1516 video_ids.extend(ids_in_page)
1518 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1520 pagenum = pagenum + 1
1522 playliststart = self._downloader.params.get('playliststart', 1) - 1
1523 playlistend = self._downloader.params.get('playlistend', -1)
1524 if playlistend == -1:
1525 video_ids = video_ids[playliststart:]
1527 video_ids = video_ids[playliststart:playlistend]
1529 for id in video_ids:
1530 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1534 class YoutubeUserIE(InfoExtractor):
1535 """Information Extractor for YouTube users."""
1537 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1538 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1539 _GDATA_PAGE_SIZE = 50
1540 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1541 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1542 IE_NAME = u'youtube:user'
1544 def __init__(self, downloader=None):
1545 InfoExtractor.__init__(self, downloader)
1547 def report_download_page(self, username, start_index):
1548 """Report attempt to download user page."""
1549 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1550 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1552 def _real_extract(self, url):
1554 mobj = re.match(self._VALID_URL, url)
1556 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1559 username = mobj.group(1)
1561 # Download video ids using YouTube Data API. Result size per
1562 # query is limited (currently to 50 videos) so we need to query
1563 # page by page until there are no video ids - it means we got
1570 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1571 self.report_download_page(username, start_index)
1573 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1576 page = urllib2.urlopen(request).read()
1577 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1578 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1581 # Extract video identifiers
1584 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1585 if mobj.group(1) not in ids_in_page:
1586 ids_in_page.append(mobj.group(1))
1588 video_ids.extend(ids_in_page)
1590 # A little optimization - if current page is not
1591 # "full", ie. does not contain PAGE_SIZE video ids then
1592 # we can assume that this page is the last one - there
1593 # are no more ids on further pages - no need to query
1596 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1601 all_ids_count = len(video_ids)
1602 playliststart = self._downloader.params.get('playliststart', 1) - 1
1603 playlistend = self._downloader.params.get('playlistend', -1)
1605 if playlistend == -1:
1606 video_ids = video_ids[playliststart:]
1608 video_ids = video_ids[playliststart:playlistend]
1610 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1611 (username, all_ids_count, len(video_ids)))
1613 for video_id in video_ids:
1614 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1617 class DepositFilesIE(InfoExtractor):
1618 """Information extractor for depositfiles.com"""
1620 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1621 IE_NAME = u'DepositFiles'
1623 def __init__(self, downloader=None):
1624 InfoExtractor.__init__(self, downloader)
1626 def report_download_webpage(self, file_id):
1627 """Report webpage download."""
1628 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1630 def report_extraction(self, file_id):
1631 """Report information extraction."""
1632 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1634 def _real_extract(self, url):
1635 file_id = url.split('/')[-1]
1636 # Rebuild url in english locale
1637 url = 'http://depositfiles.com/en/files/' + file_id
1639 # Retrieve file webpage with 'Free download' button pressed
1640 free_download_indication = { 'gateway_result' : '1' }
1641 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1643 self.report_download_webpage(file_id)
1644 webpage = urllib2.urlopen(request).read()
1645 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1646 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1649 # Search for the real file URL
1650 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1651 if (mobj is None) or (mobj.group(1) is None):
1652 # Try to figure out reason of the error.
1653 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1654 if (mobj is not None) and (mobj.group(1) is not None):
1655 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1656 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1658 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1661 file_url = mobj.group(1)
1662 file_extension = os.path.splitext(file_url)[1][1:]
1664 # Search for file title
1665 mobj = re.search(r'<b title="(.*?)">', webpage)
1667 self._downloader.trouble(u'ERROR: unable to extract title')
1669 file_title = mobj.group(1).decode('utf-8')
1672 'id': file_id.decode('utf-8'),
1673 'url': file_url.decode('utf-8'),
1675 'upload_date': u'NA',
1676 'title': file_title,
1677 'ext': file_extension.decode('utf-8'),
1683 class FacebookIE(InfoExtractor):
1684 """Information Extractor for Facebook"""
1686 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1687 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1688 _NETRC_MACHINE = 'facebook'
1689 _available_formats = ['video', 'highqual', 'lowqual']
1690 _video_extensions = {
1695 IE_NAME = u'facebook'
1697 def __init__(self, downloader=None):
1698 InfoExtractor.__init__(self, downloader)
1700 def _reporter(self, message):
1701 """Add header and report message."""
1702 self._downloader.to_screen(u'[facebook] %s' % message)
1704 def report_login(self):
1705 """Report attempt to log in."""
1706 self._reporter(u'Logging in')
1708 def report_video_webpage_download(self, video_id):
1709 """Report attempt to download video webpage."""
1710 self._reporter(u'%s: Downloading video webpage' % video_id)
1712 def report_information_extraction(self, video_id):
1713 """Report attempt to extract video information."""
1714 self._reporter(u'%s: Extracting video information' % video_id)
1716 def _parse_page(self, video_webpage):
1717 """Extract video information from page"""
1719 data = {'title': r'\("video_title", "(.*?)"\)',
1720 'description': r'<div class="datawrap">(.*?)</div>',
1721 'owner': r'\("video_owner_name", "(.*?)"\)',
1722 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1725 for piece in data.keys():
1726 mobj = re.search(data[piece], video_webpage)
1727 if mobj is not None:
1728 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1732 for fmt in self._available_formats:
1733 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1734 if mobj is not None:
1735 # URL is in a Javascript segment inside an escaped Unicode format within
1736 # the generally utf-8 page
1737 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1738 video_info['video_urls'] = video_urls
1742 def _real_initialize(self):
1743 if self._downloader is None:
1748 downloader_params = self._downloader.params
1750 # Attempt to use provided username and password or .netrc data
1751 if downloader_params.get('username', None) is not None:
1752 useremail = downloader_params['username']
1753 password = downloader_params['password']
1754 elif downloader_params.get('usenetrc', False):
1756 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1757 if info is not None:
1761 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1762 except (IOError, netrc.NetrcParseError), err:
1763 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1766 if useremail is None:
1775 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1778 login_results = urllib2.urlopen(request).read()
1779 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1780 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1786 def _real_extract(self, url):
1787 mobj = re.match(self._VALID_URL, url)
1789 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1791 video_id = mobj.group('ID')
1794 self.report_video_webpage_download(video_id)
1795 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1797 page = urllib2.urlopen(request)
1798 video_webpage = page.read()
1799 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1800 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1803 # Start extracting information
1804 self.report_information_extraction(video_id)
1806 # Extract information
1807 video_info = self._parse_page(video_webpage)
1810 if 'owner' not in video_info:
1811 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1813 video_uploader = video_info['owner']
1816 if 'title' not in video_info:
1817 self._downloader.trouble(u'ERROR: unable to extract video title')
1819 video_title = video_info['title']
1820 video_title = video_title.decode('utf-8')
1823 if 'thumbnail' not in video_info:
1824 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1825 video_thumbnail = ''
1827 video_thumbnail = video_info['thumbnail']
1831 if 'upload_date' in video_info:
1832 upload_time = video_info['upload_date']
1833 timetuple = email.utils.parsedate_tz(upload_time)
1834 if timetuple is not None:
1836 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1841 video_description = video_info.get('description', 'No description available.')
1843 url_map = video_info['video_urls']
1844 if len(url_map.keys()) > 0:
1845 # Decide which formats to download
1846 req_format = self._downloader.params.get('format', None)
1847 format_limit = self._downloader.params.get('format_limit', None)
1849 if format_limit is not None and format_limit in self._available_formats:
1850 format_list = self._available_formats[self._available_formats.index(format_limit):]
1852 format_list = self._available_formats
1853 existing_formats = [x for x in format_list if x in url_map]
1854 if len(existing_formats) == 0:
1855 self._downloader.trouble(u'ERROR: no known formats available for video')
1857 if req_format is None:
1858 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1859 elif req_format == 'worst':
1860 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1861 elif req_format == '-1':
1862 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1865 if req_format not in url_map:
1866 self._downloader.trouble(u'ERROR: requested format not available')
1868 video_url_list = [(req_format, url_map[req_format])] # Specific format
1871 for format_param, video_real_url in video_url_list:
1873 video_extension = self._video_extensions.get(format_param, 'mp4')
1876 'id': video_id.decode('utf-8'),
1877 'url': video_real_url.decode('utf-8'),
1878 'uploader': video_uploader.decode('utf-8'),
1879 'upload_date': upload_date,
1880 'title': video_title,
1881 'ext': video_extension.decode('utf-8'),
1882 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1883 'thumbnail': video_thumbnail.decode('utf-8'),
1884 'description': video_description.decode('utf-8'),
1889 class BlipTVIE(InfoExtractor):
1890 """Information extractor for blip.tv"""
1892 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1893 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1894 IE_NAME = u'blip.tv'
1896 def report_extraction(self, file_id):
1897 """Report information extraction."""
1898 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1900 def report_direct_download(self, title):
1901 """Report information extraction."""
1902 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
1904 def _real_extract(self, url):
1905 mobj = re.match(self._VALID_URL, url)
1907 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1914 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1915 request = urllib2.Request(json_url)
1916 self.report_extraction(mobj.group(1))
1919 urlh = urllib2.urlopen(request)
1920 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1921 basename = url.split('/')[-1]
1922 title,ext = os.path.splitext(basename)
1923 title = title.decode('UTF-8')
1924 ext = ext.replace('.', '')
1925 self.report_direct_download(title)
1933 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1934 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1936 if info is None: # Regular URL
1938 json_code = urlh.read()
1939 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1940 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
1944 json_data = json.loads(json_code)
1945 if 'Post' in json_data:
1946 data = json_data['Post']
1950 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1951 video_url = data['media']['url']
1952 umobj = re.match(self._URL_EXT, video_url)
1954 raise ValueError('Can not determine filename extension')
1955 ext = umobj.group(1)
1958 'id': data['item_id'],
1960 'uploader': data['display_name'],
1961 'upload_date': upload_date,
1962 'title': data['title'],
1964 'format': data['media']['mimeType'],
1965 'thumbnail': data['thumbnailUrl'],
1966 'description': data['description'],
1967 'player_url': data['embedUrl']
1969 except (ValueError,KeyError), err:
1970 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
1976 class MyVideoIE(InfoExtractor):
1977 """Information Extractor for myvideo.de."""
1979 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1980 IE_NAME = u'myvideo'
1982 def __init__(self, downloader=None):
1983 InfoExtractor.__init__(self, downloader)
1985 def report_download_webpage(self, video_id):
1986 """Report webpage download."""
1987 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1989 def report_extraction(self, video_id):
1990 """Report information extraction."""
1991 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
1993 def _real_extract(self,url):
1994 mobj = re.match(self._VALID_URL, url)
1996 self._download.trouble(u'ERROR: invalid URL: %s' % url)
1999 video_id = mobj.group(1)
2002 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2004 self.report_download_webpage(video_id)
2005 webpage = urllib2.urlopen(request).read()
2006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2007 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2010 self.report_extraction(video_id)
2011 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2014 self._downloader.trouble(u'ERROR: unable to extract media URL')
2016 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2018 mobj = re.search('<title>([^<]+)</title>', webpage)
2020 self._downloader.trouble(u'ERROR: unable to extract title')
2023 video_title = mobj.group(1)
2029 'upload_date': u'NA',
2030 'title': video_title,
2036 class ComedyCentralIE(InfoExtractor):
2037 """Information extractor for The Daily Show and Colbert Report """
2039 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2040 IE_NAME = u'comedycentral'
2042 def report_extraction(self, episode_id):
2043 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2045 def report_config_download(self, episode_id):
2046 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2048 def report_index_download(self, episode_id):
2049 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2051 def report_player_url(self, episode_id):
2052 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2054 def _real_extract(self, url):
2055 mobj = re.match(self._VALID_URL, url)
2057 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2060 if mobj.group('shortname'):
2061 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2062 url = u'http://www.thedailyshow.com/full-episodes/'
2064 url = u'http://www.colbertnation.com/full-episodes/'
2065 mobj = re.match(self._VALID_URL, url)
2066 assert mobj is not None
2068 dlNewest = not mobj.group('episode')
2070 epTitle = mobj.group('showname')
2072 epTitle = mobj.group('episode')
2074 req = urllib2.Request(url)
2075 self.report_extraction(epTitle)
2077 htmlHandle = urllib2.urlopen(req)
2078 html = htmlHandle.read()
2079 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2080 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2083 url = htmlHandle.geturl()
2084 mobj = re.match(self._VALID_URL, url)
2086 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2088 if mobj.group('episode') == '':
2089 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2091 epTitle = mobj.group('episode')
2093 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2094 if len(mMovieParams) == 0:
2095 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2098 playerUrl_raw = mMovieParams[0][0]
2099 self.report_player_url(epTitle)
2101 urlHandle = urllib2.urlopen(playerUrl_raw)
2102 playerUrl = urlHandle.geturl()
2103 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2104 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2107 uri = mMovieParams[0][1]
2108 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2109 self.report_index_download(epTitle)
2111 indexXml = urllib2.urlopen(indexUrl).read()
2112 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2113 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2118 idoc = xml.etree.ElementTree.fromstring(indexXml)
2119 itemEls = idoc.findall('.//item')
2120 for itemEl in itemEls:
2121 mediaId = itemEl.findall('./guid')[0].text
2122 shortMediaId = mediaId.split(':')[-1]
2123 showId = mediaId.split(':')[-2].replace('.com', '')
2124 officialTitle = itemEl.findall('./title')[0].text
2125 officialDate = itemEl.findall('./pubDate')[0].text
2127 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2128 urllib.urlencode({'uri': mediaId}))
2129 configReq = urllib2.Request(configUrl)
2130 self.report_config_download(epTitle)
2132 configXml = urllib2.urlopen(configReq).read()
2133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2134 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2137 cdoc = xml.etree.ElementTree.fromstring(configXml)
2139 for rendition in cdoc.findall('.//rendition'):
2140 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2144 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2147 # For now, just pick the highest bitrate
2148 format,video_url = turls[-1]
2150 effTitle = showId + u'-' + epTitle
2155 'upload_date': officialDate,
2160 'description': officialTitle,
2161 'player_url': playerUrl
2164 results.append(info)
2169 class EscapistIE(InfoExtractor):
2170 """Information extractor for The Escapist """
2172 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2173 IE_NAME = u'escapist'
2175 def report_extraction(self, showName):
2176 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2178 def report_config_download(self, showName):
2179 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2181 def _real_extract(self, url):
2182 mobj = re.match(self._VALID_URL, url)
2184 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2186 showName = mobj.group('showname')
2187 videoId = mobj.group('episode')
2189 self.report_extraction(showName)
2191 webPageBytes = urllib2.urlopen(url).read()
2192 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2193 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2196 webPage = webPageBytes.decode('utf-8')
2197 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2198 description = unescapeHTML(descMatch.group(1))
2199 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2200 imgUrl = unescapeHTML(imgMatch.group(1))
2201 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2202 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2203 configUrlMatch = re.search('config=(.*)$', playerUrl)
2204 configUrl = urllib2.unquote(configUrlMatch.group(1))
2206 self.report_config_download(showName)
2208 configJSON = urllib2.urlopen(configUrl).read()
2209 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2210 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2213 # Technically, it's JavaScript, not JSON
2214 configJSON = configJSON.replace("'", '"')
2217 config = json.loads(configJSON)
2218 except (ValueError,), err:
2219 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2222 playlist = config['playlist']
2223 videoUrl = playlist[1]['url']
2228 'uploader': showName,
2229 'upload_date': None,
2233 'thumbnail': imgUrl,
2234 'description': description,
2235 'player_url': playerUrl,
2241 class CollegeHumorIE(InfoExtractor):
2242 """Information extractor for collegehumor.com"""
2244 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2245 IE_NAME = u'collegehumor'
2247 def report_webpage(self, video_id):
2248 """Report information extraction."""
2249 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2251 def report_extraction(self, video_id):
2252 """Report information extraction."""
2253 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2255 def _real_extract(self, url):
2256 mobj = re.match(self._VALID_URL, url)
2258 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2260 video_id = mobj.group('videoid')
2262 self.report_webpage(video_id)
2263 request = urllib2.Request(url)
2265 webpage = urllib2.urlopen(request).read()
2266 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2267 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2270 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2272 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2274 internal_video_id = m.group('internalvideoid')
2278 'internal_id': internal_video_id,
2281 self.report_extraction(video_id)
2282 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2284 metaXml = urllib2.urlopen(xmlUrl).read()
2285 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2286 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2289 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2291 videoNode = mdoc.findall('./video')[0]
2292 info['description'] = videoNode.findall('./description')[0].text
2293 info['title'] = videoNode.findall('./caption')[0].text
2294 info['url'] = videoNode.findall('./file')[0].text
2295 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2296 info['ext'] = info['url'].rpartition('.')[2]
2297 info['format'] = info['ext']
2299 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2305 class XVideosIE(InfoExtractor):
2306 """Information extractor for xvideos.com"""
2308 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2309 IE_NAME = u'xvideos'
2311 def report_webpage(self, video_id):
2312 """Report information extraction."""
2313 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2315 def report_extraction(self, video_id):
2316 """Report information extraction."""
2317 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2319 def _real_extract(self, url):
2320 mobj = re.match(self._VALID_URL, url)
2322 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2324 video_id = mobj.group(1).decode('utf-8')
2326 self.report_webpage(video_id)
2328 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2330 webpage = urllib2.urlopen(request).read()
2331 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2332 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2335 self.report_extraction(video_id)
2339 mobj = re.search(r'flv_url=(.+?)&', webpage)
2341 self._downloader.trouble(u'ERROR: unable to extract video url')
2343 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2347 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2349 self._downloader.trouble(u'ERROR: unable to extract video title')
2351 video_title = mobj.group(1).decode('utf-8')
2354 # Extract video thumbnail
2355 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
2357 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2359 video_thumbnail = mobj.group(1).decode('utf-8')
2365 'upload_date': None,
2366 'title': video_title,
2369 'thumbnail': video_thumbnail,
2370 'description': None,
2377 class SoundcloudIE(InfoExtractor):
2378 """Information extractor for soundcloud.com
2379 To access the media, the uid of the song and a stream token
2380 must be extracted from the page source and the script must make
2381 a request to media.soundcloud.com/crossdomain.xml. Then
2382 the media can be grabbed by requesting from an url composed
2383 of the stream token and uid
2386 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2387 IE_NAME = u'soundcloud'
2389 def __init__(self, downloader=None):
2390 InfoExtractor.__init__(self, downloader)
2392 def report_webpage(self, video_id):
2393 """Report information extraction."""
2394 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2396 def report_extraction(self, video_id):
2397 """Report information extraction."""
2398 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2400 def _real_extract(self, url):
2401 mobj = re.match(self._VALID_URL, url)
2403 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2406 # extract uploader (which is in the url)
2407 uploader = mobj.group(1).decode('utf-8')
2408 # extract simple title (uploader + slug of song title)
2409 slug_title = mobj.group(2).decode('utf-8')
2410 simple_title = uploader + u'-' + slug_title
2412 self.report_webpage('%s/%s' % (uploader, slug_title))
2414 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2416 webpage = urllib2.urlopen(request).read()
2417 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2418 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2421 self.report_extraction('%s/%s' % (uploader, slug_title))
2423 # extract uid and stream token that soundcloud hands out for access
2424 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2426 video_id = mobj.group(1)
2427 stream_token = mobj.group(2)
2429 # extract unsimplified title
2430 mobj = re.search('"title":"(.*?)",', webpage)
2432 title = mobj.group(1).decode('utf-8')
2434 title = simple_title
2436 # construct media url (with uid/token)
2437 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2438 mediaURL = mediaURL % (video_id, stream_token)
2441 description = u'No description available'
2442 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2444 description = mobj.group(1)
2448 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2451 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2452 except Exception, e:
2455 # for soundcloud, a request to a cross domain is required for cookies
2456 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2459 'id': video_id.decode('utf-8'),
2461 'uploader': uploader.decode('utf-8'),
2462 'upload_date': upload_date,
2467 'description': description.decode('utf-8')
2471 class InfoQIE(InfoExtractor):
2472 """Information extractor for infoq.com"""
2474 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2477 def report_webpage(self, video_id):
2478 """Report information extraction."""
2479 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2481 def report_extraction(self, video_id):
2482 """Report information extraction."""
2483 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2485 def _real_extract(self, url):
2486 mobj = re.match(self._VALID_URL, url)
2488 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2491 self.report_webpage(url)
2493 request = urllib2.Request(url)
2495 webpage = urllib2.urlopen(request).read()
2496 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2497 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2500 self.report_extraction(url)
2504 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2506 self._downloader.trouble(u'ERROR: unable to extract video url')
2508 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2512 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2514 self._downloader.trouble(u'ERROR: unable to extract video title')
2516 video_title = mobj.group(1).decode('utf-8')
2518 # Extract description
2519 video_description = u'No description available.'
2520 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2521 if mobj is not None:
2522 video_description = mobj.group(1).decode('utf-8')
2524 video_filename = video_url.split('/')[-1]
2525 video_id, extension = video_filename.split('.')
2531 'upload_date': None,
2532 'title': video_title,
2534 'format': extension, # Extension is always(?) mp4, but seems to be flv
2536 'description': video_description,
2542 class MixcloudIE(InfoExtractor):
2543 """Information extractor for www.mixcloud.com"""
2544 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2545 IE_NAME = u'mixcloud'
2547 def __init__(self, downloader=None):
2548 InfoExtractor.__init__(self, downloader)
2550 def report_download_json(self, file_id):
2551 """Report JSON download."""
2552 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2554 def report_extraction(self, file_id):
2555 """Report information extraction."""
2556 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2558 def get_urls(self, jsonData, fmt, bitrate='best'):
2559 """Get urls from 'audio_formats' section in json"""
2562 bitrate_list = jsonData[fmt]
2563 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2564 bitrate = max(bitrate_list) # select highest
2566 url_list = jsonData[fmt][bitrate]
2567 except TypeError: # we have no bitrate info.
2568 url_list = jsonData[fmt]
2571 def check_urls(self, url_list):
2572 """Returns 1st active url from list"""
2573 for url in url_list:
2575 urllib2.urlopen(url)
2577 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2582 def _print_formats(self, formats):
2583 print 'Available formats:'
2584 for fmt in formats.keys():
2585 for b in formats[fmt]:
2587 ext = formats[fmt][b][0]
2588 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2589 except TypeError: # we have no bitrate info
2590 ext = formats[fmt][0]
2591 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2594 def _real_extract(self, url):
2595 mobj = re.match(self._VALID_URL, url)
2597 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2599 # extract uploader & filename from url
2600 uploader = mobj.group(1).decode('utf-8')
2601 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2603 # construct API request
2604 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2605 # retrieve .json file with links to files
2606 request = urllib2.Request(file_url)
2608 self.report_download_json(file_url)
2609 jsonData = urllib2.urlopen(request).read()
2610 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2611 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2615 json_data = json.loads(jsonData)
2616 player_url = json_data['player_swf_url']
2617 formats = dict(json_data['audio_formats'])
2619 req_format = self._downloader.params.get('format', None)
2622 if self._downloader.params.get('listformats', None):
2623 self._print_formats(formats)
2626 if req_format is None or req_format == 'best':
2627 for format_param in formats.keys():
2628 url_list = self.get_urls(formats, format_param)
2630 file_url = self.check_urls(url_list)
2631 if file_url is not None:
2634 if req_format not in formats.keys():
2635 self._downloader.trouble(u'ERROR: format is not available')
2638 url_list = self.get_urls(formats, req_format)
2639 file_url = self.check_urls(url_list)
2640 format_param = req_format
2643 'id': file_id.decode('utf-8'),
2644 'url': file_url.decode('utf-8'),
2645 'uploader': uploader.decode('utf-8'),
2646 'upload_date': u'NA',
2647 'title': json_data['name'],
2648 'ext': file_url.split('.')[-1].decode('utf-8'),
2649 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2650 'thumbnail': json_data['thumbnail_url'],
2651 'description': json_data['description'],
2652 'player_url': player_url.decode('utf-8'),
2655 class StanfordOpenClassroomIE(InfoExtractor):
2656 """Information extractor for Stanford's Open ClassRoom"""
2658 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2659 IE_NAME = u'stanfordoc'
2661 def report_download_webpage(self, objid):
2662 """Report information extraction."""
2663 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2665 def report_extraction(self, video_id):
2666 """Report information extraction."""
2667 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2669 def _real_extract(self, url):
2670 mobj = re.match(self._VALID_URL, url)
2672 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2675 if mobj.group('course') and mobj.group('video'): # A specific video
2676 course = mobj.group('course')
2677 video = mobj.group('video')
2679 'id': course + '_' + video,
2682 self.report_extraction(info['id'])
2683 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2684 xmlUrl = baseUrl + video + '.xml'
2686 metaXml = urllib2.urlopen(xmlUrl).read()
2687 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2688 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2690 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2692 info['title'] = mdoc.findall('./title')[0].text
2693 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2695 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2697 info['ext'] = info['url'].rpartition('.')[2]
2698 info['format'] = info['ext']
2700 elif mobj.group('course'): # A course page
2701 course = mobj.group('course')
2707 self.report_download_webpage(info['id'])
2709 coursepage = urllib2.urlopen(url).read()
2710 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2711 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2714 m = re.search('<h1>([^<]+)</h1>', coursepage)
2716 info['title'] = unescapeHTML(m.group(1))
2718 info['title'] = info['id']
2720 m = re.search('<description>([^<]+)</description>', coursepage)
2722 info['description'] = unescapeHTML(m.group(1))
2724 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2727 'type': 'reference',
2728 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2732 for entry in info['list']:
2733 assert entry['type'] == 'reference'
2734 results += self.extract(entry['url'])
2739 'id': 'Stanford OpenClassroom',
2743 self.report_download_webpage(info['id'])
2744 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2746 rootpage = urllib2.urlopen(rootURL).read()
2747 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2748 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2751 info['title'] = info['id']
2753 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2756 'type': 'reference',
2757 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2762 for entry in info['list']:
2763 assert entry['type'] == 'reference'
2764 results += self.extract(entry['url'])
2767 class MTVIE(InfoExtractor):
2768 """Information extractor for MTV.com"""
2770 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2773 def report_webpage(self, video_id):
2774 """Report information extraction."""
2775 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2777 def report_extraction(self, video_id):
2778 """Report information extraction."""
2779 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2781 def _real_extract(self, url):
2782 mobj = re.match(self._VALID_URL, url)
2784 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2786 if not mobj.group('proto'):
2787 url = 'http://' + url
2788 video_id = mobj.group('videoid')
2789 self.report_webpage(video_id)
2791 request = urllib2.Request(url)
2793 webpage = urllib2.urlopen(request).read()
2794 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2795 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2798 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2800 self._downloader.trouble(u'ERROR: unable to extract song name')
2802 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2803 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2805 self._downloader.trouble(u'ERROR: unable to extract performer')
2807 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2808 video_title = performer + ' - ' + song_name
2810 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2812 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2814 mtvn_uri = mobj.group(1)
2816 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2818 self._downloader.trouble(u'ERROR: unable to extract content id')
2820 content_id = mobj.group(1)
2822 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2823 self.report_extraction(video_id)
2824 request = urllib2.Request(videogen_url)
2826 metadataXml = urllib2.urlopen(request).read()
2827 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2828 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2831 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2832 renditions = mdoc.findall('.//rendition')
2834 # For now, always pick the highest quality.
2835 rendition = renditions[-1]
2838 _,_,ext = rendition.attrib['type'].partition('/')
2839 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2840 video_url = rendition.find('./src').text
2842 self._downloader.trouble('Invalid rendition field.')
2848 'uploader': performer,
2849 'title': video_title,