2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
19 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader:    Nickname of the video uploader.
    stitle:      Simplified title.
    ext:         Video filename extension.
    player_url:  SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # NOTE(review): this is a sampled excerpt — some lines of the original
    # file are elided within the bodies below; do not assume they are complete.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def initialize(self):` header for the next two
    # lines is elided from this excerpt.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): sampled excerpt — lines are elided throughout this class
    # (control-flow headers, returns, dict entries); bodies are not complete.

    # Group 1 matches the URL prefix (scheme/host/path up to the id);
    # group 2 is the video id. The trailing `(?(1).+)?` conditional only
    # allows extra characters when a real URL prefix was matched.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Force English UI (hl=en/gl=US) so scraped strings such as dates are stable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension map (remaining entries elided in this excerpt)
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display-size map (entries elided in this excerpt)
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SRT subtitle text."""
        # NOTE(review): initialization of the accumulator (srt = '') and the
        # float conversion of `start` are elided from this excerpt.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            end = start + float(dur)
            # Render hh:mm:ss,mmm — the SRT timestamp format.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # NOTE(review): the `for x in formats:` header is elided from this excerpt.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, log in (explicit credentials or .netrc) and confirm age."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Best-effort: a broken .netrc downgrades to a warning, not an error.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (non-fatal on failure)
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # Log in (login_form dict opener elided in this excerpt)
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # If the response still contains the login form, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age (age_form dict opener elided in this excerpt)
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract metadata and direct media URL(s) for a YouTube watch page."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Strip the JavaScript backslash-escaping (e.g. "\/" -> "/").
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info
        self.report_video_info_webpage_download(video_id)
        # Try several 'el' contexts; different ones succeed for embedded,
        # age-gated or Vevo videos. Stop at the first response with a token.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = simplify_title(video_title)

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:	# don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize '/', ',' and '-' separators to single spaces before strptime.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
                # Language preference: explicit option > English > first listed.
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = srt_lang_list[0]
                if not srt_lang in srt_lang_list:
                    self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
                    request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
                        srt_xml = urllib2.urlopen(request).read()
                    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                    video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
                self._downloader.trouble(u'WARNING: video has no closed captions')

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams expose a single connection URL with no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit (lists are best-first).
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # (dict opener elided in this excerpt)
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                # RTMP entries have format_param None -> report u'NA'.
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST past the family filter."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age (disclaimer_form dict opener elided in this excerpt)
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract metadata and the media URL from a Metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback path: pull the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Undo the JSON backslash-escaped slashes.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the SD media URL from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Opt out of the family filter so restricted pages are still served.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the media URL from a Google Video page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # No direct download URL — fall back to the FLV stream URL.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Decode the JS hex escapes ('=' and '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail is only on the search results page, so a second
            # request is made only when the user explicitly asks for it.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:	# we need something to pass to process_info

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the FLV media URL from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # Title regex captures group(1)=title and group(2)=uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): sampled excerpt — `if`/`try`/`return` lines are elided
    # within the method bodies below.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and the media URL from a Yahoo! Video page.

        new_video guards the one-shot URL rewrite below against recursing
        more than once.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Re-enter with the canonical /watch/ URL; new_video=False
            # prevents a second rewrite pass.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = simplify_title(video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the literal 'people'/'profile' alternation;
        # the uploader name is group(2) — this looks like a bug to confirm.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # (info dict opener elided in this excerpt)
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): 'thumbnail' key is duplicated in this dict; this
            # second, non-decoded value overwrites the decoded one above.
            'thumbnail': video_thumbnail,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Downloads the clip page, pulls the embedded config JSON out of the
    page source, and builds a play_redirect URL from the request
    signature/timestamp plus the best available codec.
    """

    # _VALID_URL matches Vimeo URLs; group 1 is the numeric clip id
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # error path: reached when the URL does not match _VALID_URL
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON by string-splitting the page source.
        # NOTE(review): brittle — breaks if Vimeo changes the surrounding markup.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # error path: config JSON could not be located/parsed
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]
        simple_title = simplify_title(video_title)

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (HTML element with id "description")
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (optional; defaults to NA when not present)
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        # Preference order: h264/mp4, then vp8/flv, then vp6/flv.
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
                else: quality = 'sd'
        # error path: none of the known codecs was present in the config
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                %(video_id, sig, timestamp, quality, video_codec.upper())

        # Fields of the info dictionary handed to the FileDownloader
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific extractor matched: follows URL-shortener
    redirects, then scrapes the page for a JW-Player-style file= URL.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        # HEAD-only request so redirect targets are discovered without
        # downloading response bodies.
        class HeadRequest(urllib2.Request):
            def get_method(self):
        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # percent-encode spaces; strip body-related headers
                    # since a HEAD request carries no body
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    # non-redirect status codes propagate as HTTPError
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener with only the handlers needed for the probe
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened
        if url == new_url: return False

        # Restart the extraction chain with the redirect target
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])

    def _real_extract(self, url):
        if self._test_redirect(url): return

        # Use the last path component as a provisional video id
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = simplify_title(video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): message says 'title' but this branch fails on the
        # uploader/domain extraction — likely a copy-paste error in the text
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Fields of the info dictionary handed to the FileDownloader
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': u'NA',
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts pseudo-URLs of the form ytsearch[N|all]:QUERY, resolves them
    through the GData API, and queues the resulting watch URLs for
    download.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # %s = url-quoted query, %i = 1-based start index; page size fixed at 50
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # error path: query did not match the ytsearch syntax
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result; 'all' -> cap; otherwise numeric count
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
                # error path: numeric prefix was zero/negative
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the API 50 ids at a time until the limit is reached
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
            request = urllib2.Request(result_url)
            data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # never ask for more than the API reports as available
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts gvsearch[N|all]:QUERY pseudo-URLs; scrapes the HTML result
    pages for docids and queues the corresponding videoplay URLs.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    # %s = url-quoted query, %s = 0-based result offset (pagenum * 10)
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # presence of the "next" pagination link means more pages exist
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # error path: query did not match the gvsearch syntax
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result; 'all' -> cap; otherwise numeric count
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
                # error path: numeric prefix was zero/negative
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers (deduplicated, order-preserving)
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No "next page" link -> queue what we have and stop
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts yvsearch[N|all]:QUERY pseudo-URLs; scrapes result pages for
    watch ids and queues the corresponding watch URLs.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    # %s = url-quoted query, %s = result offset
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # error path: query did not match the yvsearch syntax
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix -> single result; 'all' -> cap; otherwise numeric count
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
                # error path: numeric prefix was zero/negative
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # track ids across pages so repeats on later pages are ignored
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No "Next" pagination link -> queue what we have and stop
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through a playlist/artist/course listing, collects watch ids,
    applies the user's playliststart/playlistend window, and queues each
    watch URL for download.
    """

    # group 1: listing type char (p/a/list...); group 2: playlist id;
    # group 3 (optional): a direct video id embedded in the URL
    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # filled in with the playlist id before scanning each page
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # error path: URL did not match _VALID_URL
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Single video case: hand the embedded video id straight back
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers (deduplicated within the page)
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # stop paging once the "Next" link disappears
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply the user's playlist window (1-based start, inclusive end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Fetches a user's uploads via the GData feed, page by page, then
    queues each collected watch URL (subject to the playliststart /
    playlistend window).
    """

    # matches youtube.com/user/NAME URLs and the ytuser:NAME shorthand
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request, hence the paging below
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # error path: URL did not match _VALID_URL
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        # GData start-index is 1-based
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers (deduplicated within the page)
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply the user's playlist window (1-based start, inclusive end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Rebuilds the URL under the English locale, POSTs the "Free download"
    form, then scrapes the real fileshare URL and title from the page.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click)
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # collapse whitespace in the site's restriction notice
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # error path: title element not found
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Fields of the info dictionary handed to the FileDownloader
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': u'NA',
        'title': file_title,
        'stitle': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in with credentials from the command line or .netrc,
    downloads the video page, and parses JS-embedded metadata ("video_title",
    "video_owner_name", format-specific "_src" URLs, ...) to build the info
    dictionary. One info dict is produced per requested format.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # machine name used to look up credentials in ~/.netrc
    _NETRC_MACHINE = 'facebook'
    # format preference order: best first
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General metadata: regex per field, applied over the raw page
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # values are JS-escaped and url-quoted inside the page
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls: one "<fmt>_src" entry per available format
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook, if credentials were supplied."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: continue without login
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # a login <form> in the response means authentication failed
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # error path: URL did not match _VALID_URL
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = simplify_title(video_title)

        # thumbnail image: missing thumbnail is a warning, not fatal
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        # upload date (RFC 2822 date parsed into YYYYMMDD)
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific format requested
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # one info dict per selected format
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Fields of the info dictionary handed to the FileDownloader
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Requests the blip.tv JSON API for the page; if the server answers
    with a video/* Content-Type instead, treats it as a direct download.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # extracts the filename extension from a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # error path: URL did not match _VALID_URL
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask the site for its JSON representation of the page
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        urlh = urllib2.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # minimal info dict for the direct-download case
            'stitle': simplify_title(title),
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if info is None: # Regular URL
            json_code = urlh.read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

            json_data = json.loads(json_code)
            # the payload lives under 'Post' (or at top level on some pages)
            if 'Post' in json_data:
                data = json_data['Post']

            # convert blip's "mm-dd-yy HH:MM(am|pm)" datestamp into YYYYMMDD
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # error path: media URL carries no recognizable extension
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # Fields of the info dictionary handed to the FileDownloader
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'stitle': simplify_title(data['title']),
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl']
        except (ValueError,KeyError), err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2008 class MyVideoIE(InfoExtractor):
2009 """Information Extractor for myvideo.de."""
2011 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2012 IE_NAME = u'myvideo'
2014 def __init__(self, downloader=None):
2015 InfoExtractor.__init__(self, downloader)
2017 def report_download_webpage(self, video_id):
2018 """Report webpage download."""
2019 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2021 def report_extraction(self, video_id):
2022 """Report information extraction."""
2023 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2025 def _real_extract(self,url):
2026 mobj = re.match(self._VALID_URL, url)
2028 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2031 video_id = mobj.group(1)
2034 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2036 self.report_download_webpage(video_id)
2037 webpage = urllib2.urlopen(request).read()
2038 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2039 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2042 self.report_extraction(video_id)
2043 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2046 self._downloader.trouble(u'ERROR: unable to extract media URL')
2048 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2050 mobj = re.search('<title>([^<]+)</title>', webpage)
2052 self._downloader.trouble(u'ERROR: unable to extract title')
2055 video_title = mobj.group(1)
2056 video_title = sanitize_title(video_title)
2058 simple_title = simplify_title(video_title)
2064 'upload_date': u'NA',
2065 'title': video_title,
2066 'stitle': simple_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts short names (':tds', ':colbert', ...) as well as full-episode URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report per-media configuration download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        """Resolve short names / episode URLs, then walk the MRSS feed for media."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Short names map to the show's "latest full episode" page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode means we must follow redirects to the newest one.
        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

        # Re-parse the post-redirect URL to learn which episode we landed on.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # The Flash embed URL doubles as the mtvnservices media URI.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        # Resolve the player URL through its redirects.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        # Fetch the MRSS index listing every media item of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like 'mgid:cms:video:<site>.com:<id>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-media config lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            effTitle = showId + u'-' + epTitle
            'upload_date': officialDate,
            'stitle': simplify_title(effTitle),
            'description': officialTitle,
            'player_url': playerUrl
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report player-configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the media URL via the page's og: meta tags and the player config."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Description, thumbnail and player all come from <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the (url-encoded) config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON: single quotes must become double.
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        # Playlist entry 1 holds the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        'uploader': showName,
        'upload_date': None,
        'stitle': simplify_title(showName),
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Map the public video id to the internal id, then read the moogaloop XML."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The page embeds the internal id the metadata endpoint is keyed on.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')

        'internal_id': internal_video_id,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # Missing elements here surface as IndexError from [0].
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['stitle'] = simplify_title(info['title'])
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        # Extension is taken from the media URL; format mirrors it.
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract media URL, title and thumbnail from the watch page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # The flash player's url-encoded media URL is inlined in the page.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title precedes the ' - XVID' suffix in <title>.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        'upload_date': None,
        'title': video_title,
        'stitle': simplify_title(video_title),
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # group(1) = uploader slug, group(2) = track slug
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Build the media URL from the uid and stream token found in the page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + '-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        title = mobj.group(1)

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        # Description is optional; fall back to a placeholder.
        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        description = mobj.group(1)

        # Parse the human-readable date into YYYYMMDD (best effort).
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:

        # for soundcloud, a request to a cross domain is required for cookies
        # NOTE(review): std_headers is passed as urllib2.Request's 2nd positional
        # argument, which is `data` (making this a POST), not `headers` (3rd
        # positional) — confirm this is intentional.
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        'id': video_id.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': upload_date,
        'title': simple_title.decode('utf-8'),
        'stitle': simple_title.decode('utf-8'),
        'description': description.decode('utf-8')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE media URL (base64-encoded in the page) plus metadata."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The stream path is base64- and url-encoded in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Both id and extension come from the stream URL's final path component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        'upload_date': None,
        'title': video_title,
        'stitle': simplify_title(video_title),
        'format': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # group(1) = uploader slug, group(2) = cloudcast slug
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # A successful open means the mirror is live.
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        """Print a 'format<TAB>bitrate<TAB>[ext]' line for each available choice."""
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        """Query mixcloud's JSON API and pick a format/bitrate per user preference."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Take the first format whose URL list yields a live mirror.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:

        if req_format not in formats.keys():
            self._downloader.trouble(u'ERROR: format is not available')

        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': u'NA',
        'title': json_data['name'],
        'stitle': simplify_title(json_data['name']),
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Three URL shapes: root page, CoursePage (course only), VideoPage (course+video).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: one video, one course, or the whole site (recursive)."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': simplify_title(course + '_' + video),

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['stitle'] = simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'id': simplify_title(course),

            self.report_download_webpage(info['id'])
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            # Fall back to the course id when no <h1> title is present.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']
            info['stitle'] = simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Collect each VideoPage link once, in page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recurse into each referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])

            'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']
            info['stitle'] = simplify_title(info['title'])

            # Collect each CoursePage link once, then recurse into every course.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
2811 class MTVIE(InfoExtractor):
2812 """Information extractor for MTV.com"""
2814 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2817 def report_webpage(self, video_id):
2818 """Report information extraction."""
2819 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2821 def report_extraction(self, video_id):
2822 """Report information extraction."""
2823 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2825 def _real_extract(self, url):
2826 mobj = re.match(self._VALID_URL, url)
2828 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2830 if not mobj.group('proto'):
2831 url = 'http://' + url
2832 video_id = mobj.group('videoid')
2833 self.report_webpage(video_id)
2835 request = urllib2.Request(url)
2837 webpage = urllib2.urlopen(request).read()
2838 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2839 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2842 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2844 self._downloader.trouble(u'ERROR: unable to extract song name')
2846 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2847 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2849 self._downloader.trouble(u'ERROR: unable to extract performer')
2851 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2852 video_title = performer + ' - ' + song_name
2854 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2856 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2858 mtvn_uri = mobj.group(1)
2860 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2862 self._downloader.trouble(u'ERROR: unable to extract content id')
2864 content_id = mobj.group(1)
2866 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2867 self.report_extraction(video_id)
2868 request = urllib2.Request(videogen_url)
2870 metadataXml = urllib2.urlopen(request).read()
2871 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2872 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2875 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2876 renditions = mdoc.findall('.//rendition')
2878 # For now, always pick the highest quality.
2879 rendition = renditions[-1]
2882 _,_,ext = rendition.attrib['type'].partition('/')
2883 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2884 video_url = rendition.find('./src').text
2886 self._downloader.trouble('Invalid rendition field.')
2892 'uploader': performer,
2893 'title': video_title,
2894 'stitle': simplify_title(video_title),