2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include

    uploader:   Nickname of the video uploader.
    ext:        Video filename extension.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the `def initialize(self):` header was missing from
        # this copy of the file — this docstring and the call below were
        # orphaned at class level (a syntax error). The header is restored
        # minimally here; confirm against upstream whether a run-once guard
        # (e.g. a `_ready` flag) also belongs in this method.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # NOTE(review): this copy of the file is truncated — the original line
    # numbering jumps repeatedly. "[elided]" comments below mark statements
    # missing from this copy; the code as shown is not syntactically complete.

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original target of a redirect-style URL (age gate, etc.)
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension.
    # [elided: most entries and the closing brace are missing from this copy]
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-dimensions label.
    # [elided: all entries and the closing brace are missing from this copy]
    _video_dimensions = {

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SRT subtitle text.

        NOTE(review): in this copy the accumulator initialization
        (presumably `srt = ''`), the numeric conversion of `start`, and
        the final `return` are elided; as shown, `start + float(dur)`
        would fail because `start` is still the matched string.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Captions with no explicit duration get a 4-second default
            if not dur: dur = '4'
            # [elided: presumably `start = float(start)`]
            end = start + float(dur)
            # Format both endpoints as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        # [elided: presumably `return srt`]

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print 'Available formats:'
        # [elided: presumably `for x in formats:`]
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))

    def _real_initialize(self):
        """Set language, then log in and confirm age using configured credentials."""
        if self._downloader is None:
            # [elided: presumably `return`]

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [elided: `try:` opener and unpacking of login/password from `info`]
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # [elided: presumably `return`]

        # Set language (best-effort; failure only warns)
        request = urllib2.Request(self._LANG_URL)
        # [elided: `try:` and report_lang()]
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            # [elided: presumably `return`]

        # No authentication to be performed
        # [elided: early return when username is None, and the `login_form = {` opener]
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # [elided: `try:` and report_login()]
        login_results = urllib2.urlopen(request).read()
        # The login form re-appearing in the response means the login failed
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
            # [elided: presumably `return`]
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # [elided: presumably `return`]

        # Confirm age. [elided: the `age_form = {` opener and any other entries]
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        # [elided: `try:`]
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # [elided: presumably `return`]

    def _real_extract(self, url):
        """Extract metadata and downloadable format URLs for a YouTube video."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # [elided: presumably `if mobj is not None:`]
        url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # [elided: presumably `return`]
        video_id = mobj.group(2)

        # Get video webpage (has_verified=1 bypasses the age interstitial)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        # [elided: `try:`]
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # [elided: presumably `return`]

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # [elided: presumably `if mobj is not None:`]
        # Un-escape the backslash-escaped URL found in the page's JS
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        # [elided: presumably `else: player_url = None`]

        # Get video info; try several `el` values until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            # [elided: `try:`]
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
                # [elided: presumably `break`]
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                # [elided: presumably `return`]
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # [elided: presumably `else:`]
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            # [elided: presumably `return`]

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            # [elided: presumably `return`]

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname (required)
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # [elided: presumably `return`]
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title (required)
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # [elided: presumably `return`]
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # Thumbnail (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # [elided: presumably a fallback assignment to video_thumbnail]
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the watch page and normalized to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # [elided: default assignment and presumably `if mobj is not None:`]
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            # [elided: `try:`]
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
            # [elided: the matching `except`, presumably ValueError, ignored]

        # Description (optional; empty string when absent)
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Closed captions (optional, behind the writesubtitles option)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            # [elided: outer `try:` matching the `except Trouble` below]
            self.report_video_subtitles_download(video_id)
            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            # [elided: `try:`]
            srt_list = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
            # Map lang_code -> track name
            srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
            if not srt_lang_list:
                raise Trouble(u'WARNING: video has no closed captions')
            # Pick the language: user choice, then English, then first available
            if self._downloader.params.get('subtitleslang', False):
                srt_lang = self._downloader.params.get('subtitleslang')
            elif 'en' in srt_lang_list:
                # [elided: presumably `srt_lang = 'en'` and the `else:` opener]
                srt_lang = srt_lang_list.keys()[0]
            if not srt_lang in srt_lang_list:
                raise Trouble(u'WARNING: no closed captions found in the specified language')
            request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            # [elided: `try:`]
            srt_xml = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
            # [elided: presumably `if not srt_xml:`]
            raise Trouble(u'WARNING: unable to download video subtitles')
            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                self._downloader.trouble(trouble[0])

        # Session token. NOTE(review): not referenced in the visible span
        # below — presumably consumed by elided code; verify before removing.
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # itag -> direct media URL
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at format_limit (lists are ordered best-first)
                format_list = available_formats[available_formats.index(format_limit):]
            # [elided: presumably `else:`]
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # [elided: presumably `return`]
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # [elided: presumably `return`]
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [elided: presumably `else:`]
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                # [elided: presumably `if rf in url_map:`]
                video_url_list = [(rf, url_map[rf])]
                # [elided: presumably `break`]
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                # [elided: presumably `return`]
        # [elided: presumably `else:`]
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
        # [elided: presumably `return`]

        # Build one result dict per selected format
        for format_param, video_real_url in video_url_list:
            # Extension depends on the itag; flv when unknown
            video_extension = self._video_extensions.get(format_param, 'flv')
            # [elided: result-list append / dict opener]
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
            # [elided: dict close, append close, and the final return]
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""
    # NOTE(review): this copy of the file is truncated; "[elided]" comments
    # mark statements missing from this copy.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and post the age confirmation."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        # [elided: `try:`]
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            # [elided: presumably `return`]

        # Confirm age. [elided: the `disclaimer_form = {` opener and other entries]
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        # [elided: `try:`]
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # [elided: presumably `return`]

    def _real_extract(self, url):
        """Extract video URL, title and uploader from a Metacafe page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # [elided: presumably `return`]

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the YouTube IE
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            # [elided: presumably `return`]

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # [elided: `try:`]
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            # [elided: presumably `return`]

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # [elided: presumably `if mobj is not None:` — the direct-URL branch]
        mediaURL = urllib.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # [elided: presumably the no-key branch and `else:` opener]
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # [elided: presumably `else:` — the flashvars fallback branch opener]
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [elided: presumably `return`]
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            # [elided: presumably `return`]
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [elided: presumably `return`]
        # Un-escape JSON forward slashes
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract title')
        # [elided: presumably `return`]
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        # [elided: presumably `return`]
        video_uploader = mobj.group(1)

        # [elided: result-list / dict opener]
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # [elided: dict close and the final return]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""
    # NOTE(review): this copy of the file is truncated; "[elided]" comments
    # mark statements missing from this copy.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video URL, title and uploader from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # [elided: presumably `return`]

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Disable the family filter so age-restricted pages resolve
        request.add_header('Cookie', 'family_filter=off')
        # [elided: `try:`]
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            # [elided: presumably `return`]

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [elided: presumably `return`]
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [elided: presumably `return`]
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL
        # [elided: presumably `video_url = mediaURL` — confirm against upstream]

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract title')
        # [elided: presumably `return`]
        video_title = unescapeHTML(mobj.group('title').decode('utf-8'))

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        # [elided: presumably `return`]
        video_uploader = mobj.group(1)

        # [elided: result-list / dict opener]
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # [elided: dict close and the final return]
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""
    # NOTE(review): this copy of the file is truncated; "[elided]" comments
    # mark statements missing from this copy.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video URL, title, description and thumbnail from Google Video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # [elided: presumably `return`]

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        # [elided: `try:`]
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            # [elided: presumably `return`]

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # [elided: presumably `if mobj is None:` — fall back to the flash URL]
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [elided: presumably `return`]
        mediaURL = urllib.unquote(mobj.group(1))
        # Un-escape the \xNN sequences embedded in the page's JS string
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')
        # [elided: presumably `video_url = mediaURL` and the non-fallback branch]

        mobj = re.search(r'<title>(.*)</title>', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract title')
        # [elided: presumably `return`]
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract video description')
        # [elided: presumably `return`]
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            # [elided: `try:`]
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                # [elided: presumably `return`]
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            # [elided: presumably `if mobj is None:`]
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            # [elided: presumably `return`]
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            # [elided: presumably a placeholder assignment to video_thumbnail]

        # [elided: result-list / dict opener]
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            # [elided: presumably the 'uploader' entry]
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # [elided: dict close and the final return]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""
    # NOTE(review): this copy of the file is truncated; "[elided]" comments
    # mark statements missing from this copy.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # [elided: presumably `return`]

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # [elided: `try:`]
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            # [elided: presumably `return`]

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [elided: presumably `return`]
        mediaURL = urllib.unquote(mobj.group(1))
        # [elided: presumably `video_url = mediaURL`]

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract title')
        # [elided: presumably `return`]
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # [elided: result-list / dict opener]
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # [elided: dict close and the final return]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""
    # NOTE(review): this copy of the file is truncated; "[elided]" comments
    # mark statements missing from this copy.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from Yahoo! Video.

        Non-'/watch/' URLs are first rewritten to the canonical /watch/ form
        and re-extracted via a single level of recursion (new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # [elided: presumably `return`]

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            # [elided: `try:`]
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                # [elided: presumably `return`]

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # [elided: presumably `if mobj is None:`]
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            # [elided: presumably `return`]
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # [elided: presumably `if mobj is None:`]
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            # [elided: presumably `return`]
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # [elided: `try:`]
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            # [elided: presumably `return`]

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        # [elided: presumably `return`]
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # [elided: presumably `return`]
        # NOTE(review): group(1) here is the 'people'/'profile' path token,
        # not the uploader's display name (group(2)) — looks like a bug;
        # verify against upstream before changing.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # [elided: presumably `return`]
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract video description')
        # [elided: presumably `return`]
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract video height')
        # [elided: presumably `return`]
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: unable to extract video width')
        # [elided: presumably `return`]
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # [elided: `try:`]
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            # [elided: presumably `return`]

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # [elided: presumably `if mobj is None:`]
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        # [elided: presumably `return`]
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # [elided: result-list / dict opener; the 'url' entry also appears to be missing]
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): duplicate 'thumbnail' key — this entry silently
            # overwrites the decoded value above; looks like a bug.
            'thumbnail': video_thumbnail,
        # [elided: dict close and the final return]
979 class VimeoIE(InfoExtractor):
980 """Information extractor for vimeo.com."""
982 # _VALID_URL matches Vimeo URLs
983 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
986 def __init__(self, downloader=None):
987 InfoExtractor.__init__(self, downloader)
989 def report_download_webpage(self, video_id):
990 """Report webpage download."""
991 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
993 def report_extraction(self, video_id):
994 """Report information extraction."""
995 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
997 def _real_extract(self, url, new_video=True):
998 # Extract ID from URL
999 mobj = re.match(self._VALID_URL, url)
1001 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1004 video_id = mobj.group(1)
1006 # Retrieve video webpage to extract further information
1007 request = urllib2.Request(url, None, std_headers)
1009 self.report_download_webpage(video_id)
1010 webpage = urllib2.urlopen(request).read()
1011 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1012 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1015 # Now we begin extracting as much information as we can from what we
1016 # retrieved. First we extract the information common to all extractors,
1017 # and latter we extract those that are Vimeo specific.
1018 self.report_extraction(video_id)
1020 # Extract the config JSON
1021 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1023 config = json.loads(config)
1025 self._downloader.trouble(u'ERROR: unable to extract info section')
1029 video_title = config["video"]["title"]
1032 video_uploader = config["video"]["owner"]["name"]
1034 # Extract video thumbnail
1035 video_thumbnail = config["video"]["thumbnail"]
1037 # Extract video description
1038 video_description = get_element_by_id("description", webpage.decode('utf8'))
1039 if video_description: video_description = clean_html(video_description)
1040 else: video_description = ''
1042 # Extract upload date
1043 video_upload_date = u'NA'
1044 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1045 if mobj is not None:
1046 video_upload_date = mobj.group(1)
1048 # Vimeo specific: extract request signature and timestamp
1049 sig = config['request']['signature']
1050 timestamp = config['request']['timestamp']
1052 # Vimeo specific: extract video codec and quality information
1053 # TODO bind to format param
1054 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1055 for codec in codecs:
1056 if codec[0] in config["video"]["files"]:
1057 video_codec = codec[0]
1058 video_extension = codec[1]
1059 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1060 else: quality = 'sd'
1063 self._downloader.trouble(u'ERROR: no known codec found')
1066 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1067 %(video_id, sig, timestamp, quality, video_codec.upper())
1072 'uploader': video_uploader,
1073 'upload_date': video_upload_date,
1074 'title': video_title,
1075 'ext': video_extension,
1076 'thumbnail': video_thumbnail,
1077 'description': video_description,
1082 class GenericIE(InfoExtractor):
1083 """Generic last-resort information extractor."""
1086 IE_NAME = u'generic'
1088 def __init__(self, downloader=None):
1089 InfoExtractor.__init__(self, downloader)
1091 def report_download_webpage(self, video_id):
1092 """Report webpage download."""
1093 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1094 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1096 def report_extraction(self, video_id):
1097 """Report information extraction."""
1098 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1100 def report_following_redirect(self, new_url):
1101 """Report information extraction."""
1102 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1104 def _test_redirect(self, url):
1105 """Check if it is a redirect, like url shorteners, in case restart chain."""
1106 class HeadRequest(urllib2.Request):
1107 def get_method(self):
1110 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1112 Subclass the HTTPRedirectHandler to make it use our
1113 HeadRequest also on the redirected URL
1115 def redirect_request(self, req, fp, code, msg, headers, newurl):
1116 if code in (301, 302, 303, 307):
1117 newurl = newurl.replace(' ', '%20')
1118 newheaders = dict((k,v) for k,v in req.headers.items()
1119 if k.lower() not in ("content-length", "content-type"))
1120 return HeadRequest(newurl,
1122 origin_req_host=req.get_origin_req_host(),
1125 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1127 class HTTPMethodFallback(urllib2.BaseHandler):
1129 Fallback to GET if HEAD is not allowed (405 HTTP error)
1131 def http_error_405(self, req, fp, code, msg, headers):
1135 newheaders = dict((k,v) for k,v in req.headers.items()
1136 if k.lower() not in ("content-length", "content-type"))
1137 return self.parent.open(urllib2.Request(req.get_full_url(),
1139 origin_req_host=req.get_origin_req_host(),
1143 opener = urllib2.OpenerDirector()
1144 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1145 HTTPMethodFallback, HEADRedirectHandler,
1146 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1147 opener.add_handler(handler())
1149 response = opener.open(HeadRequest(url))
1150 new_url = response.geturl()
1152 if url == new_url: return False
1154 self.report_following_redirect(new_url)
1155 self._downloader.download([new_url])
1158 def _real_extract(self, url):
1159 if self._test_redirect(url): return
1161 video_id = url.split('/')[-1]
1162 request = urllib2.Request(url)
1164 self.report_download_webpage(video_id)
1165 webpage = urllib2.urlopen(request).read()
1166 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1167 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1169 except ValueError, err:
1170 # since this is the last-resort InfoExtractor, if
1171 # this error is thrown, it'll be thrown here
1172 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1175 self.report_extraction(video_id)
1176 # Start with something easy: JW Player in SWFObject
1177 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1179 # Broaden the search a little bit
1180 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1182 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1185 # It's possible that one of the regexes
1186 # matched, but returned an empty group:
1187 if mobj.group(1) is None:
1188 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1191 video_url = urllib.unquote(mobj.group(1))
1192 video_id = os.path.basename(video_url)
1194 # here's a fun little line of code for you:
1195 video_extension = os.path.splitext(video_id)[1][1:]
1196 video_id = os.path.splitext(video_id)[0]
1198 # it's tempting to parse this further, but you would
1199 # have to take into account all the variations like
1200 # Video Title - Site Name
1201 # Site Name | Video Title
1202 # Video Title - Tagline | Site Name
1203 # and so on and so forth; it's just not practical
1204 mobj = re.search(r'<title>(.*)</title>', webpage)
1206 self._downloader.trouble(u'ERROR: unable to extract title')
1208 video_title = mobj.group(1).decode('utf-8')
1210 # video uploader is domain name
1211 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1213 self._downloader.trouble(u'ERROR: unable to extract title')
1215 video_uploader = mobj.group(1).decode('utf-8')
1218 'id': video_id.decode('utf-8'),
1219 'url': video_url.decode('utf-8'),
1220 'uploader': video_uploader,
1221 'upload_date': u'NA',
1222 'title': video_title,
1223 'ext': video_extension.decode('utf-8'),
1229 class YoutubeSearchIE(InfoExtractor):
1230 """Information Extractor for YouTube search queries."""
1231 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1232 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1233 _max_youtube_results = 1000
1234 IE_NAME = u'youtube:search'
1236 def __init__(self, downloader=None):
1237 InfoExtractor.__init__(self, downloader)
1239 def report_download_page(self, query, pagenum):
1240 """Report attempt to download search page with given number."""
1241 query = query.decode(preferredencoding())
1242 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1244 def _real_extract(self, query):
1245 mobj = re.match(self._VALID_URL, query)
1247 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1250 prefix, query = query.split(':')
1252 query = query.encode('utf-8')
1254 self._download_n_results(query, 1)
1256 elif prefix == 'all':
1257 self._download_n_results(query, self._max_youtube_results)
1263 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1265 elif n > self._max_youtube_results:
1266 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1267 n = self._max_youtube_results
1268 self._download_n_results(query, n)
1270 except ValueError: # parsing prefix as integer fails
1271 self._download_n_results(query, 1)
1274 def _download_n_results(self, query, n):
1275 """Downloads a specified number of results for a query"""
1281 while (50 * pagenum) < limit:
1282 self.report_download_page(query, pagenum+1)
1283 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1284 request = urllib2.Request(result_url)
1286 data = urllib2.urlopen(request).read()
1287 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1288 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1290 api_response = json.loads(data)['data']
1292 new_ids = list(video['id'] for video in api_response['items'])
1293 video_ids += new_ids
1295 limit = min(n, api_response['totalItems'])
1298 if len(video_ids) > n:
1299 video_ids = video_ids[:n]
1300 for id in video_ids:
1301 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1305 class GoogleSearchIE(InfoExtractor):
1306 """Information Extractor for Google Video search queries."""
1307 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1308 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1309 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1310 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1311 _max_google_results = 1000
1312 IE_NAME = u'video.google:search'
1314 def __init__(self, downloader=None):
1315 InfoExtractor.__init__(self, downloader)
1317 def report_download_page(self, query, pagenum):
1318 """Report attempt to download playlist page with given number."""
1319 query = query.decode(preferredencoding())
1320 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1322 def _real_extract(self, query):
1323 mobj = re.match(self._VALID_URL, query)
1325 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1328 prefix, query = query.split(':')
1330 query = query.encode('utf-8')
1332 self._download_n_results(query, 1)
1334 elif prefix == 'all':
1335 self._download_n_results(query, self._max_google_results)
1341 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1343 elif n > self._max_google_results:
1344 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1345 n = self._max_google_results
1346 self._download_n_results(query, n)
1348 except ValueError: # parsing prefix as integer fails
1349 self._download_n_results(query, 1)
1352 def _download_n_results(self, query, n):
1353 """Downloads a specified number of results for a query"""
1359 self.report_download_page(query, pagenum)
1360 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1361 request = urllib2.Request(result_url)
1363 page = urllib2.urlopen(request).read()
1364 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1365 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1368 # Extract video identifiers
1369 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1370 video_id = mobj.group(1)
1371 if video_id not in video_ids:
1372 video_ids.append(video_id)
1373 if len(video_ids) == n:
1374 # Specified n videos reached
1375 for id in video_ids:
1376 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1379 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1380 for id in video_ids:
1381 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1384 pagenum = pagenum + 1
1387 class YahooSearchIE(InfoExtractor):
1388 """Information Extractor for Yahoo! Video search queries."""
1389 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1390 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1391 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1392 _MORE_PAGES_INDICATOR = r'\s*Next'
1393 _max_yahoo_results = 1000
1394 IE_NAME = u'video.yahoo:search'
1396 def __init__(self, downloader=None):
1397 InfoExtractor.__init__(self, downloader)
1399 def report_download_page(self, query, pagenum):
1400 """Report attempt to download playlist page with given number."""
1401 query = query.decode(preferredencoding())
1402 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1404 def _real_extract(self, query):
1405 mobj = re.match(self._VALID_URL, query)
1407 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1410 prefix, query = query.split(':')
1412 query = query.encode('utf-8')
1414 self._download_n_results(query, 1)
1416 elif prefix == 'all':
1417 self._download_n_results(query, self._max_yahoo_results)
1423 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1425 elif n > self._max_yahoo_results:
1426 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1427 n = self._max_yahoo_results
1428 self._download_n_results(query, n)
1430 except ValueError: # parsing prefix as integer fails
1431 self._download_n_results(query, 1)
1434 def _download_n_results(self, query, n):
1435 """Downloads a specified number of results for a query"""
1438 already_seen = set()
1442 self.report_download_page(query, pagenum)
1443 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1444 request = urllib2.Request(result_url)
1446 page = urllib2.urlopen(request).read()
1447 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1448 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1451 # Extract video identifiers
1452 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1453 video_id = mobj.group(1)
1454 if video_id not in already_seen:
1455 video_ids.append(video_id)
1456 already_seen.add(video_id)
1457 if len(video_ids) == n:
1458 # Specified n videos reached
1459 for id in video_ids:
1460 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1463 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1464 for id in video_ids:
1465 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1468 pagenum = pagenum + 1
1471 class YoutubePlaylistIE(InfoExtractor):
1472 """Information Extractor for YouTube playlists."""
1474 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1475 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1476 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
1477 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1478 IE_NAME = u'youtube:playlist'
1480 def __init__(self, downloader=None):
1481 InfoExtractor.__init__(self, downloader)
1483 def report_download_page(self, playlist_id, pagenum):
1484 """Report attempt to download playlist page with given number."""
1485 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1487 def _real_extract(self, url):
1488 # Extract playlist id
1489 mobj = re.match(self._VALID_URL, url)
1491 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1495 if mobj.group(3) is not None:
1496 self._downloader.download([mobj.group(3)])
1499 # Download playlist pages
1500 # prefix is 'p' as default for playlists but there are other types that need extra care
1501 playlist_prefix = mobj.group(1)
1502 if playlist_prefix == 'a':
1503 playlist_access = 'artist'
1505 playlist_prefix = 'p'
1506 playlist_access = 'view_play_list'
1507 playlist_id = mobj.group(2)
1512 self.report_download_page(playlist_id, pagenum)
1513 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1514 request = urllib2.Request(url)
1516 page = urllib2.urlopen(request).read()
1517 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1518 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1521 # Extract video identifiers
1523 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1524 if mobj.group(1) not in ids_in_page:
1525 ids_in_page.append(mobj.group(1))
1526 video_ids.extend(ids_in_page)
1528 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1530 pagenum = pagenum + 1
1532 playliststart = self._downloader.params.get('playliststart', 1) - 1
1533 playlistend = self._downloader.params.get('playlistend', -1)
1534 if playlistend == -1:
1535 video_ids = video_ids[playliststart:]
1537 video_ids = video_ids[playliststart:playlistend]
1539 for id in video_ids:
1540 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1544 class YoutubeUserIE(InfoExtractor):
1545 """Information Extractor for YouTube users."""
1547 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1548 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1549 _GDATA_PAGE_SIZE = 50
1550 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1551 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1552 IE_NAME = u'youtube:user'
1554 def __init__(self, downloader=None):
1555 InfoExtractor.__init__(self, downloader)
1557 def report_download_page(self, username, start_index):
1558 """Report attempt to download user page."""
1559 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1560 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1562 def _real_extract(self, url):
1564 mobj = re.match(self._VALID_URL, url)
1566 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1569 username = mobj.group(1)
1571 # Download video ids using YouTube Data API. Result size per
1572 # query is limited (currently to 50 videos) so we need to query
1573 # page by page until there are no video ids - it means we got
1580 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1581 self.report_download_page(username, start_index)
1583 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1586 page = urllib2.urlopen(request).read()
1587 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1588 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1591 # Extract video identifiers
1594 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1595 if mobj.group(1) not in ids_in_page:
1596 ids_in_page.append(mobj.group(1))
1598 video_ids.extend(ids_in_page)
1600 # A little optimization - if current page is not
1601 # "full", ie. does not contain PAGE_SIZE video ids then
1602 # we can assume that this page is the last one - there
1603 # are no more ids on further pages - no need to query
1606 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1611 all_ids_count = len(video_ids)
1612 playliststart = self._downloader.params.get('playliststart', 1) - 1
1613 playlistend = self._downloader.params.get('playlistend', -1)
1615 if playlistend == -1:
1616 video_ids = video_ids[playliststart:]
1618 video_ids = video_ids[playliststart:playlistend]
1620 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1621 (username, all_ids_count, len(video_ids)))
1623 for video_id in video_ids:
1624 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1627 class BlipTVUserIE(InfoExtractor):
1628 """Information Extractor for blip.tv users."""
1630 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1632 IE_NAME = u'blip.tv:user'
1634 def __init__(self, downloader=None):
1635 InfoExtractor.__init__(self, downloader)
1637 def report_download_page(self, username, pagenum):
1638 """Report attempt to download user page."""
1639 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1640 (self.IE_NAME, username, pagenum))
1642 def _real_extract(self, url):
1644 mobj = re.match(self._VALID_URL, url)
1646 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1649 username = mobj.group(1)
1651 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1653 request = urllib2.Request(url)
1656 page = urllib2.urlopen(request).read().decode('utf-8')
1657 mobj = re.search(r'data-users-id="([^"]+)"', page)
1658 page_base = page_base % mobj.group(1)
1659 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1660 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1664 # Download video ids using BlipTV Ajax calls. Result size per
1665 # query is limited (currently to 12 videos) so we need to query
1666 # page by page until there are no video ids - it means we got
1673 self.report_download_page(username, pagenum)
1675 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1678 page = urllib2.urlopen(request).read().decode('utf-8')
1679 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1680 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1683 # Extract video identifiers
1686 for mobj in re.finditer(r'href="/([^"]+)"', page):
1687 if mobj.group(1) not in ids_in_page:
1688 ids_in_page.append(unescapeHTML(mobj.group(1)))
1690 video_ids.extend(ids_in_page)
1692 # A little optimization - if current page is not
1693 # "full", ie. does not contain PAGE_SIZE video ids then
1694 # we can assume that this page is the last one - there
1695 # are no more ids on further pages - no need to query
1698 if len(ids_in_page) < self._PAGE_SIZE:
1703 all_ids_count = len(video_ids)
1704 playliststart = self._downloader.params.get('playliststart', 1) - 1
1705 playlistend = self._downloader.params.get('playlistend', -1)
1707 if playlistend == -1:
1708 video_ids = video_ids[playliststart:]
1710 video_ids = video_ids[playliststart:playlistend]
1712 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1713 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1715 for video_id in video_ids:
1716 self._downloader.download([u'http://blip.tv/'+video_id])
1719 class DepositFilesIE(InfoExtractor):
1720 """Information extractor for depositfiles.com"""
1722 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1723 IE_NAME = u'DepositFiles'
1725 def __init__(self, downloader=None):
1726 InfoExtractor.__init__(self, downloader)
1728 def report_download_webpage(self, file_id):
1729 """Report webpage download."""
1730 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1732 def report_extraction(self, file_id):
1733 """Report information extraction."""
1734 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1736 def _real_extract(self, url):
1737 file_id = url.split('/')[-1]
1738 # Rebuild url in english locale
1739 url = 'http://depositfiles.com/en/files/' + file_id
1741 # Retrieve file webpage with 'Free download' button pressed
1742 free_download_indication = { 'gateway_result' : '1' }
1743 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1745 self.report_download_webpage(file_id)
1746 webpage = urllib2.urlopen(request).read()
1747 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1748 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1751 # Search for the real file URL
1752 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1753 if (mobj is None) or (mobj.group(1) is None):
1754 # Try to figure out reason of the error.
1755 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1756 if (mobj is not None) and (mobj.group(1) is not None):
1757 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1758 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1760 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1763 file_url = mobj.group(1)
1764 file_extension = os.path.splitext(file_url)[1][1:]
1766 # Search for file title
1767 mobj = re.search(r'<b title="(.*?)">', webpage)
1769 self._downloader.trouble(u'ERROR: unable to extract title')
1771 file_title = mobj.group(1).decode('utf-8')
1774 'id': file_id.decode('utf-8'),
1775 'url': file_url.decode('utf-8'),
1777 'upload_date': u'NA',
1778 'title': file_title,
1779 'ext': file_extension.decode('utf-8'),
1785 class FacebookIE(InfoExtractor):
1786 """Information Extractor for Facebook"""
1788 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1789 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1790 _NETRC_MACHINE = 'facebook'
1791 _available_formats = ['video', 'highqual', 'lowqual']
1792 _video_extensions = {
1797 IE_NAME = u'facebook'
1799 def __init__(self, downloader=None):
1800 InfoExtractor.__init__(self, downloader)
1802 def _reporter(self, message):
1803 """Add header and report message."""
1804 self._downloader.to_screen(u'[facebook] %s' % message)
1806 def report_login(self):
1807 """Report attempt to log in."""
1808 self._reporter(u'Logging in')
1810 def report_video_webpage_download(self, video_id):
1811 """Report attempt to download video webpage."""
1812 self._reporter(u'%s: Downloading video webpage' % video_id)
1814 def report_information_extraction(self, video_id):
1815 """Report attempt to extract video information."""
1816 self._reporter(u'%s: Extracting video information' % video_id)
1818 def _parse_page(self, video_webpage):
1819 """Extract video information from page"""
1821 data = {'title': r'\("video_title", "(.*?)"\)',
1822 'description': r'<div class="datawrap">(.*?)</div>',
1823 'owner': r'\("video_owner_name", "(.*?)"\)',
1824 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1827 for piece in data.keys():
1828 mobj = re.search(data[piece], video_webpage)
1829 if mobj is not None:
1830 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1834 for fmt in self._available_formats:
1835 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1836 if mobj is not None:
1837 # URL is in a Javascript segment inside an escaped Unicode format within
1838 # the generally utf-8 page
1839 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1840 video_info['video_urls'] = video_urls
1844 def _real_initialize(self):
1845 if self._downloader is None:
1850 downloader_params = self._downloader.params
1852 # Attempt to use provided username and password or .netrc data
1853 if downloader_params.get('username', None) is not None:
1854 useremail = downloader_params['username']
1855 password = downloader_params['password']
1856 elif downloader_params.get('usenetrc', False):
1858 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1859 if info is not None:
1863 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1864 except (IOError, netrc.NetrcParseError), err:
1865 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1868 if useremail is None:
1877 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1880 login_results = urllib2.urlopen(request).read()
1881 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1882 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1884 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1885 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1888 def _real_extract(self, url):
1889 mobj = re.match(self._VALID_URL, url)
1891 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1893 video_id = mobj.group('ID')
1896 self.report_video_webpage_download(video_id)
1897 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1899 page = urllib2.urlopen(request)
1900 video_webpage = page.read()
1901 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1902 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1905 # Start extracting information
1906 self.report_information_extraction(video_id)
1908 # Extract information
1909 video_info = self._parse_page(video_webpage)
1912 if 'owner' not in video_info:
1913 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1915 video_uploader = video_info['owner']
1918 if 'title' not in video_info:
1919 self._downloader.trouble(u'ERROR: unable to extract video title')
1921 video_title = video_info['title']
1922 video_title = video_title.decode('utf-8')
1925 if 'thumbnail' not in video_info:
1926 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1927 video_thumbnail = ''
1929 video_thumbnail = video_info['thumbnail']
1933 if 'upload_date' in video_info:
1934 upload_time = video_info['upload_date']
1935 timetuple = email.utils.parsedate_tz(upload_time)
1936 if timetuple is not None:
1938 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1943 video_description = video_info.get('description', 'No description available.')
1945 url_map = video_info['video_urls']
1946 if len(url_map.keys()) > 0:
1947 # Decide which formats to download
1948 req_format = self._downloader.params.get('format', None)
1949 format_limit = self._downloader.params.get('format_limit', None)
1951 if format_limit is not None and format_limit in self._available_formats:
1952 format_list = self._available_formats[self._available_formats.index(format_limit):]
1954 format_list = self._available_formats
1955 existing_formats = [x for x in format_list if x in url_map]
1956 if len(existing_formats) == 0:
1957 self._downloader.trouble(u'ERROR: no known formats available for video')
1959 if req_format is None:
1960 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1961 elif req_format == 'worst':
1962 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1963 elif req_format == '-1':
1964 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1967 if req_format not in url_map:
1968 self._downloader.trouble(u'ERROR: requested format not available')
1970 video_url_list = [(req_format, url_map[req_format])] # Specific format
1973 for format_param, video_real_url in video_url_list:
1975 video_extension = self._video_extensions.get(format_param, 'mp4')
1978 'id': video_id.decode('utf-8'),
1979 'url': video_real_url.decode('utf-8'),
1980 'uploader': video_uploader.decode('utf-8'),
1981 'upload_date': upload_date,
1982 'title': video_title,
1983 'ext': video_extension.decode('utf-8'),
1984 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1985 'thumbnail': video_thumbnail.decode('utf-8'),
1986 'description': video_description.decode('utf-8'),
# NOTE(review): this chunk is a damaged extraction -- the leading integer on each
# line is an original-file line number, and gaps in those numbers mean source
# lines (guards such as "if mobj is None:", "try:", "return", dict delimiters)
# are missing from this view. Code below is left byte-identical.
1991 class BlipTVIE(InfoExtractor):
1992 """Information extractor for blip.tv"""
1994 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1995 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1996 IE_NAME = u'blip.tv'
1998 def report_extraction(self, file_id):
1999 """Report information extraction."""
2000 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2002 def report_direct_download(self, title):
2003 """Report information extraction."""
2004 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2006 def _real_extract(self, url):
2007 mobj = re.match(self._VALID_URL, url)
2009 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for a JSON description of the video ("skin=json"); 'cchar' is
# presumably '?' or '&' chosen on a missing line depending on the input URL --
# TODO confirm against the full source.
2016 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2017 request = urllib2.Request(json_url.encode('utf-8'))
2018 self.report_extraction(mobj.group(1))
2021 urlh = urllib2.urlopen(request)
# If the server answers with a video/* Content-Type, the URL is already a
# direct media link: derive title/extension from the URL's basename.
2022 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2023 basename = url.split('/')[-1]
2024 title,ext = os.path.splitext(basename)
2025 title = title.decode('UTF-8')
2026 ext = ext.replace('.', '')
2027 self.report_direct_download(title)
2035 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2036 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2038 if info is None: # Regular URL
2040 json_code = urlh.read()
2041 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2042 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2046 json_data = json.loads(json_code)
2047 if 'Post' in json_data:
2048 data = json_data['Post']
# blip.tv datestamps look like '05-31-12 10:42AM'; normalize to YYYYMMDD.
2052 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2053 video_url = data['media']['url']
2054 umobj = re.match(self._URL_EXT, video_url)
2056 raise ValueError('Can not determine filename extension')
2057 ext = umobj.group(1)
2060 'id': data['item_id'],
2062 'uploader': data['display_name'],
2063 'upload_date': upload_date,
2064 'title': data['title'],
2066 'format': data['media']['mimeType'],
2067 'thumbnail': data['thumbnailUrl'],
2068 'description': data['description'],
2069 'player_url': data['embedUrl']
2071 except (ValueError,KeyError), err:
2072 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# Spoof iTunes globally so blip.tv serves downloadable media (mutates the
# module-level std_headers dict as a side effect).
2075 std_headers['User-Agent'] = 'iTunes/10.6.1'
2079 class MyVideoIE(InfoExtractor):
2080 """Information Extractor for myvideo.de."""
2082 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2083 IE_NAME = u'myvideo'
2085 def __init__(self, downloader=None):
2086 InfoExtractor.__init__(self, downloader)
2088 def report_download_webpage(self, video_id):
2089 """Report webpage download."""
2090 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2092 def report_extraction(self, video_id):
2093 """Report information extraction."""
2094 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2096 def _real_extract(self,url):
2097 mobj = re.match(self._VALID_URL, url)
2099 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2102 video_id = mobj.group(1)
2105 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2107 self.report_download_webpage(video_id)
2108 webpage = urllib2.urlopen(request).read()
2109 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2110 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2113 self.report_extraction(video_id)
2114 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2117 self._downloader.trouble(u'ERROR: unable to extract media URL')
2119 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2121 mobj = re.search('<title>([^<]+)</title>', webpage)
2123 self._downloader.trouble(u'ERROR: unable to extract title')
2126 video_title = mobj.group(1)
2132 'upload_date': u'NA',
2133 'title': video_title,
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps in them mean guard/return/try lines are missing from this view.
# Code left byte-identical; only comments added.
2139 class ComedyCentralIE(InfoExtractor):
2140 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a shortname pseudo-URL (":tds", ":colbert", ...) meaning
# "latest full episode", or a real full-episodes URL on either site.
2142 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2143 IE_NAME = u'comedycentral'
2145 def report_extraction(self, episode_id):
2146 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2148 def report_config_download(self, episode_id):
2149 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2151 def report_index_download(self, episode_id):
2152 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2154 def report_player_url(self, episode_id):
2155 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2157 def _real_extract(self, url):
2158 mobj = re.match(self._VALID_URL, url)
2160 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames are rewritten to the corresponding full-episodes landing page
# and the URL is re-matched so the named groups are populated.
2163 if mobj.group('shortname'):
2164 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2165 url = u'http://www.thedailyshow.com/full-episodes/'
2167 url = u'http://www.colbertnation.com/full-episodes/'
2168 mobj = re.match(self._VALID_URL, url)
2169 assert mobj is not None
2171 dlNewest = not mobj.group('episode')
2173 epTitle = mobj.group('showname')
2175 epTitle = mobj.group('episode')
2177 req = urllib2.Request(url)
2178 self.report_extraction(epTitle)
2180 htmlHandle = urllib2.urlopen(req)
2181 html = htmlHandle.read()
2182 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2183 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# The landing page redirects to the newest episode; re-match the final URL
# to recover a concrete episode slug.
2186 url = htmlHandle.geturl()
2187 mobj = re.match(self._VALID_URL, url)
2189 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2191 if mobj.group('episode') == '':
2192 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2194 epTitle = mobj.group('episode')
# Pull the Flash player URL / mtvnservices URI out of the episode HTML.
2196 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2197 if len(mMovieParams) == 0:
2198 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2201 playerUrl_raw = mMovieParams[0][0]
2202 self.report_player_url(epTitle)
2204 urlHandle = urllib2.urlopen(playerUrl_raw)
2205 playerUrl = urlHandle.geturl()
2206 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2207 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS index for the episode, then one mediaGen config per item.
2210 uri = mMovieParams[0][1]
2211 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2212 self.report_index_download(epTitle)
2214 indexXml = urllib2.urlopen(indexUrl).read()
2215 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2216 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2221 idoc = xml.etree.ElementTree.fromstring(indexXml)
2222 itemEls = idoc.findall('.//item')
2223 for itemEl in itemEls:
2224 mediaId = itemEl.findall('./guid')[0].text
2225 shortMediaId = mediaId.split(':')[-1]
2226 showId = mediaId.split(':')[-2].replace('.com', '')
2227 officialTitle = itemEl.findall('./title')[0].text
2228 officialDate = itemEl.findall('./pubDate')[0].text
2230 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2231 urllib.urlencode({'uri': mediaId}))
2232 configReq = urllib2.Request(configUrl)
2233 self.report_config_download(epTitle)
2235 configXml = urllib2.urlopen(configReq).read()
2236 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2237 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2240 cdoc = xml.etree.ElementTree.fromstring(configXml)
2242 for rendition in cdoc.findall('.//rendition'):
2243 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2247 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2250 # For now, just pick the highest bitrate
2251 format,video_url = turls[-1]
2253 effTitle = showId + u'-' + epTitle
2258 'upload_date': officialDate,
2263 'description': officialTitle,
2264 'player_url': playerUrl
2267 results.append(info)
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps mean missing lines. Code left byte-identical; comments added.
2272 class EscapistIE(InfoExtractor):
2273 """Information extractor for The Escapist """
2275 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2276 IE_NAME = u'escapist'
2278 def report_extraction(self, showName):
2279 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2281 def report_config_download(self, showName):
2282 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2284 def _real_extract(self, url):
2285 mobj = re.match(self._VALID_URL, url)
2287 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2289 showName = mobj.group('showname')
2290 videoId = mobj.group('episode')
2292 self.report_extraction(showName)
# Decode the page with the charset the server declares, falling back to
# utf-8 when the Content-Type header carries no usable charset.
2294 webPage = urllib2.urlopen(url)
2295 webPageBytes = webPage.read()
2296 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2297 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2299 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Metadata comes from OpenGraph / meta tags; the player URL embeds a
# percent-encoded "config=" query pointing at the JSON-ish config file.
2302 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2303 description = unescapeHTML(descMatch.group(1))
2304 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2305 imgUrl = unescapeHTML(imgMatch.group(1))
2306 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2307 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2308 configUrlMatch = re.search('config=(.*)$', playerUrl)
2309 configUrl = urllib2.unquote(configUrlMatch.group(1))
2311 self.report_config_download(showName)
2313 configJSON = urllib2.urlopen(configUrl).read()
2314 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2315 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2318 # Technically, it's JavaScript, not JSON
# Crude single->double quote swap to make the JS object parseable as JSON;
# NOTE(review): this breaks if any string value contains an apostrophe.
2319 configJSON = configJSON.replace("'", '"')
2322 config = json.loads(configJSON)
2323 except (ValueError,), err:
2324 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2327 playlist = config['playlist']
# playlist[1] is assumed to be the actual episode entry -- TODO confirm
# (playlist[0] is presumably an ad or intro slot).
2328 videoUrl = playlist[1]['url']
2333 'uploader': showName,
2334 'upload_date': None,
2338 'thumbnail': imgUrl,
2339 'description': description,
2340 'player_url': playerUrl,
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps mean missing lines. Code left byte-identical; comments added.
2346 class CollegeHumorIE(InfoExtractor):
2347 """Information extractor for collegehumor.com"""
2349 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2350 IE_NAME = u'collegehumor'
2352 def report_webpage(self, video_id):
2353 """Report information extraction."""
2354 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2356 def report_extraction(self, video_id):
2357 """Report information extraction."""
2358 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2360 def _real_extract(self, url):
2361 mobj = re.match(self._VALID_URL, url)
2363 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2365 video_id = mobj.group('videoid')
2367 self.report_webpage(video_id)
2368 request = urllib2.Request(url)
2370 webpage = urllib2.urlopen(request).read()
2371 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2372 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page exposes an internal numeric id ('id="video:NNN"') that keys the
# moogaloop metadata XML endpoint below.
2375 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2377 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2379 internal_video_id = m.group('internalvideoid')
2383 'internal_id': internal_video_id,
2386 self.report_extraction(video_id)
2387 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2389 metaXml = urllib2.urlopen(xmlUrl).read()
2390 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2391 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Fill the info dict from the metadata XML; extension is taken from the
# final suffix of the media URL.
2394 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2396 videoNode = mdoc.findall('./video')[0]
2397 info['description'] = videoNode.findall('./description')[0].text
2398 info['title'] = videoNode.findall('./caption')[0].text
2399 info['url'] = videoNode.findall('./file')[0].text
2400 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2401 info['ext'] = info['url'].rpartition('.')[2]
2402 info['format'] = info['ext']
2404 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps mean missing lines. Code left byte-identical; comments added.
2410 class XVideosIE(InfoExtractor):
2411 """Information extractor for xvideos.com"""
2413 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2414 IE_NAME = u'xvideos'
2416 def report_webpage(self, video_id):
2417 """Report information extraction."""
2418 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2420 def report_extraction(self, video_id):
2421 """Report information extraction."""
2422 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2424 def _real_extract(self, url):
2425 mobj = re.match(self._VALID_URL, url)
2427 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2429 video_id = mobj.group(1).decode('utf-8')
2431 self.report_webpage(video_id)
2433 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2435 webpage = urllib2.urlopen(request).read()
2436 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2437 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2440 self.report_extraction(video_id)
# The flv URL is percent-encoded inside a 'flv_url=...' query fragment.
2444 mobj = re.search(r'flv_url=(.+?)&', webpage)
2446 self._downloader.trouble(u'ERROR: unable to extract video url')
2448 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2452 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2454 self._downloader.trouble(u'ERROR: unable to extract video title')
2456 video_title = mobj.group(1).decode('utf-8')
2459 # Extract video thumbnail
2460 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2462 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0) (the full matched URL) is used deliberately here; group(1) is
# only the filename tail.
2464 video_thumbnail = mobj.group(0).decode('utf-8')
2470 'upload_date': None,
2471 'title': video_title,
2474 'thumbnail': video_thumbnail,
2475 'description': None,
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps mean missing lines. Code left byte-identical; comments added.
2482 class SoundcloudIE(InfoExtractor):
2483 """Information extractor for soundcloud.com
2484 To access the media, the uid of the song and a stream token
2485 must be extracted from the page source and the script must make
2486 a request to media.soundcloud.com/crossdomain.xml. Then
2487 the media can be grabbed by requesting from an url composed
2488 of the stream token and uid
2491 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2492 IE_NAME = u'soundcloud'
2494 def __init__(self, downloader=None):
2495 InfoExtractor.__init__(self, downloader)
2497 def report_webpage(self, video_id):
2498 """Report information extraction."""
2499 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2501 def report_extraction(self, video_id):
2502 """Report information extraction."""
2503 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2505 def _real_extract(self, url):
2506 mobj = re.match(self._VALID_URL, url)
2508 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2511 # extract uploader (which is in the url)
2512 uploader = mobj.group(1).decode('utf-8')
2513 # extract simple title (uploader + slug of song title)
2514 slug_title = mobj.group(2).decode('utf-8')
2515 simple_title = uploader + u'-' + slug_title
2517 self.report_webpage('%s/%s' % (uploader, slug_title))
2519 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2521 webpage = urllib2.urlopen(request).read()
2522 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2523 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2526 self.report_extraction('%s/%s' % (uploader, slug_title))
2528 # extract uid and stream token that soundcloud hands out for access
2529 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2531 video_id = mobj.group(1)
2532 stream_token = mobj.group(2)
2534 # extract unsimplified title
2535 mobj = re.search('"title":"(.*?)",', webpage)
2537 title = mobj.group(1).decode('utf-8')
2539 title = simple_title
2541 # construct media url (with uid/token)
2542 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2543 mediaURL = mediaURL % (video_id, stream_token)
2546 description = u'No description available'
2547 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2549 description = mobj.group(1)
# Upload date is scraped from the "pretty-date" markup; parse failures are
# only logged to stderr, not fatal.
2553 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2556 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2557 except Exception, e:
2558 self._downloader.to_stderr(str(e))
2560 # for soundcloud, a request to a cross domain is required for cookies
# NOTE(review): this Request is constructed but no urlopen() call on it is
# visible in this view -- confirm against the full source whether the
# crossdomain fetch actually happens on a missing line.
2561 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2564 'id': video_id.decode('utf-8'),
2566 'uploader': uploader.decode('utf-8'),
2567 'upload_date': upload_date,
2572 'description': description.decode('utf-8')
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps mean missing lines (e.g. the IE_NAME assignment between 2579
# and 2582 is absent from this view). Code left byte-identical; comments added.
2576 class InfoQIE(InfoExtractor):
2577 """Information extractor for infoq.com"""
2579 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2582 def report_webpage(self, video_id):
2583 """Report information extraction."""
2584 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2586 def report_extraction(self, video_id):
2587 """Report information extraction."""
2588 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2590 def _real_extract(self, url):
2591 mobj = re.match(self._VALID_URL, url)
2593 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2596 self.report_webpage(url)
2598 request = urllib2.Request(url)
2600 webpage = urllib2.urlopen(request).read()
2601 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2602 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2605 self.report_extraction(url)
# The page embeds a base64-encoded path in jsclassref; decoded and
# unquoted it becomes the tail of an rtmpe stream URL.
2609 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2611 self._downloader.trouble(u'ERROR: unable to extract video url')
2613 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2617 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2619 self._downloader.trouble(u'ERROR: unable to extract video title')
2621 video_title = mobj.group(1).decode('utf-8')
2623 # Extract description
2624 video_description = u'No description available.'
2625 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2626 if mobj is not None:
2627 video_description = mobj.group(1).decode('utf-8')
# The stream URL's basename doubles as "<id>.<extension>".
2629 video_filename = video_url.split('/')[-1]
2630 video_id, extension = video_filename.split('.')
2636 'upload_date': None,
2637 'title': video_title,
2639 'format': extension, # Extension is always(?) mp4, but seems to be flv
2641 'description': video_description,
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps mean missing lines. Code left byte-identical; comments added.
2647 class MixcloudIE(InfoExtractor):
2648 """Information extractor for www.mixcloud.com"""
2649 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2650 IE_NAME = u'mixcloud'
2652 def __init__(self, downloader=None):
2653 InfoExtractor.__init__(self, downloader)
2655 def report_download_json(self, file_id):
2656 """Report JSON download."""
2657 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2659 def report_extraction(self, file_id):
2660 """Report information extraction."""
2661 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2663 def get_urls(self, jsonData, fmt, bitrate='best'):
2664 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a plain [urls] list; the
# TypeError fallback below handles the bitrate-less shape.
2667 bitrate_list = jsonData[fmt]
2668 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2669 bitrate = max(bitrate_list) # select highest
2671 url_list = jsonData[fmt][bitrate]
2672 except TypeError: # we have no bitrate info.
2673 url_list = jsonData[fmt]
2676 def check_urls(self, url_list):
2677 """Returns 1st active url from list"""
2678 for url in url_list:
2680 urllib2.urlopen(url)
2682 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2687 def _print_formats(self, formats):
2688 print 'Available formats:'
2689 for fmt in formats.keys():
2690 for b in formats[fmt]:
2692 ext = formats[fmt][b][0]
2693 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2694 except TypeError: # we have no bitrate info
2695 ext = formats[fmt][0]
2696 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2699 def _real_extract(self, url):
2700 mobj = re.match(self._VALID_URL, url)
2702 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2704 # extract uploader & filename from url
2705 uploader = mobj.group(1).decode('utf-8')
2706 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2708 # construct API request
2709 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2710 # retrieve .json file with links to files
2711 request = urllib2.Request(file_url)
2713 self.report_download_json(file_url)
2714 jsonData = urllib2.urlopen(request).read()
2715 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2716 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2720 json_data = json.loads(jsonData)
2721 player_url = json_data['player_swf_url']
2722 formats = dict(json_data['audio_formats'])
2724 req_format = self._downloader.params.get('format', None)
2727 if self._downloader.params.get('listformats', None):
2728 self._print_formats(formats)
# 'best' (or no preference): probe each advertised format until a live URL
# is found; otherwise require the exact requested format.
2731 if req_format is None or req_format == 'best':
2732 for format_param in formats.keys():
2733 url_list = self.get_urls(formats, format_param)
2735 file_url = self.check_urls(url_list)
2736 if file_url is not None:
2739 if req_format not in formats.keys():
2740 self._downloader.trouble(u'ERROR: format is not available')
2743 url_list = self.get_urls(formats, req_format)
2744 file_url = self.check_urls(url_list)
2745 format_param = req_format
2748 'id': file_id.decode('utf-8'),
2749 'url': file_url.decode('utf-8'),
2750 'uploader': uploader.decode('utf-8'),
2751 'upload_date': u'NA',
2752 'title': json_data['name'],
2753 'ext': file_url.split('.')[-1].decode('utf-8'),
2754 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2755 'thumbnail': json_data['thumbnail_url'],
2756 'description': json_data['description'],
2757 'player_url': player_url.decode('utf-8'),
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps mean missing lines. Code left byte-identical; comments added.
2760 class StanfordOpenClassroomIE(InfoExtractor):
2761 """Information extractor for Stanford's Open ClassRoom"""
# Matches three URL shapes: a specific video page (course+video), a course
# page (course only), and the site root (everything else).
2763 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2764 IE_NAME = u'stanfordoc'
2766 def report_download_webpage(self, objid):
2767 """Report information extraction."""
2768 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2770 def report_extraction(self, video_id):
2771 """Report information extraction."""
2772 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2774 def _real_extract(self, url):
2775 mobj = re.match(self._VALID_URL, url)
2777 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2780 if mobj.group('course') and mobj.group('video'): # A specific video
2781 course = mobj.group('course')
2782 video = mobj.group('video')
2784 'id': course + '_' + video,
2787 self.report_extraction(info['id'])
2788 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2789 xmlUrl = baseUrl + video + '.xml'
2791 metaXml = urllib2.urlopen(xmlUrl).read()
2792 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2793 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2795 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2797 info['title'] = mdoc.findall('./title')[0].text
2798 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2800 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2802 info['ext'] = info['url'].rpartition('.')[2]
2803 info['format'] = info['ext']
2805 elif mobj.group('course'): # A course page
2806 course = mobj.group('course')
2812 self.report_download_webpage(info['id'])
2814 coursepage = urllib2.urlopen(url).read()
2815 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2816 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2819 m = re.search('<h1>([^<]+)</h1>', coursepage)
2821 info['title'] = unescapeHTML(m.group(1))
2823 info['title'] = info['id']
2825 m = re.search('<description>([^<]+)</description>', coursepage)
2827 info['description'] = unescapeHTML(m.group(1))
# Collect each VideoPage link once, then recurse through self.extract()
# so every video on the course page is resolved individually.
2829 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2832 'type': 'reference',
2833 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2837 for entry in info['list']:
2838 assert entry['type'] == 'reference'
2839 results += self.extract(entry['url'])
# Root page: enumerate all CoursePage links and recurse the same way.
2844 'id': 'Stanford OpenClassroom',
2848 self.report_download_webpage(info['id'])
2849 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2851 rootpage = urllib2.urlopen(rootURL).read()
2852 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2853 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2856 info['title'] = info['id']
2858 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2861 'type': 'reference',
2862 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2867 for entry in info['list']:
2868 assert entry['type'] == 'reference'
2869 results += self.extract(entry['url'])
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps mean missing lines. Code left byte-identical; comments added.
2872 class MTVIE(InfoExtractor):
2873 """Information extractor for MTV.com"""
2875 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2878 def report_webpage(self, video_id):
2879 """Report information extraction."""
2880 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2882 def report_extraction(self, video_id):
2883 """Report information extraction."""
2884 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2886 def _real_extract(self, url):
2887 mobj = re.match(self._VALID_URL, url)
2889 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2891 if not mobj.group('proto'):
2892 url = 'http://' + url
2893 video_id = mobj.group('videoid')
2894 self.report_webpage(video_id)
2896 request = urllib2.Request(url)
2898 webpage = urllib2.urlopen(request).read()
2899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2900 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Song/performer metadata comes from mtv_* meta tags, declared latin-1.
2903 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2905 self._downloader.trouble(u'ERROR: unable to extract song name')
2907 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2908 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2910 self._downloader.trouble(u'ERROR: unable to extract performer')
2912 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2913 video_title = performer + ' - ' + song_name
2915 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message below appears to be missing the word "extract"
# ('unable to mtvn_uri') -- flagging only, since this update changes no code.
2917 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2919 mtvn_uri = mobj.group(1)
2921 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2923 self._downloader.trouble(u'ERROR: unable to extract content id')
2925 content_id = mobj.group(1)
# mediaGen returns rendition XML; URL is assembled from the scraped ids.
2927 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2928 self.report_extraction(video_id)
2929 request = urllib2.Request(videogen_url)
2931 metadataXml = urllib2.urlopen(request).read()
2932 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2933 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2936 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2937 renditions = mdoc.findall('.//rendition')
2939 # For now, always pick the highest quality.
2940 rendition = renditions[-1]
2943 _,_,ext = rendition.attrib['type'].partition('/')
2944 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2945 video_url = rendition.find('./src').text
2947 self._downloader.trouble('Invalid rendition field.')
2953 'uploader': performer,
2954 'title': video_title,
# NOTE(review): damaged extraction -- embedded integers are original line
# numbers; gaps mean missing lines (method headers such as _gen_sid's "def"
# body and several return statements are absent from this view). Code left
# byte-identical; comments added.
2963 class YoukuIE(InfoExtractor):
2965 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2968 def __init__(self, downloader=None):
2969 InfoExtractor.__init__(self, downloader)
2971 def report_download_webpage(self, file_id):
2972 """Report webpage download."""
2973 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
2975 def report_extraction(self, file_id):
2976 """Report information extraction."""
2977 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp concatenated with two random numbers.
2980 nowTime = int(time.time() * 1000)
2981 random1 = random.randint(1000,1998)
2982 random2 = random.randint(1000,9999)
2984 return "%d%d%d" %(nowTime,random1,random2)
2986 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the alphabet keyed by 'seed'
# (a linear-congruential step per character).
2988 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2990 for i in range(len(source)):
2991 seed = (seed * 211 + 30031 ) % 65536
2992 index = math.floor(seed / 65536 * len(source) )
2993 mixed.append(source[int(index)])
2994 source.remove(source[int(index)])
2995 #return ''.join(mixed)
# Decode the '*'-separated fileId digits through the mixed alphabet.
2999 def _get_file_id(self, fileId, seed):
3000 mixed = self._get_file_ID_mix_string(seed)
3001 ids = fileId.split('*')
3005 realId.append(mixed[int(ch)])
3006 return ''.join(realId)
3008 def _gen_key(self, key1, key2):
3014 def _real_extract(self, url):
3015 mobj = re.match(self._VALID_URL, url)
3017 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3019 video_id = mobj.group('ID')
3021 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3024 request = urllib2.Request(info_url, None, std_headers)
3026 self.report_download_webpage(video_id)
3027 jsondata = urllib2.urlopen(request).read()
# NOTE(review): "except ... as err" (Python 2.6+) while the rest of the file
# uses the old "except ..., err" form -- inconsistent but both valid on 2.6/2.7.
3028 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3029 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3032 self.report_extraction(video_id)
3034 config = json.loads(jsondata)
3036 video_title = config['data'][0]['title']
3037 seed = config['data'][0]['seed']
3039 format = self._downloader.params.get('format', None)
3040 supported_format = config['data'][0]['streamfileids'].keys()
3042 if format is None or format == 'best':
3043 if 'hd2' in supported_format:
3048 elif format == 'worst':
3056 fileid = config['data'][0]['streamfileids'][format]
3057 seg_number = len(config['data'][0]['segs'][format])
3060 for i in xrange(seg_number):
3061 keys.append(config['data'][0]['segs'][format][i]['k'])
3064 #youku only could be viewed from mainland china
3066 self._downloader.trouble(u'ERROR: unable to extract info section')
3070 sid = self._gen_sid()
3071 fileid = self._get_file_id(fileid, seed)
3073 #column 8,9 of fileid represent the segment number
3074 #fileid[7:9] should be changed
# Each segment gets its own download URL: splice the 2-hex-digit segment
# index into positions 8-9 of the decoded fileid.
3075 for index, key in enumerate(keys):
3077 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3078 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3081 'id': '%s_part%d' % (video_id, index),
3082 'url': download_url,
3084 'title': video_title
3088 files_info.append(info)
3091 # vim: tabstop=4 shiftwidth=4 softtabstop=4 noexpandtab