2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
    # Default HTTP headers sent with every request, chosen to mimic a
    # contemporary Firefox browser.
    # NOTE(review): the opening `std_headers = {` line and the closing
    # brace are elided from this excerpt.
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',

# Characters allowed in "simplified" titles: ASCII letters and digits.
# The .decode('ascii') calls turn the Python 2 byte strings into unicode.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    simulate: Do not download the video files.
    format: Video format code.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    """

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # [elided from this excerpt: attribute initialization -- presumably
        # the IE list, the PostProcessor chain and self.params; verify
        # against the full source]

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        # NOTE(review): takes no self, so presumably decorated with
        # @staticmethod on an elided line -- confirm.
        components = filename.split(os.sep)
        # Ancestor paths: 'a', 'a/b', 'a/b/c', ... excluding the final
        # component (the file itself).
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # [elided from this excerpt: the directory-creation call]

    def format_bytes(bytes):
        # Humanize a byte count with a 1024-based suffix, e.g. '1.50k'.
        # [elided from this excerpt: docstring and guards for special
        # inputs (None/zero) that must precede the math.log call below]
        exponent = long(math.log(float(bytes), 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        # Progress percentage, right-aligned in 6 columns.
        # [elided from this excerpt: guard for data_len being unknown]
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        # Estimated remaining time formatted as MM:SS.
        # [elided from this excerpt: computation of `dif` (elapsed time,
        # presumably now - start) and an early return for the branch below]
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        # [elided from this excerpt: line(s) between divmod and return]
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        # Average transfer speed, right-aligned in 10 columns.
        # [elided from this excerpt: computation of `dif`, presumably
        # now - start]
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        # Adapt the next read size to the observed throughput, keeping it
        # within a factor of two of the previous block size.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # [elided from this excerpt: return for near-instant reads]
        rate = bytes / elapsed_time
        # [elided from this excerpt: clamping of `rate` against
        # new_min/new_max and the final return]

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # [elided from this excerpt: guard for unmatched input]
        number = float(matchobj.group(1))
        # An empty suffix gives str.index('') == 0, i.e. a multiplier of 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # [elided from this excerpt: append to the internal IE list]
        # Mutual registration: the IE gets a back-reference to us.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # [elided from this excerpt: append to the internal PP chain]
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            # The trailing comma suppresses print's own newline; the
            # [u'\n', u''][skip_eol] index appends one unless skip_eol.
            print u'%s%s' % (message, [u'\n', u''][skip_eol]),

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # "Fixed" means it contains no %(...)s interpolation placeholders,
        # so every download would produce the same filename.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message. If it
        doesn't raise, it returns an error code suitable to be returned
        later as a program exit code to indicate error.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # [elided from this excerpt: the non-raising return of the error
        # code described above]

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # [elided from this excerpt: early return]
        # [elided from this excerpt: assignment of `now`, presumably
        # time.time(), and a guard between the next two lines]
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough for the average speed to drop back
            # to the configured limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        # Leading \r rewrites the same terminal line; skip_eol keeps the
        # cursor on it for the next update.
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_finish(self):
        """Report download finished."""
        # [elided from this excerpt: the method body]

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # NOTE(review): several try:/return/close lines of this method are
        # elided from this excerpt; the visible except-clauses belong to
        # those elided try blocks.
        # Forced printings
        if self.params.get('forcetitle', False):
            print info_dict['title']
        if self.params.get('forceurl', False):
            print info_dict['url']

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # [elided: return]

            # Interpolate the output template with the video metadata.
            filename = self.params['outtmpl'] % info_dict
            self.report_destination(filename)
        except (ValueError, KeyError), err:
            return self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
        if self.params['nooverwrites'] and os.path.exists(filename):
            self.to_stderr('WARNING: file exists: %s; skipping' % filename)
            # [elided: return]

            # Create any missing parent directories first.
            self.pmkdir(filename)
        except (OSError, IOError), err:
            return self.trouble('ERROR: unable to create directories: %s' % str(err))
            outstream = open(filename, 'wb')
        except (OSError, IOError), err:
            return self.trouble('ERROR: unable to open for writing: %s' % str(err))
            self._do_download(outstream, info_dict['url'])
        except (OSError, IOError), err:
            return self.trouble('ERROR: unable to write video data: %s' % str(err))
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            return self.trouble('ERROR: unable to download video data: %s' % str(err))
            self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            return self.trouble('ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # NOTE(review): loop scaffolding (retcode initialization, the
        # `for url in ...:` / `for ie in ...:` headers, continue/break and
        # the final return) is elided from this excerpt.
        if len(url_list) > 1 and self.fixed_template():
            # A fixed template would make every URL write the same file.
            raise SameFileError(self.params['outtmpl'])

                suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    # [elided: continue]

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL
                all_results = ie.extract(url)
                results = [x for x in all_results if x is not None]

                # See if there were problems extracting any information
                if len(results) != len(all_results):
                    retcode = self.trouble()

                # Two results could go to the same file
                if len(results) > 1 and self.fixed_template():
                    raise SameFileError(self.params['outtmpl'])

                # Process each result
                for result in results:
                    result = self.process_info(result)

                    # Do not overwrite an error code with a success code
                    # [elided: retcode bookkeeping]

                # Suitable InfoExtractor had been found; go to next URL
                # [elided: break]

            if not suitable_found:
                retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # [elided from this excerpt: construction of `info` from ie_info
        # and the loop over the registered PostProcessors]
        info['filepath'] = filename

    def _do_download(self, stream, url):
        # Stream the video data at `url` into the open file `stream`,
        # reporting progress, adapting the read size and honouring the
        # configured rate limit.
        # NOTE(review): counter/block-size initialization, the loop header
        # (the indented body below suggests a while loop), the before/after
        # timing statements and the finish report are elided from this
        # excerpt.
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        # Content-length arrives as a string, or None if the header is
        # absent; it is kept unparsed.
        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Download and write
            data_block = data.read(block_size)
            data_block_len = len(data_block)
            if data_block_len == 0:
                # [elided: presumably break -- end of stream]
            byte_counter += data_block_len
            stream.write(data_block)
            # Tune the next read to the measured throughput.
            block_size = self.best_block_size(after - before, data_block_len)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        # Compared via str() because data_len is the raw header string.
        if data_len is not None and str(byte_counter) != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # [elided from this excerpt: instance-state initialization]
        self.set_downloader(downloader)

        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): the `def suitable(url):` header belonging to the
        # docstring above is elided from this excerpt.

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # [elided from this excerpt: a line between the docstring and the
        # call below -- presumably a run-once guard; confirm]
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # [elided from this excerpt: a line before the call below --
        # presumably self.initialize(); confirm]
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message to stdout if downloader is not in quiet mode."""
        if self._downloader is None or not self._downloader.params.get('quiet', False):
            # [elided from this excerpt: the print statement]

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Accepts bare video IDs, /v/<id> URLs and watch?...v=<id> URLs.
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces the English-language site so the scraping regexes below match.
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in the user's .netrc file.
    _NETRC_MACHINE = 'youtube'

        # NOTE(review): the `def suitable(url):` header (and presumably a
        # @staticmethod decorator) is elided from this excerpt.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character."""
        # NOTE(review): takes no self, so presumably a @staticmethod (the
        # decorator line is elided); guard and base-selection lines inside
        # this method are elided as well.
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference, decimal or hexadecimal ('x' prefix).
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
            numstr = mobj.group(1)
            if numstr.startswith(u'x'):
                # Turn 'x...' into '0x...' so long() can parse it.
                numstr = u'0%s' % numstr
            # [elided: selection of `base` used below]
            return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)

    def report_lang(self):
        """Report attempt to set language."""
        self.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_stdout(u'[youtube] Confirming age')

    def report_webpage_download(self, video_id):
        """Report attempt to download webpage."""
        self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_video_url(self, video_id, video_real_url):
        """Report extracted video URL."""
        self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

    def _real_initialize(self):
        # Set the site language, optionally log in, and confirm age.
        # NOTE(review): several try:/return/blank lines are elided from this
        # excerpt; the visible except-clauses belong to elided try blocks.
        if self._downloader is None:
            # [elided: early return]

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # [elided: use of `info` when present; otherwise:]
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # Best-effort: .netrc problems are only a warning.
                self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # [elided: early return when no username is available]

        # Log in -- visible entries of the (elided) login_form dict literal:
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                # Being served the login form again means the login failed.
                self.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age -- visible entry of the (elided) age_form dict literal:
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # NOTE(review): error-exit returns, try: headers and some blank
        # lines are elided throughout this method.
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self.to_stderr(u'ERROR: invalid URL: %s' % url)
        # Group 2 of _VALID_URL is the bare video identifier.
        video_id = mobj.group(2)

        # Downloader parameters
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            if format_param is None:
                # [elided]

        # Extension: the closing of a (partially elided) mapping from
        # format codes to container extensions, defaulting to 'flv'.
        }.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
            self.report_webpage_download(video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
        self.report_information_extraction(video_id)

        # The "t" token scraped from the page is required by get_video.
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
            self.to_stderr(u'ERROR: unable to extract "t" parameter')
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.report_video_url(video_id, video_real_url)

        # Uploader nickname, scraped from an inline script variable.
        mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
            self.to_stderr(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Title, taken from the page <title> and de-entitized.
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
            self.to_stderr(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
        # os.sep cannot appear inside a filename component.
        video_title = video_title.replace(os.sep, u'%')

        # Simplified title: runs of disallowed characters collapse to '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Return information -- visible entries of the (elided) returned
        # single-element list of dicts:
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Page used to get past the site's family filter.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Delegate used for Metacafe entries that are YouTube re-posts.
        self._youtube_ie = youtube_ie

        # NOTE(review): the `def suitable(url):` header (and presumably a
        # @staticmethod decorator) is elided from this excerpt.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # NOTE(review): try: headers, returns and part of the disclaimer
        # form are elided from this excerpt; the visible except-clauses
        # belong to those elided try blocks.
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age -- visible entry of the (elided) disclaimer_form dict:
            'submit': "Continue - I'm over 18",
        request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # NOTE(review): error-exit returns, try: headers and guard lines
        # are elided throughout this method.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self.to_stderr(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube re-posts to the YouTube extractor.
            return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
            self.to_stderr(u'ERROR: unable to extract media URL')
        # The page embeds the URL with escaped slashes; strip backslashes.
        mediaURL = mobj.group(1).replace('\\', '')

        mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
            self.to_stderr(u'ERROR: unable to extract gdaKey')
        gdaKey = mobj.group(1)

        # The gdaKey is an access token appended to the media URL.
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self.to_stderr(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
            self.to_stderr(u'ERROR: unable to extract uploader nickname')
        # Strip any HTML tags from the submitter fragment.
        video_uploader = re.sub(r'<.*?>', '', mobj.group(1))

        # Return information -- visible entries of the (elided) returned
        # single-element list of dicts:
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Queries look like 'ytsearch:foo', 'ytsearchN:foo' or 'ytsearchall:foo'.
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    # Substring whose presence signals that more result pages exist.
    _MORE_PAGES_INDICATOR = r'>Next</a>'
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Each found video id is handed to the YouTube extractor.
        self._youtube_ie = youtube_ie

        # NOTE(review): the `def suitable(url):` header (and presumably a
        # @staticmethod decorator) is elided from this excerpt.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        # NOTE(review): error-exit returns, the branch for an empty prefix
        # and the try:/n-parsing lines are elided from this excerpt.
        mobj = re.match(self._VALID_QUERY, query)
            self.to_stderr(u'ERROR: invalid search query "%s"' % query)

        # Split 'ytsearchN' prefix from the actual query text.
        prefix, query = query.split(':')
            return self._download_n_results(query, 1)
        elif prefix == 'all':
            return self._download_n_results(query, self._max_youtube_results)
                self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            return self._download_n_results(query, n)
        except ValueError: # parsing prefix as int fails
            return self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): accumulator initialization (video_ids, already_seen,
        # pagenum), the page-loop header and the `return information` lines
        # are elided from this excerpt.
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Slice the matched href="/watch?v=ID" and keep the ID:
                # split on '=' takes the part after v=, [:-1] drops the
                # closing quote.
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))

            if self._MORE_PAGES_INDICATOR not in page:
                # Last results page: extract everything collected so far.
                information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # Interpolated with (playlist_id, next_page) and checked as a plain
    # substring of the page to detect further pages.
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Each playlist entry is handed to the YouTube extractor.
        self._youtube_ie = youtube_ie

        # NOTE(review): the `def suitable(url):` header (and presumably a
        # @staticmethod decorator) is elided from this excerpt.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # NOTE(review): error-exit returns, accumulator initialization, the
        # page-loop header, break and the final return are elided from this
        # excerpt.
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
            self.to_stderr(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Dedupe within the page while preserving order.
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when no link to the next page appears.
            if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
            pagenum = pagenum + 1

            information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    one.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message to stdout if downloader is not in quiet mode."""
        if self._downloader is None or not self._downloader.params.get('quiet', False):
            # [elided from this excerpt: the print statement]

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        returned by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader.
        """
        return information # by default, do nothing
if __name__ == '__main__':
    # NOTE(review): the enclosing try:, the imports of the main-only
    # modules (getpass/optparse, per the comment below), a few blank lines
    # and some closing delimiters are elided from this excerpt; the
    # except-clauses at the end belong to that elided try block.
    # Modules needed only when running the main program

    # General configuration
    urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
    # NOTE(review): this second install_opener replaces the opener
    # installed on the previous line -- verify whether combining both
    # handlers in one build_opener call was intended.
    urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    # Parse command line options
    parser = optparse.OptionParser(
        usage='Usage: %prog [options] url...',
        conflict_handler='resolve',
    # [elided: remaining OptionParser arguments and closing parenthesis]
    parser.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    parser.add_option('-v', '--version',
            action='version', help='print program version and exit')
    parser.add_option('-u', '--username',
            dest='username', metavar='UN', help='account username')
    parser.add_option('-p', '--password',
            dest='password', metavar='PW', help='account password')
    parser.add_option('-o', '--output',
            dest='outtmpl', metavar='TPL', help='output filename template')
    parser.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    parser.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download video', default=False)
    parser.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    parser.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    parser.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
    parser.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    parser.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    parser.add_option('-f', '--format',
            dest='format', metavar='FMT', help='video format code')
    parser.add_option('-m', '--mobile-version',
            action='store_const', dest='format', help='alias for -f 17', const='17')
    parser.add_option('-d', '--high-def',
            action='store_const', dest='format', help='alias for -f 22', const='22')
    parser.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    parser.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
    parser.add_option('-a', '--batch-file',
            dest='batchfile', metavar='F', help='file containing URLs to download')
    parser.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    (opts, args) = parser.parse_args()

    # Batch file verification
    if opts.batchfile is not None:
            # One URL per line; surrounding whitespace is stripped.
            batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
            # [elided: the try:/except around the read; this exit belongs
            # to its handler]
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # Conflicting, missing and erroneous options
    if len(all_urls) < 1:
        sys.exit(u'ERROR: you must provide at least one URL')
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        sys.exit(u'ERROR: account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
    if opts.usetitle and opts.useliteral:
        sys.exit(u'ERROR: using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        # Prompt interactively rather than requiring -p on the command line.
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            sys.exit(u'ERROR: invalid rate limit specified')
        opts.ratelimit = numeric_limit

    # Information extractors
    youtube_ie = YoutubeIE()
    metacafe_ie = MetacafeIE(youtube_ie)
    youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
    youtube_search_ie = YoutubeSearchIE(youtube_ie)

    # System charset, used below to decode the output template option.
    charset = locale.getdefaultlocale()[1]

    # File downloader
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        # -g/-e imply quiet so the extra value is the only stdout output.
        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
        'format': opts.format,
        # First matching template wins: explicit -o, then -t, then -l,
        # then the id-only default.
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
    # [elided: closing of the options dict]
    # The more specific extractors are registered before the generic
    # YoutubeIE so they get the first chance to claim a URL.
    fd.add_info_extractor(youtube_search_ie)
    fd.add_info_extractor(youtube_pl_ie)
    fd.add_info_extractor(metacafe_ie)
    fd.add_info_extractor(youtube_ie)
    retcode = fd.download(all_urls)
    # [elided: exit with retcode]

    except DownloadError:
        # [elided: exit with error status]
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')