2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
# [lines elided in this excerpt: 'std_headers = {' — the dict these HTTP
#  header entries belong to; sent with every urllib2 request below]
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
# [line elided in this excerpt: closing brace of std_headers]

# Characters allowed in "simplified" titles: ASCII letters and digits only.
# Python 2 idiom: str.decode('ascii') yields a unicode string.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableFormatError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # downloaded: number of bytes actually received.
        # expected: byte count announced by the server (Content-Length).
        self.downloaded = downloaded
        self.expected = expected
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options (keys of the params dictionary):

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    simulate: Do not download the video files.
    format: Video format code.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    """

    # Exit status reported by download(); set to 0 in __init__, forced to 1
    # by trouble() when an error is being ignored.
    _download_retcode = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # [lines elided in this excerpt: storage of params and presumably the
        #  InfoExtractor/PostProcessor list initialization -- TODO confirm]
        self._download_retcode = 0

    # [decorator elided in this excerpt: presumably @staticmethod, since
    #  there is no self parameter -- TODO confirm]
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Ancestor paths, shortest first; the final component (the file
        # itself) is excluded by the xrange upper bound.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # [line elided in this excerpt: the actual mkdir call]

    # [decorator elided in this excerpt: presumably @staticmethod]
    def format_bytes(bytes):
        # Render a byte count with a single-letter 1024-based suffix, e.g. '1.21M'.
        # [lines elided in this excerpt: presumably guards for bytes being
        #  None or zero -- TODO confirm]
        exponent = long(math.log(float(bytes), 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    # [decorator elided in this excerpt: presumably @staticmethod]
    def calc_percent(byte_counter, data_len):
        """Return download progress as a 6-char right-aligned percentage string."""
        # [lines elided in this excerpt: presumably a data_len None guard]
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    # [decorator elided in this excerpt: presumably @staticmethod]
    def calc_eta(start, now, total, current):
        """Estimate remaining download time, formatted as MM:SS."""
        # [lines elided in this excerpt: presumably a total None guard and
        #  'dif = now - start', which the next line depends on]
        if current == 0 or dif < 0.001: # One millisecond
            # [line elided in this excerpt: placeholder return value]
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        # [lines elided in this excerpt: presumably a cap for very large ETAs]
        return '%02d:%02d' % (eta_mins, eta_secs)

    # [decorator elided in this excerpt: presumably @staticmethod]
    def calc_speed(start, now, bytes):
        """Format the average download speed since `start` as a 10-char string."""
        # [line elided in this excerpt: 'dif = now - start' -- used below]
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    # [decorator elided in this excerpt: presumably @staticmethod]
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size: the measured rate clamped to [bytes/2, bytes*2]."""
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # [line elided in this excerpt: fast-path return for immeasurably
            #  quick reads]
        rate = bytes / elapsed_time
        # [lines elided in this excerpt: clamp rate into [new_min, new_max]
        #  and return the chosen block size]

    # [decorator elided in this excerpt: presumably @staticmethod]
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # [lines elided in this excerpt: presumably return None on no match]
        number = float(matchobj.group(1))
        # An empty suffix group indexes 'b' (exponent 0), i.e. plain bytes.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    # [def header elided in this excerpt: presumably 'def verify_url(self, url):'
    #  -- process_info() calls self.verify_url(...) below]
        """Verify a URL is valid and data could be downloaded."""
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        # [lines elided in this excerpt: presumably read a little data and
        #  close the handle]

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # [line elided in this excerpt: append ie to the internal list]
        ie.set_downloader(self)  # mutual registration

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # [line elided in this excerpt: append pp to the internal chain]
        pp.set_downloader(self)  # mutual registration

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            # Appends '\n' unless skip_eol; the trailing comma suppresses the
            # print statement's own newline (Python 2).
            print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message

    def fixed_template(self):
        """Checks if the output template is fixed (contains no %(...)s fields)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are being ignored: record the failure in the return code.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # [line elided in this excerpt: early return]
        # [lines elided in this excerpt: 'now = time.time()' -- used below]
        elapsed = now - start_time
        # [lines elided in this excerpt: presumably a tiny-elapsed guard]
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to bring the average back under the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress (the leading \\r keeps it on one console line)."""
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_finish(self):
        """Report download finished."""
        # [line elided in this excerpt: the final to_stdout call]

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # [line elided in this excerpt: 'try:' introducing the handler below]
            self.verify_url(info_dict['url'])
        except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Any network/OS failure while checking the URL means this format
            # cannot be fetched; signal the caller to try another one.
            raise UnavailableFormatError

        # Forced printings
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(locale.getpreferredencoding())
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(locale.getpreferredencoding())

        # [lines elided in this excerpt: presumably an early return when
        #  simulating, and a 'try:' introducing the handler below]
        template_dict = dict(info_dict)
        # 'epoch' lets output templates embed the download timestamp.
        template_dict['epoch'] = unicode(long(time.time()))
        filename = self.params['outtmpl'] % template_dict
        self.report_destination(filename)
        except (ValueError, KeyError), err:
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
        if self.params['nooverwrites'] and os.path.exists(filename):
            self.to_stderr('WARNING: file exists: %s; skipping' % filename)
            # [lines elided in this excerpt: early return]

        # [line elided in this excerpt: 'try:']
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to create directories: %s' % str(err))

        # [line elided in this excerpt: 'try:']
            outstream = open(filename, 'wb')
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to open for writing: %s' % str(err))

        # [line elided in this excerpt: 'try:']
            self._do_download(outstream, info_dict['url'])
            # [line elided in this excerpt: presumably outstream.close()]
        except (OSError, IOError), err:
            # [lines elided in this excerpt: presumably cleanup of the partial
            #  file before re-signalling]
            raise UnavailableFormatError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble('ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # [line elided in this excerpt: 'try:' around postprocessing]
            self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble('ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template can only ever name one file: refuse multiple URLs.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # [lines elided in this excerpt: outer loop over url_list and inner
        #  loop over the registered InfoExtractors]
            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    # [line elided in this excerpt: continue]

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                # [line elided in this excerpt: the ie.extract(url) call]

                # Suitable InfoExtractor had been found; go to next URL
                # [line elided in this excerpt: break]

            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # [line elided in this excerpt: presumably info = dict(ie_info)]
        info['filepath'] = filename
        # [lines elided in this excerpt: loop calling run() on each registered
        #  PostProcessor, stopping when one returns None]

    def _do_download(self, stream, url):
        # Core download loop: stream `url` into `stream` with progress
        # reporting, adaptive block sizing and optional rate limiting.
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        # NOTE(review): Content-length is kept as a string here; it is compared
        # with str(byte_counter) at the end of this method.
        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        # [lines elided in this excerpt: byte_counter/block_size/start
        #  initialization and the loop header]
            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Download and write
            # [line elided in this excerpt: 'before = time.time()']
            data_block = data.read(block_size)
            # [line elided in this excerpt: 'after = time.time()']
            data_block_len = len(data_block)
            if data_block_len == 0:
                # [line elided in this excerpt: break -- end of stream]
            byte_counter += data_block_len
            stream.write(data_block)
            # Adapt the next read size to the measured throughput.
            block_size = self.best_block_size(after - before, data_block_len)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        # [line elided in this excerpt: presumably self.report_finish()]
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # [line elided in this excerpt: presumably a lazy-initialization flag
        #  consulted by initialize() -- TODO confirm]
        self.set_downloader(downloader)

    # [lines elided in this excerpt: '@staticmethod' and the
    #  'def suitable(url):' header for this docstring]
        """Receives a URL and returns True if suitable for this IE."""
        # [line elided in this excerpt: default body]

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # [lines elided in this excerpt: presumably a guard so that
        #  _real_initialize() runs only once per instance]
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # [line elided in this excerpt: presumably self.initialize() -- TODO confirm]
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # [line elided in this excerpt: pass]

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # [line elided in this excerpt: pass]
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1: optional URL prefix; group 2: the video id. The (?(1)...)
    # conditional only allows trailing junk when a prefix matched.
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Forces the English-language site so the scraping regexps below match.
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
    _video_extensions = {
        # [mapping entries elided in this excerpt: format code -> file
        #  extension; .get(..., 'flv') below supplies the default]

    # [lines elided in this excerpt: '@staticmethod' and the
    #  'def suitable(url):' header for this body]
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    # [decorator elided in this excerpt: presumably @staticmethod]
    def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character."""
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference, decimal (#160) or hex (#xA0) form.
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
        # [line elided in this excerpt: 'if mobj is not None:']
            numstr = mobj.group(1)
            if numstr.startswith(u'x'):
                # [line elided in this excerpt: presumably base = 16]
                # '0x...' form so long() can parse it with base 16.
                numstr = u'0%s' % numstr
            # [lines elided in this excerpt: else branch, presumably base = 10]
            return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_webpage_download(self, video_id):
        """Report attempt to download webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_video_url(self, video_id, video_real_url):
        """Report extracted video URL."""
        self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

    def report_unavailable_format(self, video_id, format):
        """Report that the requested video format is not available."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def _real_initialize(self):
        """Set site language, optionally log in, and confirm age."""
        if self._downloader is None:
            # [line elided in this excerpt: return]

        # [lines elided in this excerpt: presumably username/password
        #  defaulting to None]
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [line elided in this excerpt: 'try:']
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # [lines elided in this excerpt: unpack (login, account,
                #  password) when info is present, else:]
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: continue unauthenticated.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # [line elided in this excerpt: return]

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        # [lines elided in this excerpt: 'try:' and self.report_lang()]
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            # [line elided in this excerpt: return]

        # No authentication to be performed
        # [lines elided in this excerpt: early return when username is None]

        # [lines elided in this excerpt: 'login_form = {' opening and
        #  presumably a 'next' field]
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
            # [line elided in this excerpt: closing brace of login_form]
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        # [lines elided in this excerpt: 'try:' and self.report_login()]
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                # The login form being served again means the credentials
                # were rejected.
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                # [line elided in this excerpt: return]
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # [line elided in this excerpt: return]

        # [lines elided in this excerpt: 'age_form = {' opening and
        #  presumably a 'next_url' field]
            'action_confirm': 'Confirm',
            # [line elided in this excerpt: closing brace of age_form]
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        # [line elided in this excerpt: 'try:']
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Unlike the warnings above, a failed age confirmation is routed
            # through trouble(), which may raise DownloadError.
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # [line elided in this excerpt: return]

    def _real_extract(self, url):
        """Scrape the watch page for the real video URL and metadata."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # [line elided in this excerpt: 'if mobj is None:']
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            # [line elided in this excerpt: return]
        video_id = mobj.group(2)

        # Downloader parameters
        # [lines elided in this excerpt: presumably defaults for format_param
        #  and quality_index]
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            if format_param == '0':
                # Best-quality mode (-b): start at the top of the priority
                # list; UnavailableFormatError (handled below) steps down.
                format_param = self._available_formats[quality_index]

        # [lines elided in this excerpt: retry-loop header around the rest]
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Normalize URL, including format
            normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
            if format_param is not None:
                normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
            request = urllib2.Request(normalized_url, None, std_headers)
            # [line elided in this excerpt: 'try:']
                self.report_webpage_download(video_id)
                video_webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                # [line elided in this excerpt: return]
            self.report_information_extraction(video_id)

            # The "t" parameter is the per-request token get_video requires.
            mobj = re.search(r', "t": "([^"]+)"', video_webpage)
            # [line elided in this excerpt: 'if mobj is None:']
                self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
                # [line elided in this excerpt: return]
            video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
            if format_param is not None:
                video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
            self.report_video_url(video_id, video_real_url)

            # Uploader nickname
            mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
            # [line elided in this excerpt: 'if mobj is None:']
                self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                # [line elided in this excerpt: return]
            video_uploader = mobj.group(1)

            # Title, decoded and HTML-entity-expanded
            mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
            # [line elided in this excerpt: 'if mobj is None:']
                self._downloader.trouble(u'ERROR: unable to extract video title')
                # [line elided in this excerpt: return]
            video_title = mobj.group(1).decode('utf-8')
            video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
            # os.sep in a title would split the output filename into
            # directories; neutralize it.
            video_title = video_title.replace(os.sep, u'%')

            # Simplified title: collapse anything outside simple_title_chars.
            simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
            simple_title = simple_title.strip(ur'_')

            # Process video information
            # [line elided in this excerpt: 'try:']
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                # [lines elided in this excerpt: closing brace and presumably
                #  a return ending the successful path]
            except UnavailableFormatError, err:
                # Requested format could not be downloaded; in best-quality
                # mode fall through to the next format in the priority list.
                if quality_index == len(self._available_formats) - 1:
                    # I don't ever expect this to happen
                    self._downloader.trouble(u'ERROR: no known formats available for video')
                    # [line elided in this excerpt: return]
                # [lines elided in this excerpt: branch taken only in
                #  best-quality mode]
                    self.report_unavailable_format(video_id, format_param)
                    # [line elided in this excerpt: quality_index increment]
                    format_param = self._available_formats[quality_index]
                # [lines elided in this excerpt: else branch for an explicitly
                #  requested format]
                    self._downloader.trouble('ERROR: format not available for video')
                    # [line elided in this excerpt: return]
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1: video id; group 2: URL-simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        # Keeps a YoutubeIE around: metacafe hosts some videos on YouTube
        # ('yt-' prefixed ids) and delegates those to it.
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # [lines elided in this excerpt: '@staticmethod' and the
    #  'def suitable(url):' header for this body]
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
        # [lines elided in this excerpt: 'try:' and self.report_disclaimer()]
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            # [line elided in this excerpt: return]

        # [lines elided in this excerpt: 'disclaimer_form = {' opening and
        #  presumably a filters field]
            'submit': "Continue - I'm over 18",
            # [line elided in this excerpt: closing brace of disclaimer_form]
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
        # [line elided in this excerpt: 'try:']
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # [line elided in this excerpt: return]

    def _real_extract(self, url):
        """Scrape a metacafe watch page for the media URL and metadata."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [line elided in this excerpt: 'if mobj is None:']
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            # [line elided in this excerpt: return]

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted videos to the YouTube extractor.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            # [line elided in this excerpt: return]

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # NOTE(review): unlike the other requests, this one does not pass
        # std_headers -- possibly unintentional; TODO confirm.
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # [line elided in this excerpt: 'try:']
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            # [line elided in this excerpt: return]

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
        # [line elided in this excerpt: 'if mobj is None:']
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            # [line elided in this excerpt: return]
        mediaURL = urllib.unquote(mobj.group(1))

        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # [line elided in this excerpt: 'if mobj is None:']
            self._downloader.trouble(u'ERROR: unable to extract gdaKey')
            # [line elided in this excerpt: return]
        gdaKey = mobj.group(1)

        # The gdaKey access token must be appended to the media URL.
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # [line elided in this excerpt: 'if mobj is None:']
            self._downloader.trouble(u'ERROR: unable to extract title')
            # [line elided in this excerpt: return]
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
        # [line elided in this excerpt: 'if mobj is None:']
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # [line elided in this excerpt: return]
        video_uploader = mobj.group(1)

        # Process video information
        # [line elided in this excerpt: 'try:']
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            # [line elided in this excerpt: closing brace]
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Accepts 'ytsearch:term', 'ytsearchN:term' and 'ytsearchall:term'.
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'>Next</a>'
    # Hard cap on how many results a single search may request.
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie  # per-video extraction is delegated

    # [lines elided in this excerpt: '@staticmethod' and the
    #  'def suitable(url):' header for this body]
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'ytsearchN:term' prefix and download up to N results."""
        mobj = re.match(self._VALID_QUERY, query)
        # [line elided in this excerpt: 'if mobj is None:']
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            # [line elided in this excerpt: return]

        prefix, query = query.split(':')
        # [lines elided in this excerpt: branch for an empty prefix]
            self._download_n_results(query, 1)
            # [line elided in this excerpt: return]
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            # [line elided in this excerpt: return]
        # [lines elided in this excerpt: else branch with 'try:' and
        #  presumably 'n = long(prefix)' and a non-positive check]
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                # [line elided in this excerpt: return]
            elif n > self._max_youtube_results:
                # Clamp oversized requests instead of failing.
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            # [line elided in this excerpt: return]
        except ValueError: # parsing prefix as int fails
            # Non-numeric prefix: fall back to a single result.
            self._download_n_results(query, 1)
            # [line elided in this excerpt: return]

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # [lines elided in this excerpt: video_ids/already_seen/pagenum
        #  initialization and the result-page loop header]
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
            # [line elided in this excerpt: 'try:']
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                # [line elided in this excerpt: return]

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Pull the id out of href="/watch?v=ID" ([:-1] drops the
                # trailing double quote).
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        # [lines elided in this excerpt: loop over the ids]
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                        # [line elided in this excerpt: return]

            if self._MORE_PAGES_INDICATOR not in page:
                # No further result pages: download what was collected.
                # [lines elided in this excerpt: loop over the ids]
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                # [line elided in this excerpt: return]

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # %-template filled with (playlist_id, next page number); used as a
    # literal substring test below, not as a regex.
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie  # per-video extraction is delegated

    # [lines elided in this excerpt: '@staticmethod' and the
    #  'def suitable(url):' header for this body]
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect every video id in the playlist, then extract each video."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # [line elided in this excerpt: 'if mobj is None:']
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            # [line elided in this excerpt: return]

        # Download playlist pages
        playlist_id = mobj.group(1)
        # [lines elided in this excerpt: video_ids/pagenum initialization and
        #  the page loop header]
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
            # [line elided in this excerpt: 'try:']
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                # [line elided in this excerpt: return]

            # Extract video identifiers
            # [line elided in this excerpt: 'ids_in_page = []']
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Dedupe within the page while preserving order.
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop once the page no longer links to the next page.
            if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
                # [line elided in this excerpt: break]
            pagenum = pagenum + 1

        # [lines elided in this excerpt: loop over video_ids]
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        # [line elided in this excerpt: return]
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    PostProcessor in the chain.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader.
        """
        return information # by default, do nothing
1018 ### MAIN PROGRAM ###
1019 if __name__ == '__main__':
1021 # Modules needed only when running the main program
1025 # General configuration
1026 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1027 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1028 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1030 # Parse command line
# Build the optparse parser. conflict_handler='resolve' lets -h/-v be
# redefined below with custom help text instead of raising a conflict.
# NOTE(review): the embedded numbering jumps (1033, 1035-36 missing), so
# at least one OptionParser keyword argument (presumably version=...) is
# absent from this excerpt.
1031 parser = optparse.OptionParser(
1032 usage='Usage: %prog [options] url...',
1034 conflict_handler='resolve',
# General options: help/version re-declarations plus error tolerance and
# bandwidth limiting. ratelimit is parsed into a byte count later by
# FileDownloader.parse_bytes.
1037 parser.add_option('-h', '--help',
1038 action='help', help='print this help text and exit')
1039 parser.add_option('-v', '--version',
1040 action='version', help='print program version and exit')
1041 parser.add_option('-i', '--ignore-errors',
1042 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1043 parser.add_option('-r', '--rate-limit',
1044 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
# Authentication group: explicit username/password, or .netrc lookup.
# Mutual exclusion between the two is enforced after parse_args below.
1046 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1047 authentication.add_option('-u', '--username',
1048 dest='username', metavar='UN', help='account username')
1049 authentication.add_option('-p', '--password',
1050 dest='password', metavar='PW', help='account password')
1051 authentication.add_option('-n', '--netrc',
1052 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1053 parser.add_option_group(authentication)
# Video format group: -f appends to a list, while -b/-m/-d are
# store_const aliases writing a single format code string into the same
# dest; "at most one" is enforced after parse_args below.
1055 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1056 video_format.add_option('-f', '--format',
1057 action='append', dest='format', metavar='FMT', help='video format code')
1058 video_format.add_option('-b', '--best-quality',
1059 action='store_const', dest='format', help='download the best quality video possible', const='0')
1060 video_format.add_option('-m', '--mobile-version',
1061 action='store_const', dest='format', help='alias for -f 17', const='17')
1062 video_format.add_option('-d', '--high-def',
1063 action='store_const', dest='format', help='alias for -f 22', const='22')
1064 parser.add_option_group(video_format)
# Verbosity/simulation group: -g and -e imply both quiet and simulate
# (combined when building the FileDownloader parameters below).
1066 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1067 verbosity.add_option('-q', '--quiet',
1068 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1069 verbosity.add_option('-s', '--simulate',
1070 action='store_true', dest='simulate', help='do not download video', default=False)
1071 verbosity.add_option('-g', '--get-url',
1072 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1073 verbosity.add_option('-e', '--get-title',
1074 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1075 parser.add_option_group(verbosity)
# Filesystem group: output naming (-t/-l/-o are mutually exclusive,
# checked below), batch input file, and overwrite protection.
1077 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1078 filesystem.add_option('-t', '--title',
1079 action='store_true', dest='usetitle', help='use title in file name', default=False)
1080 filesystem.add_option('-l', '--literal',
1081 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1082 filesystem.add_option('-o', '--output',
1083 dest='outtmpl', metavar='TPL', help='output filename template')
1084 filesystem.add_option('-a', '--batch-file',
1085 dest='batchfile', metavar='F', help='file containing URLs to download')
1086 filesystem.add_option('-w', '--no-overwrites',
1087 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1088 parser.add_option_group(filesystem)
# opts: parsed option values; args: positional URLs from the command line.
1090 (opts, args) = parser.parse_args()
1092 # Batch file verification
# Read URLs from the -a batch file (one per line, stripped, empty lines
# dropped) and prepend them to the command-line URLs.
# NOTE(review): the embedded numbering skips 1095 and 1099 — the `try:`
# and `except` (presumably IOError) lines wrapping this read are missing
# from this excerpt; the sys.exit below is the except handler's body.
1094 if opts.batchfile is not None:
1096 batchurls = open(opts.batchfile, 'r').readlines()
1097 batchurls = [x.strip() for x in batchurls]
1098 batchurls = [x for x in batchurls if len(x) > 0]
1100 sys.exit(u'ERROR: batch file could not be read')
1101 all_urls = batchurls + args
1103 # Conflicting, missing and erroneous options
# Fail fast on invalid option combinations; parser.error prints usage
# and exits. A username without a password triggers an interactive
# getpass prompt instead of an error.
1104 if len(all_urls) < 1:
1105 parser.error(u'you must provide at least one URL')
1106 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1107 parser.error(u'using .netrc conflicts with giving username/password')
1108 if opts.password is not None and opts.username is None:
1109 parser.error(u'account username missing')
1110 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1111 parser.error(u'using output template conflicts with using title or literal title')
1112 if opts.usetitle and opts.useliteral:
1113 parser.error(u'using title conflicts with using literal title')
1114 if opts.username is not None and opts.password is None:
1115 opts.password = getpass.getpass(u'Type account password and press return:')
# Convert the human-readable rate limit (e.g. '50k') to a byte count.
1116 if opts.ratelimit is not None:
1117 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1118 if numeric_limit is None:
1119 parser.error(u'invalid rate limit specified')
1120 opts.ratelimit = numeric_limit
# -f can be given repeatedly (action='append'); only a single format
# selection is supported, so more than one entry is an error.
1121 if opts.format is not None and len(opts.format) > 1:
1122 parser.error(u'pass at most one of the video format option flags (-f, -b, -m, -d)')
# NOTE(review): numbering skips 1124-1125 — the `real_format = None`
# branch and the `else:` line appear to be missing from this excerpt;
# line 1126 is the else-branch body picking the single requested format.
1123 if opts.format is None:
1126 real_format = opts.format[0]
1129 # Information extractors
# Construct the info extractors. MetacafeIE, YoutubePlaylistIE and
# YoutubeSearchIE all delegate individual-video extraction to the shared
# YoutubeIE instance, so it is created first and passed to each.
1130 youtube_ie = YoutubeIE()
1131 metacafe_ie = MetacafeIE(youtube_ie)
1132 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1133 youtube_search_ie = YoutubeSearchIE(youtube_ie)
# Build the downloader with the parameter dict derived from the parsed
# options. -g/-e imply both quiet and simulate. The outtmpl chain picks,
# in order: an explicit -o template (decoded from the locale encoding),
# the sanitized-title template for -t, the literal-title template for
# -l, or the bare id default.
# NOTE(review): numbering skips 1134-35 and 1152 — the closing `})` of
# this FileDownloader(...) call is missing from this excerpt.
1136 fd = FileDownloader({
1137 'usenetrc': opts.usenetrc,
1138 'username': opts.username,
1139 'password': opts.password,
1140 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1141 'forceurl': opts.geturl,
1142 'forcetitle': opts.gettitle,
1143 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1144 'format': real_format,
1145 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1146 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1147 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1148 or u'%(id)s.%(ext)s'),
1149 'ignoreerrors': opts.ignoreerrors,
1150 'ratelimit': opts.ratelimit,
1151 'nooverwrites': opts.nooverwrites,
# Registration order matters: extractors are tried in order, so the
# more specific search/playlist/metacafe matchers are registered before
# the generic YoutubeIE.
1153 fd.add_info_extractor(youtube_search_ie)
1154 fd.add_info_extractor(youtube_pl_ie)
1155 fd.add_info_extractor(metacafe_ie)
1156 fd.add_info_extractor(youtube_ie)
1157 retcode = fd.download(all_urls)
# Top-level error handling for the surrounding try: (its opening is
# outside this excerpt). Each handler exits with a message; DownloadError
# needs no message because the downloader already reported the failure.
# NOTE(review): numbering skips 1158-59 and 1161 — the sys.exit(...)
# body of the DownloadError handler is missing from this excerpt.
1160 except DownloadError:
1162 except SameFileError:
1163 sys.exit(u'ERROR: fixed output name but more than one file to download')
1164 except KeyboardInterrupt:
1165 sys.exit(u'\nERROR: Interrupted by user')