2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
# Standard HTTP headers sent with every request. A desktop-browser User-Agent
# avoids being served mobile or degraded pages; the rest mimic Firefox 3.
# NOTE(review): the dict assignment wrapper was missing around these entries
# in the reviewed revision; std_headers is referenced by every request below.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed to survive in "simplified" titles; everything else is
# collapsed to '_' (see YoutubeIE._real_extract).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor returns
	all the information to the FileDownloader and the latter downloads the
	file or does whatever it's instructed to do.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username: Username for authentication purposes.
	password: Password for authentication purposes.
	usenetrc: Use netrc for authentication instead.
	quiet: Do not print messages to stdout.
	forceurl: Force printing final URL.
	forcetitle: Force printing title.
	simulate: Do not download the video files.
	format: Video format code.
	outtmpl: Template for output names.
	ignoreerrors: Do not stop on download errors.
	ratelimit: Download speed limit, in bytes/sec.
	nooverwrites: Prevent overwriting files.
	"""

	# Process exit code: reset to 0 by __init__, set to 1 by trouble() when an
	# error is being ignored; returned by download().
	_download_retcode = None
100 def __init__(self, params):
101 """Create a FileDownloader object with the given options."""
104 self._download_retcode = 0
108 def pmkdir(filename):
109 """Create directory components in filename. Similar to Unix "mkdir -p"."""
110 components = filename.split(os.sep)
111 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
112 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
113 for dir in aggregate:
114 if not os.path.exists(dir):
118 def format_bytes(bytes):
124 exponent = long(math.log(float(bytes), 1024.0))
125 suffix = 'bkMGTPEZY'[exponent]
126 converted = float(bytes) / float(1024**exponent)
127 return '%.2f%s' % (converted, suffix)
130 def calc_percent(byte_counter, data_len):
133 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
136 def calc_eta(start, now, total, current):
140 if current == 0 or dif < 0.001: # One millisecond
142 rate = float(current) / dif
143 eta = long((float(total) - float(current)) / rate)
144 (eta_mins, eta_secs) = divmod(eta, 60)
147 return '%02d:%02d' % (eta_mins, eta_secs)
150 def calc_speed(start, now, bytes):
152 if bytes == 0 or dif < 0.001: # One millisecond
153 return '%10s' % '---b/s'
154 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
157 def best_block_size(elapsed_time, bytes):
158 new_min = max(bytes / 2.0, 1.0)
159 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
160 if elapsed_time < 0.001:
162 rate = bytes / elapsed_time
170 def parse_bytes(bytestr):
171 """Parse a string indicating a byte quantity into a long integer."""
172 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
175 number = float(matchobj.group(1))
176 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
177 return long(round(number * multiplier))
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		# NOTE(review): only the back-reference is set here -- the append onto
		# the internal extractor list appears to be missing. Confirm.
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		# NOTE(review): as above, the chain append appears to be missing.
		pp.set_downloader(self)
	def to_stdout(self, message, skip_eol=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol=True leaves the cursor on the same line, which lets
		report_progress() rewrite it with '\r'.
		"""
		if not self.params.get('quiet', False):
			# Trailing comma suppresses print's newline; the indexing picks
			# u'' when skip_eol is truthy, u'\n' otherwise.
			print u'%s%s' % (message, [u'\n', u''][skip_eol]),

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message
	def fixed_template(self):
		"""Checks if the output template is fixed.

		A template is "fixed" when it contains no %(...)s substitutions, so
		every download would land in the same file (see SameFileError).
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Error ignored: remember it so download() returns a non-zero code.
		self._download_retcode = 1
216 def slow_down(self, start_time, byte_counter):
217 """Sleep if the download speed is over the rate limit."""
218 rate_limit = self.params.get('ratelimit', None)
219 if rate_limit is None or byte_counter == 0:
222 elapsed = now - start_time
225 speed = float(byte_counter) / elapsed
226 if speed > rate_limit:
227 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		# '\r' + skip_eol rewrites the current terminal line in place.
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_finish(self):
		"""Report download finished."""
		# NOTE(review): body appears truncated in this revision -- presumably
		# it terminates the progress line. Confirm against history.
242 def process_info(self, info_dict):
243 """Process a single dictionary returned by an InfoExtractor."""
245 if self.params.get('forcetitle', False):
246 print info_dict['title']
247 if self.params.get('forceurl', False):
248 print info_dict['url']
250 # Do nothing else if in simulate mode
251 if self.params.get('simulate', False):
255 filename = self.params['outtmpl'] % info_dict
256 self.report_destination(filename)
257 except (ValueError, KeyError), err:
258 self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
259 if self.params['nooverwrites'] and os.path.exists(filename):
260 self.to_stderr('WARNING: file exists: %s; skipping' % filename)
263 self.pmkdir(filename)
264 except (OSError, IOError), err:
265 self.trouble('ERROR: unable to create directories: %s' % str(err))
268 outstream = open(filename, 'wb')
269 except (OSError, IOError), err:
270 self.trouble('ERROR: unable to open for writing: %s' % str(err))
273 self._do_download(outstream, info_dict['url'])
275 except (OSError, IOError), err:
276 self.trouble('ERROR: unable to write video data: %s' % str(err))
278 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
279 self.trouble('ERROR: unable to download video data: %s' % str(err))
282 self.post_process(filename, info_dict)
283 except (PostProcessingError), err:
284 self.trouble('ERROR: postprocessing: %s' % str(err))
289 def download(self, url_list):
290 """Download a given list of URLs."""
291 if len(url_list) > 1 and self.fixed_template():
292 raise SameFileError(self.params['outtmpl'])
295 suitable_found = False
297 # Go to next InfoExtractor if not suitable
298 if not ie.suitable(url):
301 # Suitable InfoExtractor found
302 suitable_found = True
304 # Extract information from URL
305 all_results = ie.extract(url)
306 results = [x for x in all_results if x is not None]
308 # See if there were problems extracting any information
309 if len(results) != len(all_results):
312 # Two results could go to the same file
313 if len(results) > 1 and self.fixed_template():
314 raise SameFileError(self.params['outtmpl'])
316 # Process each result
317 for result in results:
318 self.process_info(result)
320 # Suitable InfoExtractor had been found; go to next URL
323 if not suitable_found:
324 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
326 return self._download_retcode
	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		# NOTE(review): 'info' is never initialized in this revision --
		# presumably a copy of ie_info -- and the loop feeding each
		# PostProcessor.run() appears to be missing. Confirm against history.
		info['filepath'] = filename
337 def _do_download(self, stream, url):
338 request = urllib2.Request(url, None, std_headers)
339 data = urllib2.urlopen(request)
340 data_len = data.info().get('Content-length', None)
341 data_len_str = self.format_bytes(data_len)
347 percent_str = self.calc_percent(byte_counter, data_len)
348 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
349 speed_str = self.calc_speed(start, time.time(), byte_counter)
350 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
354 data_block = data.read(block_size)
356 data_block_len = len(data_block)
357 if data_block_len == 0:
359 byte_counter += data_block_len
360 stream.write(data_block)
361 block_size = self.best_block_size(after - before, data_block_len)
364 self.slow_down(start, byte_counter)
367 if data_len is not None and str(byte_counter) != data_len:
368 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. It is returned in a list of dictionaries when
	calling its extract() method. It is a list because a URL can refer to
	more than one video (think of playlists). The dictionaries must include
	the following fields:

	id: Video identifier.
	url: Final video URL.
	uploader: Nickname of the video uploader.
	title: Literal title.
	stitle: Simplified title.
	ext: Video filename extension.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		# NOTE(review): a readiness flag guarding initialize() is presumably
		# set here in the full source -- confirm against history.
		self.set_downloader(downloader)

	# NOTE(review): the 'def suitable(url):' header belonging to the following
	# docstring appears to be missing from this revision.
	"""Receives a URL and returns True if suitable for this IE."""

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		self._real_initialize()

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches bare video ids as well as /v/ and watch?v= URLs; group 2 is the
	# video id (the (?(1)...) conditional only applies when a URL prefix matched).
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Forces the English interface so the scraping regexes stay stable.
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc when 'usenetrc' is set.
	_NETRC_MACHINE = 'youtube'
441 return (re.match(YoutubeIE._VALID_URL, url) is not None)
444 def htmlentity_transform(matchobj):
445 """Transforms an HTML entity to a Unicode character."""
446 entity = matchobj.group(1)
448 # Known non-numeric HTML entity
449 if entity in htmlentitydefs.name2codepoint:
450 return unichr(htmlentitydefs.name2codepoint[entity])
453 mobj = re.match(ur'(?u)#(x?\d+)', entity)
455 numstr = mobj.group(1)
456 if numstr.startswith(u'x'):
458 numstr = u'0%s' % numstr
461 return unichr(long(numstr, base))
463 # Unknown entity in name, return its literal representation
464 return (u'&%s;' % entity)
	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_webpage_download(self, video_id):
		"""Report attempt to download the watch page of a video."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_video_url(self, video_id, video_real_url):
		"""Report the extracted real (downloadable) video URL."""
		self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
490 def _real_initialize(self):
491 if self._downloader is None:
496 downloader_params = self._downloader.params
498 # Attempt to use provided username and password or .netrc data
499 if downloader_params.get('username', None) is not None:
500 username = downloader_params['username']
501 password = downloader_params['password']
502 elif downloader_params.get('usenetrc', False):
504 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
509 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
510 except (IOError, netrc.NetrcParseError), err:
511 self._downloader.trouble(u'WARNING: parsing .netrc: %s' % str(err))
515 request = urllib2.Request(self._LANG_URL, None, std_headers)
518 urllib2.urlopen(request).read()
519 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
520 self._downloader.trouble(u'WARNING: unable to set language: %s' % str(err))
523 # No authentication to be performed
529 'current_form': 'loginForm',
531 'action_login': 'Log In',
532 'username': username,
533 'password': password,
535 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
538 login_results = urllib2.urlopen(request).read()
539 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
540 self._downloader.trouble(u'WARNING: unable to log in: bad username or password')
542 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
543 self._downloader.trouble(u'WARNING: unable to log in: %s' % str(err))
549 'action_confirm': 'Confirm',
551 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
553 self.report_age_confirmation()
554 age_results = urllib2.urlopen(request).read()
555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
556 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
559 def _real_extract(self, url):
560 # Extract video id from URL
561 mobj = re.match(self._VALID_URL, url)
563 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
565 video_id = mobj.group(2)
567 # Downloader parameters
569 if self._downloader is not None:
570 params = self._downloader.params
571 format_param = params.get('format', None)
578 }.get(format_param, 'flv')
580 # Normalize URL, including format
581 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
582 if format_param is not None:
583 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
584 request = urllib2.Request(normalized_url, None, std_headers)
586 self.report_webpage_download(video_id)
587 video_webpage = urllib2.urlopen(request).read()
588 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
589 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
591 self.report_information_extraction(video_id)
594 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
596 self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
598 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
599 if format_param is not None:
600 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
601 self.report_video_url(video_id, video_real_url)
604 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
606 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
608 video_uploader = mobj.group(1)
611 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
613 self._downloader.trouble(u'ERROR: unable to extract video title')
615 video_title = mobj.group(1).decode('utf-8')
616 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
617 video_title = video_title.replace(os.sep, u'%')
620 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
621 simple_title = simple_title.strip(ur'_')
623 # Process video information
625 'id': video_id.decode('utf-8'),
626 'url': video_real_url.decode('utf-8'),
627 'uploader': video_uploader.decode('utf-8'),
628 'title': video_title,
629 'stitle': simple_title,
630 'ext': video_extension.decode('utf-8'),
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the simplified-title slug from the URL.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Family-filter page fetched first so the session can bypass the filter.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Keeps a YoutubeIE around for yt-hosted videos."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
646 return (re.match(MetacafeIE._VALID_URL, url) is not None)
	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
664 def _real_initialize(self):
665 # Retrieve disclaimer
666 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
668 self.report_disclaimer()
669 disclaimer = urllib2.urlopen(request).read()
670 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
671 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
677 'submit': "Continue - I'm over 18",
679 request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
681 self.report_age_confirmation()
682 disclaimer = urllib2.urlopen(request).read()
683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
684 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
687 def _real_extract(self, url):
688 # Extract id and simplified title from URL
689 mobj = re.match(self._VALID_URL, url)
691 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
694 video_id = mobj.group(1)
696 # Check if video comes from YouTube
697 mobj2 = re.match(r'^yt-(.*)$', video_id)
698 if mobj2 is not None:
699 return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
701 simple_title = mobj.group(2).decode('utf-8')
702 video_extension = 'flv'
704 # Retrieve video webpage to extract further information
705 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
707 self.report_download_webpage(video_id)
708 webpage = urllib2.urlopen(request).read()
709 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
710 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
713 # Extract URL, uploader and title from webpage
714 self.report_extraction(video_id)
715 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
717 self._downloader.trouble(u'ERROR: unable to extract media URL')
719 mediaURL = mobj.group(1).replace('\\', '')
721 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
723 self._downloader.trouble(u'ERROR: unable to extract gdaKey')
725 gdaKey = mobj.group(1)
727 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
729 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
731 self._downloader.trouble(u'ERROR: unable to extract title')
733 video_title = mobj.group(1).decode('utf-8')
735 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
737 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
739 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
743 'id': video_id.decode('utf-8'),
744 'url': video_url.decode('utf-8'),
745 'uploader': video_uploader.decode('utf-8'),
746 'title': video_title,
747 'stitle': simple_title,
748 'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries."""
	# Queries look like "ytsearch:foo", "ytsearch10:foo" or "ytsearchall:foo".
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	# Anchor fragment marking one result on the results page.
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	# Present in the page while there is a further results page.
	_MORE_PAGES_INDICATOR = r'>Next</a>'
	# Hard cap applied to 'ytsearchall' and oversized numeric requests.
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Delegates per-video extraction to youtube_ie."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
767 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
	def report_download_page(self, query, pagenum):
		"""Report attempt to download the search results page with given number."""
		self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		# Language/login/age handling all lives in the YouTube extractor.
		self._youtube_ie.initialize()
776 def _real_extract(self, query):
777 mobj = re.match(self._VALID_QUERY, query)
779 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
782 prefix, query = query.split(':')
785 return self._download_n_results(query, 1)
786 elif prefix == 'all':
787 return self._download_n_results(query, self._max_youtube_results)
792 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
794 elif n > self._max_youtube_results:
795 self._downloader.trouble(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
796 n = self._max_youtube_results
797 return self._download_n_results(query, n)
798 except ValueError: # parsing prefix as int fails
799 return self._download_n_results(query, 1)
801 def _download_n_results(self, query, n):
802 """Downloads a specified number of results for a query"""
809 self.report_download_page(query, pagenum)
810 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
811 request = urllib2.Request(result_url, None, std_headers)
813 page = urllib2.urlopen(request).read()
814 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
815 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
818 # Extract video identifiers
819 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
820 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
821 if video_id not in already_seen:
822 video_ids.append(video_id)
823 already_seen.add(video_id)
824 if len(video_ids) == n:
825 # Specified n videos reached
828 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
831 if self._MORE_PAGES_INDICATOR not in page:
834 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
837 pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	# Group 1 is the playlist id.
	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
	_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
	# Group 1 is the video id of one playlist entry.
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	# Filled with (playlist_id, next_page); its presence means there is a
	# further playlist page to fetch.
	_MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Delegates per-video extraction to youtube_ie."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
854 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		# Language/login/age handling all lives in the YouTube extractor.
		self._youtube_ie.initialize()
863 def _real_extract(self, url):
864 # Extract playlist id
865 mobj = re.match(self._VALID_URL, url)
867 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
870 # Download playlist pages
871 playlist_id = mobj.group(1)
876 self.report_download_page(playlist_id, pagenum)
877 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
879 page = urllib2.urlopen(request).read()
880 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
881 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
884 # Extract video identifiers
886 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
887 if mobj.group(1) not in ids_in_page:
888 ids_in_page.append(mobj.group(1))
889 video_ids.extend(ids_in_page)
891 if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
893 pagenum = pagenum + 1
897 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	one.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		returned by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader.
		"""
		return information # by default, do nothing
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse
		import locale

		# General configuration
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line.
		# NOTE(review): the reviewed revision had unclosed OptionParser( and
		# FileDownloader({ calls and a dangling 'except DownloadError:'; the
		# try: wrapper, closers and final sys.exit were restored.
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			conflict_handler='resolve',
			)
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		parser.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		parser.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		parser.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		parser.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		parser.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		parser.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		parser.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		parser.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option('-f', '--format',
				dest='format', metavar='FMT', help='video format code')
		parser.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		parser.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		parser.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if len(all_urls) < 1:
			sys.exit(u'ERROR: you must provide at least one URL')
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			sys.exit(u'ERROR: account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			sys.exit(u'ERROR: using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			sys.exit(u'ERROR: using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				sys.exit(u'ERROR: invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)

		# Charset used to decode the output template given on the command line
		charset = locale.getpreferredencoding()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			})
		# Order matters: the more specific extractors must be tried first.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')