2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
# Fragment of the default-HTTP-headers dictionary sent with every request so
# the sites see a regular browser (Firefox 3) client.
# NOTE(review): the dict's opening/closing lines are elided in this view.
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'en-us,en;q=0.5',

# Characters allowed in "simplified" titles: ASCII letters and digits only,
# decoded to unicode so the title regex substitutions work on unicode text.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:      Username for authentication purposes.
    password:      Password for authentication purposes.
    usenetrc:      Use netrc for authentication instead.
    quiet:         Do not print messages to stdout.
    forceurl:      Force printing final URL.
    forcetitle:    Force printing title.
    simulate:      Do not download the video files.
    format:        Video format code.
    outtmpl:       Template for output names.
    ignoreerrors:  Do not stop on download errors.
    ratelimit:     Download speed limit, in bytes/sec.
    """
def __init__(self, params):
    """Create a FileDownloader object with the given options."""
    # NOTE(review): initialization of the InfoExtractor/PostProcessor
    # containers used by add_info_extractor()/add_post_processor()
    # appears elided in this view -- confirm against the full file.
    self.set_params(params)
def pmkdir(filename):
    """Create directory components in filename. Similar to Unix "mkdir -p".

    Each missing ancestor directory of `filename` is created in turn; the
    final path component is assumed to be the file itself and is not
    created.
    """
    components = filename.split(os.sep)
    # Ancestor paths: a, a/b, a/b/c ... (last component excluded).
    # `range` (not `xrange`) keeps this block runnable on Python 3 too;
    # the list is tiny, so the difference is irrelevant on Python 2.
    aggregate = [os.sep.join(components[0:x]) for x in range(1, len(components))]
    aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
    for dirname in aggregate:  # renamed from `dir` (shadowed the builtin)
        if not os.path.exists(dirname):
            # Parents were created by previous iterations, so a plain
            # mkdir is enough here. (The original `if` had no action.)
            os.mkdir(dirname)
def format_bytes(bytes):
    """Return a human-readable string for a byte count (e.g. '5.50k').

    Accepts None (unknown size: _do_download passes the raw
    Content-length header value, which may be missing) and returns 'N/A'
    for it. A zero count is reported as '0.00b'.
    """
    if bytes is None:
        return 'N/A'
    if float(bytes) == 0.0:
        # math.log() raises ValueError at 0; an empty download is plain bytes.
        exponent = 0
    else:
        # `int` instead of `long`: identical truncation, Python 3 friendly.
        exponent = int(math.log(float(bytes), 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024**exponent)
    return '%.2f%s' % (converted, suffix)
def calc_percent(byte_counter, data_len):
    """Return download progress as a 6-character percentage string.

    data_len is the Content-length header value (a numeric string) or
    None when the server did not send one; in that case, or for a zero
    length, no percentage can be computed and a placeholder is returned.
    """
    if data_len is None or float(data_len) == 0.0:
        return '---.-%'
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
def calc_eta(start, now, total, current):
    """Return the estimated remaining time as an 'MM:SS' string.

    start/now are timestamps in seconds; total/current are byte counts.
    NOTE(review): the lines computing `dif` (presumably now - start) and
    the placeholder returns for unknown totals are elided in this view.
    """
    if current == 0 or dif < 0.001: # One millisecond
    rate = float(current) / dif
    eta = long((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    return '%02d:%02d' % (eta_mins, eta_secs)
def calc_speed(start, now, bytes):
    """Return the download speed as a 10-character string (e.g. ' 1.23Mb/s').

    start/now are timestamps in seconds; bytes is the amount downloaded
    since start. Returns a placeholder when nothing was downloaded or
    the interval is too small to be meaningful.
    """
    dif = now - start  # was referenced but never assigned in this block
    if bytes == 0 or dif < 0.001: # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
def best_block_size(elapsed_time, bytes):
    """Choose the next read block size from how long the last block took.

    NOTE(review): the return statements (fallback when elapsed_time is
    tiny, and the clamping of `rate` between new_min and new_max) are
    elided in this view.
    """
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
    if elapsed_time < 0.001:
    rate = bytes / elapsed_time
def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer.

    Accepts a decimal number with an optional single-letter suffix
    (k, M, G, ... case-insensitive). Returns None for invalid input;
    the rate-limit option handling checks for that explicitly.
    """
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    if matchobj is None:
        # Without this guard, .group() below would raise AttributeError.
        return None
    number = float(matchobj.group(1))
    # str.index('') returns 0, so an empty suffix yields 1024**0 == 1.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    # `int` instead of `long`: same value, Python 3 friendly.
    return int(round(number * multiplier))
def set_params(self, params):
    """Sets parameters.

    Raises ValueError if params is not a dictionary.
    """
    # isinstance() instead of `type(params) != dict`: idiomatic, and
    # additionally accepts dict subclasses (backward compatible).
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
def get_params(self):
    """Get parameters."""
    # NOTE(review): the return statement (presumably `return self._params`)
    # is elided in this view.
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list."""
    # NOTE(review): the append onto the internal extractor list is elided
    # in this view; only the mutual-registration half remains visible.
    ie.set_downloader(self)
def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain."""
    # NOTE(review): the append onto the internal postprocessor chain is
    # elided in this view; only the mutual-registration half remains.
    pp.set_downloader(self)
196 def to_stdout(self, message, skip_eol=False):
197 """Print message to stdout if not in quiet mode."""
198 if not self._params.get('quiet', False):
199 print u'%s%s' % (message, [u'\n', u''][skip_eol]),
def to_stderr(self, message):
    """Print message to stderr."""
    # Errors always go to stderr, regardless of the 'quiet' option.
    print >>sys.stderr, message
206 def fixed_template(self):
207 """Checks if the output template is fixed."""
208 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message. If it
    doesn't raise, it returns an error code suitable to be returned
    later as a program exit code to indicate error.
    """
    if message is not None:
        self.to_stderr(message)
    if not self._params.get('ignoreerrors', False):
        raise DownloadError(message)
    # NOTE(review): the non-raising return of the error code is elided
    # in this view.
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit.

    start_time is a timestamp (seconds); byte_counter is the number of
    bytes downloaded since then. The 'ratelimit' param is bytes/sec.
    """
    rate_limit = self._params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        # Nothing to throttle.
        # NOTE(review): the early `return` and the assignment of `now`
        # (presumably time.time()) are elided in this view.
    elapsed = now - start_time
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep just long enough for the average speed to drop to the limit.
        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def report_destination(self, filename):
    """Report destination filename."""
    message = u'[download] Destination: %s' % filename
    self.to_stdout(message)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress."""
    # '\r' rewrites the current terminal line; skip_eol keeps it open so
    # the next progress update overwrites it.
    message = u'\r[download] %s of %s at %s ETA %s' % (percent_str, data_len_str, speed_str, eta_str)
    self.to_stdout(message, skip_eol=True)
def report_finish(self):
    """Report download finished."""
    # NOTE(review): the body (terminating the in-place progress line) is
    # elided in this view.
def download(self, url_list):
    """Download a given list of URLs.

    Returns an error code usable as a program exit status. Raises
    SameFileError when several downloads would share one fixed output
    name, and (via trouble()) DownloadError when 'ignoreerrors' is off.

    NOTE(review): several lines of this method are elided in this view
    (retcode initialization, the loop headers over URLs and registered
    extractors, the try statements matching the except clauses below,
    and break/continue control lines); indentation is reconstructed.
    """
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self._params['outtmpl'])
    suitable_found = False
    if not ie.suitable(url):
    # Suitable InfoExtractor found
    suitable_found = True
    all_results = ie.extract(url)
    # Extractors signal per-video failure by returning None entries.
    results = [x for x in all_results if x is not None]
    if len(results) != len(all_results):
        retcode = self.trouble()
    if len(results) > 1 and self.fixed_template():
        raise SameFileError(self._params['outtmpl'])
    for result in results:
        # Forced printing modes (-e / -g).
        if self._params.get('forcetitle', False):
            print result['title']
        if self._params.get('forceurl', False):
        # Do nothing else if in simulate mode
        if self._params.get('simulate', False):
        filename = self._params['outtmpl'] % result
        self.report_destination(filename)
        except (ValueError, KeyError), err:
            retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
        self.pmkdir(filename)
        except (OSError, IOError), err:
            retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
        outstream = open(filename, 'wb')
        except (OSError, IOError), err:
            retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
        self._do_download(outstream, result['url'])
        except (OSError, IOError), err:
            retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
        self.post_process(filename, result)
        except (PostProcessingError), err:
            retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
    if not suitable_found:
        retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
def post_process(self, filename, ie_info):
    """Run the postprocessing chain on the given file."""
    # NOTE(review): the creation of `info` (presumably a copy of ie_info)
    # and the loop feeding it through the registered PostProcessors are
    # elided in this view.
    info['filepath'] = filename
def _do_download(self, stream, url):
    """Download `url` into the open file object `stream`, showing progress.

    NOTE(review): counter/timer initialization (byte_counter, start,
    block_size, before/after), the read-loop header and the loop exit are
    elided in this view; indentation is reconstructed.
    """
    request = urllib2.Request(url, None, std_headers)
    data = urllib2.urlopen(request)
    # Content-length may be absent; kept as a header string (or None).
    data_len = data.info().get('Content-length', None)
    data_len_str = self.format_bytes(data_len)
    percent_str = self.calc_percent(byte_counter, data_len)
    eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
    speed_str = self.calc_speed(start, time.time(), byte_counter)
    self.report_progress(percent_str, data_len_str, speed_str, eta_str)
    data_block = data.read(block_size)
    data_block_len = len(data_block)
    if data_block_len == 0:
        # NOTE(review): end-of-stream handling elided in this view.
    byte_counter += data_block_len
    stream.write(data_block)
    # Adapt the block size to the observed throughput of the last read.
    block_size = self.best_block_size(after - before, data_block_len)
    # Honor the 'ratelimit' option, if any.
    self.slow_down(start, byte_counter)
    # data_len is still the raw header string, hence the str() comparison.
    if data_len is not None and str(byte_counter) != data_len:
        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:        Video identifier.
    url:       Final video URL.
    uploader:  Nickname of the video uploader.
    title:     Literal title.
    stitle:    Simplified title.
    ext:       Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): initialization of the one-shot flag consumed by
    # initialize() appears elided in this view.
    self.set_downloader(downloader)

# NOTE(review): the `def suitable(url):` header belonging to the following
# docstring is elided in this view.
    """Receives a URL and returns True if suitable for this IE."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # NOTE(review): the guard that makes initialization run only once
    # appears elided in this view.
    self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # NOTE(review): the call ensuring initialization has happened first
    # is elided in this view.
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Called by FileDownloader.add_info_extractor() as the second half of
    # the "mutual registration" described on FileDownloader.
    self._downloader = downloader
def to_stdout(self, message):
    """Print message to stdout if downloader is not in quiet mode."""
    if self._downloader is None or not self._downloader.get_params().get('quiet', False):
        # NOTE(review): the print statement itself is elided in this view.
def to_stderr(self, message):
    """Print message to stderr."""
    # Errors always go to stderr, regardless of the downloader's settings.
    print >>sys.stderr, message
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""

def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches bare video ids as well as the /v/ and watch?v= URL forms.
    # The conditional group (?(1)...) permits a trailing portion only when
    # the URL prefix (group 1) matched; otherwise the id must end the string.
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    # Visiting this URL makes YouTube persist English as the UI language.
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in the user's .netrc for credentials.
    _NETRC_MACHINE = 'youtube'

    # NOTE(review): the `def suitable(url):` header for the following
    # return statement is elided in this view.
    return (re.match(YoutubeIE._VALID_URL, url) is not None)
def _report(self, message):
    """Emit a '[youtube]'-prefixed status line through to_stdout()."""
    self.to_stdout(u'[youtube] %s' % message)

def report_lang(self):
    """Report attempt to set language."""
    self._report(u'Setting language')

def report_login(self):
    """Report attempt to log in."""
    self._report(u'Logging in')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._report(u'Confirming age')

def report_webpage_download(self, video_id):
    """Report attempt to download webpage."""
    self._report(u'%s: Downloading video webpage' % video_id)

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._report(u'%s: Extracting video information' % video_id)

def report_video_url(self, video_id, video_real_url):
    """Report extracted video URL."""
    self._report(u'%s: URL: %s' % (video_id, video_real_url))
def _real_initialize(self):
    """Set language, log in (when credentials exist) and confirm age.

    NOTE(review): several lines are elided in this view (returns, try
    statements, the netrc branch internals and the form-dict
    delimiters); indentation is reconstructed.
    """
    if self._downloader is None:
    downloader_params = self._downloader.get_params()

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            # A .netrc problem only disables login; it is not fatal.
            self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
    # No authentication to be performed

    # NOTE(review): this request uses _LOGIN_URL but the warning below
    # speaks of setting language -- _LANG_URL looks intended; confirm.
    request = urllib2.Request(self._LOGIN_URL, None, std_headers)
    urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.to_stderr(u'WARNING: unable to set language: %s' % str(err))

    # Log in: POST the signup-form fields.
    'current_form': 'loginForm',
    'action_login': 'Log In',
    'username': username,
    'password': password,
    request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
    login_results = urllib2.urlopen(request).read()
    # A response still containing the login form means authentication failed.
    if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
        self.to_stderr(u'WARNING: unable to log in: bad username or password')
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    # Confirm age.
    'action_confirm': 'Confirm',
    request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
    self.report_age_confirmation()
    age_results = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        # Unlike the steps above, this failure is reported as an ERROR.
        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
    """Extract id, real URL, uploader and title for a YouTube video.

    NOTE(review): the `if mobj is None:` guards, the `return [None]`
    error exits, try statements and the `return [{` wrapper around the
    final dictionary are elided in this view.
    """
    # Extract video id from URL
    mobj = re.match(self._VALID_URL, url)
    self.to_stderr(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(2)

    # Downloader parameters
    if self._downloader is not None:
        params = self._downloader.get_params()
        format_param = params.get('format', None)

    # Extension: format 18 -> mp4, 17 -> 3gp, anything else -> flv.
    video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')

    # Normalize URL, including format
    normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
    if format_param is not None:
        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
    request = urllib2.Request(normalized_url, None, std_headers)
    self.report_webpage_download(video_id)
    video_webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
    self.report_information_extraction(video_id)

    # The "t" value scraped from the page is required in the get_video URL.
    mobj = re.search(r', "t": "([^"]+)"', video_webpage)
    self.to_stderr(u'ERROR: unable to extract "t" parameter')
    video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
    if format_param is not None:
        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
    self.report_video_url(video_id, video_real_url)

    # Uploader nickname.
    mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
    self.to_stderr(u'ERROR: unable to extract uploader nickname')
    video_uploader = mobj.group(1)

    # Title, taken from the <title> tag with HTML entities decoded.
    mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
    self.to_stderr(u'ERROR: unable to extract video title')
    video_title = mobj.group(1).decode('utf-8')
    video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
    # os.sep cannot appear inside a filename component.
    video_title = video_title.replace(os.sep, u'%')

    # Simplified title: runs of non-alphanumerics collapsed to '_'.
    simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
    simple_title = simple_title.strip(ur'_')

    # Fields of the result dictionary (list-of-dicts wrapper elided).
    'id': video_id.decode('utf-8'),
    'url': video_real_url.decode('utf-8'),
    'uploader': video_uploader.decode('utf-8'),
    'title': video_title,
    'stitle': simple_title,
    'ext': video_extension.decode('utf-8'),
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Groups: (1) video id, (2) simplified title from the URL path.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Page used to disable the family filter (age confirmation).
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'

    def __init__(self, youtube_ie, downloader=None):
        """Keep the YoutubeIE used to delegate yt-prefixed Metacafe ids."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # NOTE(review): the `def suitable(url):` header for the following
    # return statement is elided in this view.
    return (re.match(MetacafeIE._VALID_URL, url) is not None)
def _report(self, message):
    """Emit a '[metacafe]'-prefixed status line through to_stdout()."""
    self.to_stdout(u'[metacafe] %s' % message)

def report_disclaimer(self):
    """Report disclaimer retrieval."""
    self._report(u'Retrieving disclaimer')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._report(u'Confirming age')

def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._report(u'%s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Report information extraction."""
    self._report(u'%s: Extracting information' % video_id)
def _real_initialize(self):
    """Retrieve the disclaimer page and disable the family filter.

    NOTE(review): try statements, `return` lines and the
    disclaimer_form dict delimiters are elided in this view.
    """
    # Retrieve disclaimer
    request = urllib2.Request(self._DISCLAIMER, None, std_headers)
    self.report_disclaimer()
    disclaimer = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

    # Confirm age: POST the family-filter form back to the site.
    'submit': "Continue - I'm over 18",
    request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
    self.report_age_confirmation()
    disclaimer = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
    """Extract URL, uploader and title for a Metacafe video.

    NOTE(review): the `if mobj is None:` guards, `return [None]` exits,
    try statements and the `return [{` wrapper are elided in this view.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    self.to_stderr(u'ERROR: invalid URL: %s' % url)

    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        # Embedded YouTube videos are delegated to the YouTube extractor.
        return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

    simple_title = mobj.group(2).decode('utf-8')
    video_extension = 'flv'

    # Retrieve video webpage to extract further information
    request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
    self.report_download_webpage(video_id)
    webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
    self.to_stderr(u'ERROR: unable to extract media URL')
    # The page JSON escapes slashes; strip the backslashes.
    mediaURL = mobj.group(1).replace('\\', '')

    mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
    self.to_stderr(u'ERROR: unable to extract gdaKey')
    gdaKey = mobj.group(1)

    # Join media URL and gdaKey into the final video URL.
    video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
    self.to_stderr(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
    self.to_stderr(u'ERROR: unable to extract uploader nickname')
    # Strip any markup from the submitter cell.
    video_uploader = re.sub(r'<.*?>', '', mobj.group(1))

    # Fields of the result dictionary (list-of-dicts wrapper elided).
    'id': video_id.decode('utf-8'),
    'url': video_url.decode('utf-8'),
    'uploader': video_uploader.decode('utf-8'),
    'title': video_title,
    'stitle': simple_title,
    'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Queries look like "ytsearch:words", "ytsearchN:words" or "ytsearchall:words".
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    # Substring present in a results page while more pages remain.
    _MORE_PAGES_INDICATOR = r'>Next</a>'

    def __init__(self, youtube_ie, downloader=None):
        """Keep the YoutubeIE used to extract each found video."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # NOTE(review): the `def suitable(url):` header for the following
    # return statement is elided in this view.
    return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Report attempt to download playlist page with given number."""
    message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
    self.to_stdout(message)
def _real_initialize(self):
    # Delegate to the YouTube extractor, which performs the actual setup.
    self._youtube_ie.initialize()
def _real_extract(self, query):
    """Parse the ytsearch prefix and dispatch to _download_n_results.

    NOTE(review): guard/return lines and the else/try branch converting
    a numeric prefix to `n` are elided in this view.
    """
    mobj = re.match(self._VALID_QUERY, query)
    self.to_stderr(u'ERROR: invalid search query "%s"' % query)

    prefix, query = query.split(':')
    # A bare 'ytsearch:' prefix means a single result.
    return self._download_n_results(query, 1)
    elif prefix == 'all':
        # -1 never equals len(video_ids), so all pages are consumed.
        return self._download_n_results(query, -1)
    self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
    return self._download_n_results(query, n)
    except ValueError: # parsing prefix as int fails
        # A non-numeric, non-'all' prefix falls back to one result.
        return self._download_n_results(query, 1)
def _download_n_results(self, query, n):
    """Downloads a specified number of results for a query.

    NOTE(review): accumulator initialization (video_ids, already_seen,
    pagenum, information), the page-loop header, the `return
    information` exits and the `for id in video_ids:` headers around
    the extract calls are elided in this view.
    """
    self.report_download_page(query, pagenum)
    result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
    request = urllib2.Request(result_url, None, std_headers)
    page = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))

    # Extract video identifiers
    for mobj in re.finditer(self._VIDEO_INDICATOR, page):
        # The match looks like href="/watch?v=ID": split on '=' and
        # drop the trailing quote to obtain the id.
        video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
        if video_id not in already_seen:
            video_ids.append(video_id)
            already_seen.add(video_id)
            if len(video_ids) == n:
                # Specified n videos reached
                information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))

    if self._MORE_PAGES_INDICATOR not in page:
        # Last results page: extract everything collected so far.
        information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))

    pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): the '.' in 'youtube.com' is unescaped here, so the
    # pattern is slightly laxer than intended.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # Plain substring template (checked with `in`, not a regex).
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'

    def __init__(self, youtube_ie, downloader=None):
        """Keep the YoutubeIE used to extract each playlist entry."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # NOTE(review): the `def suitable(url):` header for the following
    # return statement is elided in this view.
    return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
def report_download_page(self, playlist_id, pagenum):
    """Report attempt to download playlist page with given number."""
    message = u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)
    self.to_stdout(message)
def _real_initialize(self):
    # Delegate to the YouTube extractor, which performs the actual setup.
    self._youtube_ie.initialize()
def _real_extract(self, url):
    """Collect the video ids of a playlist and run YoutubeIE on each.

    NOTE(review): guard/return lines, accumulator initialization, the
    page-loop header and the final `for id in video_ids:` header are
    elided in this view.
    """
    # Extract playlist id
    mobj = re.match(self._VALID_URL, url)
    self.to_stderr(u'ERROR: invalid url: %s' % url)

    # Download playlist pages
    playlist_id = mobj.group(1)
    self.report_download_page(playlist_id, pagenum)
    request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
    page = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))

    # Extract video identifiers
    for mobj in re.finditer(self._VIDEO_INDICATOR, page):
        if mobj.group(1) not in ids_in_page:
            # De-duplicate per page while preserving first-seen order.
            ids_in_page.append(mobj.group(1))
    video_ids.extend(ids_in_page)

    # Stop when no link to the next page appears on the current one.
    if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
    pagenum = pagenum + 1

    information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    one.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._downloader = downloader
def to_stdout(self, message):
    """Print message to stdout if downloader is not in quiet mode."""
    if self._downloader is None or not self._downloader.get_params().get('quiet', False):
        # NOTE(review): the print statement itself is elided in this view.

def to_stderr(self, message):
    """Print message to stderr."""
    # Errors always go to stderr, regardless of the downloader's settings.
    print >>sys.stderr, message

def set_downloader(self, downloader):
    """Sets the downloader for this PP."""
    # Called by FileDownloader.add_post_processor() ("mutual registration").
    self._downloader = downloader
def run(self, information):
    """Run the PostProcessor.

    The "information" argument is a dictionary like the ones
    returned by InfoExtractors. The only difference is that this
    one has an extra field called "filepath" that points to the
    downloaded file.

    When this method returns None, the postprocessing chain is
    stopped. However, this method may return an information
    dictionary that will be passed to the next postprocessing
    object in the chain. It can be the one it received after
    changing some fields.

    In addition, this method may raise a PostProcessingError
    exception that will be taken into account by the downloader.
    """
    return information # by default, do nothing
if __name__ == '__main__':
    # NOTE(review): the try statement wrapping the main body and the
    # script-only imports are elided in this view; indentation is
    # reconstructed.

    # Modules needed only when running the main program

    # General configuration
    urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
    # NOTE(review): install_opener() replaces the global opener, so this
    # second call discards the ProxyHandler installed just above --
    # confirm whether both handlers should share a single opener.
    urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    # Command-line option parser (-h and --version handled by optparse).
    parser = optparse.OptionParser(
        usage='Usage: %prog [options] url...',
        version='2009.01.31',
        conflict_handler='resolve',
    # NOTE(review): the closing parenthesis of the OptionParser(...) call
    # is elided in this view.
    parser.add_option('-h', '--help',
        action='help', help='print this help text and exit')
    parser.add_option('-v', '--version',
        action='version', help='print program version and exit')
    parser.add_option('-u', '--username',
        dest='username', metavar='UN', help='account username')
    parser.add_option('-p', '--password',
        dest='password', metavar='PW', help='account password')
    parser.add_option('-o', '--output',
        dest='outtmpl', metavar='TPL', help='output filename template')
    parser.add_option('-q', '--quiet',
        action='store_true', dest='quiet', help='activates quiet mode', default=False)
    parser.add_option('-s', '--simulate',
        action='store_true', dest='simulate', help='do not download video', default=False)
    parser.add_option('-t', '--title',
        action='store_true', dest='usetitle', help='use title in file name', default=False)
    parser.add_option('-l', '--literal',
        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    parser.add_option('-n', '--netrc',
        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
    parser.add_option('-g', '--get-url',
        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    parser.add_option('-e', '--get-title',
        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    parser.add_option('-f', '--format',
        dest='format', metavar='FMT', help='video format code')
    parser.add_option('-b', '--best-quality',
        action='store_const', dest='format', help='alias for -f 18', const='18')
    parser.add_option('-m', '--mobile-version',
        action='store_const', dest='format', help='alias for -f 17', const='17')
    parser.add_option('-i', '--ignore-errors',
        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    parser.add_option('-r', '--rate-limit',
        dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
    parser.add_option('-a', '--batch-file',
        dest='batchfile', metavar='F', help='file containing URLs to download')
    (opts, args) = parser.parse_args()
    # Batch file verification
    # NOTE(review): the initialization of `batchurls` and the try/except
    # statement guarding the file read are elided in this view.
    if opts.batchfile is not None:
        batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
        sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # Conflicting, missing and erroneous options
    if len(all_urls) < 1:
        sys.exit(u'ERROR: you must provide at least one URL')
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        sys.exit(u'ERROR: account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
    if opts.usetitle and opts.useliteral:
        sys.exit(u'ERROR: using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        # Ask interactively rather than requiring the password on the
        # command line.
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            sys.exit(u'ERROR: invalid rate limit specified')
        opts.ratelimit = numeric_limit

    # Information extractors
    youtube_ie = YoutubeIE()
    metacafe_ie = MetacafeIE(youtube_ie)
    youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
    youtube_search_ie = YoutubeSearchIE(youtube_ie)

    # Charset used to decode the user-supplied output template.
    charset = locale.getdefaultlocale()[1]
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
        'format': opts.format,
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
    # NOTE(review): the closing '})' of this call and the final
    # sys.exit(retcode) are elided in this view.
    # Registration order matters: the more specific extractors must get
    # first shot at a URL before the generic YoutubeIE.
    fd.add_info_extractor(youtube_search_ie)
    fd.add_info_extractor(youtube_pl_ie)
    fd.add_info_extractor(metacafe_ie)
    fd.add_info_extractor(youtube_ie)
    retcode = fd.download(all_urls)

    except DownloadError:
        # trouble() already wrote the message to stderr before raising.
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')