2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
# NOTE(review): extraction fragment — the enclosing `std_headers = {` opener
# and the closing `}` fall in a numbering gap; these entries are the default
# HTTP headers sent with every request.
22 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
23 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed to remain in "simplified" titles. str.decode() on a str
# is Python 2 (bytes -> unicode); this line is not valid Python 3.
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
# NOTE(review): class header, class docstring and __init__ of FileDownloader.
# The numbering gaps show missing lines (the docstring's "Available options:"
# heading and closing quotes, some option entries, and the __init__ body that
# presumably creates the extractor/postprocessor lists — TODO confirm against
# the full source).
55 class FileDownloader(object):
56 """File Downloader class.
58 File downloader objects are the ones responsible of downloading the
59 actual video file and writing it to disk if the user has requested
60 it, among some other tasks. In most cases there should be one per
61 program. As, given a video URL, the downloader doesn't know how to
62 extract all the needed information, task that InfoExtractors do, it
63 has to pass the URL to one of them.
65 For this, file downloader objects have a method that allows
66 InfoExtractors to be registered in a given order. When it is passed
67 a URL, the file downloader handles it to the first InfoExtractor it
68 finds that reports being able to handle it. The InfoExtractor returns
69 all the information to the FileDownloader and the latter downloads the
70 file or does whatever it's instructed to do.
72 File downloaders accept a lot of parameters. In order not to saturate
73 the object constructor with arguments, it receives a dictionary of
74 options instead. These options are available through the params
75 attribute for the InfoExtractors to use. The FileDownloader also
76 registers itself as the downloader in charge for the InfoExtractors
77 that are added to it, so this is a "mutual registration".
# The recognized keys of the params dictionary:
81 username: Username for authentication purposes.
82 password: Password for authentication purposes.
83 usenetrc: Use netrc for authentication instead.
84 quiet: Do not print messages to stdout.
85 forceurl: Force printing final URL.
86 forcetitle: Force printing title.
87 simulate: Do not download the video files.
88 format: Video format code.
89 outtmpl: Template for output names.
90 ignoreerrors: Do not stop on download errors.
91 ratelimit: Download speed limit, in bytes/sec.
92 nooverwrites: Prevent overwriting files.
99 def __init__(self, params):
100 """Create a FileDownloader object with the given options."""
# NOTE(review): the __init__ body (lines 101-105) is missing from this extract.
def pmkdir(filename):
    """Create directory components in filename. Similar to Unix "mkdir -p".

    Creates every missing path component leading up to the final element
    of *filename*; the final element itself (the file name) is not created.
    """
    components = filename.split(os.sep)
    # All proper prefixes of the path ('a', 'a/b', ...), each then
    # terminated with the separator so os.mkdir gets directory names.
    aggregate = [os.sep.join(components[0:x]) for x in range(1, len(components))]
    aggregate = ['%s%s' % (x, os.sep) for x in aggregate]  # Finish names with separator
    for dirname in aggregate:
        if not os.path.exists(dirname):
            # Reconstructed: the mkdir call fell in a numbering gap of the
            # extract; each prefix is created only if absent.
            os.mkdir(dirname)
def format_bytes(bytes):
    """Format a byte count as a short human-readable string, e.g. '1.00k'.

    Returns 'N/A' when the count is unknown (None). Accepts ints, floats
    or numeric strings (the Content-length header arrives as a string).
    """
    if bytes is None:
        return 'N/A'
    bytes = float(bytes)
    if bytes == 0.0:
        # math.log(0) raises ValueError; zero bytes is simply exponent 0.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = bytes / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def calc_percent(byte_counter, data_len):
    """Return a fixed-width (6-char) percentage string of a download.

    Returns the placeholder '---.-%' when the total length is unknown.
    """
    if data_len is None:
        return '---.-%'
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
def calc_eta(start, now, total, current):
    """Return an ETA string 'MM:SS' for a download in progress.

    Returns the placeholder '--:--' when the ETA cannot be computed
    (unknown total, nothing transferred yet, near-zero elapsed time)
    or would not fit in two minute digits.
    """
    if total is None:
        return '--:--'
    dif = now - start
    if current == 0 or dif < 0.001:  # One millisecond
        return '--:--'
    rate = float(current) / dif
    eta = int((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    if eta_mins > 99:
        # Would overflow the MM field of the fixed-width display.
        return '--:--'
    return '%02d:%02d' % (eta_mins, eta_secs)
def calc_speed(start, now, bytes):
    """Return a fixed-width (10-char) transfer-speed string, e.g. '1.00kb/s'.

    Returns the placeholder '---b/s' when no bytes have been transferred
    or the elapsed time is below one millisecond.
    """
    dif = now - start
    if bytes == 0 or dif < 0.001:  # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
def best_block_size(elapsed_time, bytes):
    """Suggest the next read size so each read takes roughly one second.

    The suggestion is clamped between half and double the previous block
    size (at least 1 byte) and never exceeds 4 MB.
    """
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
    if elapsed_time < 0.001:
        # Too fast to measure: jump straight to the upper bound.
        return int(new_max)
    rate = bytes / elapsed_time
    if rate > new_max:
        return int(new_max)
    if rate < new_min:
        return int(new_min)
    return int(rate)
def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer.

    Accepts an optional decimal part and an optional single-letter
    binary suffix (k, M, G, ... case-insensitive), e.g. '50k' -> 51200.
    Returns None when the string is not a valid quantity.
    """
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    if matchobj is None:
        return None
    number = float(matchobj.group(1))
    # 'b' (index 0) is the implicit suffix for a bare number.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    return int(round(number * multiplier))
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list.

    Also registers this downloader with the extractor ("mutual
    registration", see the class docstring).
    """
    # Reconstructed: the append fell in a numbering gap of this extract;
    # self._ies is presumably created in __init__ — TODO confirm against
    # the full source.
    self._ies.append(ie)
    ie.set_downloader(self)
def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain.

    Also registers this downloader with the postprocessor ("mutual
    registration", see the class docstring).
    """
    # Reconstructed: the append fell in a numbering gap of this extract;
    # self._pps is presumably created in __init__ — TODO confirm against
    # the full source.
    self._pps.append(pp)
    pp.set_downloader(self)
187 def to_stdout(self, message, skip_eol=False):
188 """Print message to stdout if not in quiet mode."""
189 if not self.params.get('quiet', False):
# Python 2 print statement; the list indexes u'\n' vs u'' by the boolean
# skip_eol, and the trailing comma suppresses print's own newline, so
# skip_eol=True leaves the cursor on the same line (used by report_progress).
190 print u'%s%s' % (message, [u'\n', u''][skip_eol]),
193 def to_stderr(self, message):
194 """Print message to stderr."""
# Python 2 "print chevron" syntax: writes message plus a newline to stderr.
195 print >>sys.stderr, message
197 def fixed_template(self):
198 """Checks if the output template is fixed."""
# "Fixed" means the template contains no %(field)s interpolations, so every
# download would be written to the same filename (used to reject multi-URL
# runs with a constant output name).
199 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
201 def trouble(self, message=None):
202 """Determine action to take when a download problem appears.
204 Depending on if the downloader has been configured to ignore
205 download errors or not, this method may throw an exception or
206 not when errors are found, after printing the message. If it
207 doesn't raise, it returns an error code suitable to be returned
208 later as a program exit code to indicate error.
# NOTE(review): the docstring's closing quotes and the `return <error code>`
# for the ignoreerrors case fell in numbering gaps of this extract.
210 if message is not None:
211 self.to_stderr(message)
212 if not self.params.get('ignoreerrors', False):
213 raise DownloadError(message)
216 def slow_down(self, start_time, byte_counter):
217 """Sleep if the download speed is over the rate limit."""
218 rate_limit = self.params.get('ratelimit', None)
# No limit configured, or nothing transferred yet: nothing to throttle.
219 if rate_limit is None or byte_counter == 0:
# NOTE(review): the early return for that case and the `now = time.time()`
# assignment fell in numbering gaps of this extract.
222 elapsed = now - start_time
225 speed = float(byte_counter) / elapsed
226 if speed > rate_limit:
# Sleep exactly long enough that byte_counter / total_elapsed == rate_limit.
227 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def report_destination(self, filename):
    """Announce the file the download will be written to."""
    message = u'[download] Destination: %s' % filename
    self.to_stdout(message)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Print an in-place progress line (percent, total size, speed, ETA).

    The leading carriage return rewrites the current terminal line, and
    skip_eol keeps the cursor there for the next update.
    """
    progress = u'\r[download] %s of %s at %s ETA %s' % (
        percent_str, data_len_str, speed_str, eta_str)
    self.to_stdout(progress, skip_eol=True)
238 def report_finish(self):
239 """Report download finished."""
# NOTE(review): the method body (line 240) fell in a numbering gap of this
# extract — presumably it emits the terminating newline after the \r-based
# progress line; TODO confirm against the full source.
242 def download(self, url_list):
243 """Download a given list of URLs."""
# NOTE(review): numbering gaps show that this method's scaffolding is missing
# from the extract: the `retcode` initialization, the `for url in url_list:`
# and `for ie in self._ies:` loop headers, the `continue`/`break` statements,
# the `try:` openers paired with the visible Python 2 `except X, err:`
# clauses, and the final `return retcode`. The visible lines are annotated
# below; do not treat the flat sequence as the real control flow.
245 if len(url_list) > 1 and self.fixed_template():
246 raise SameFileError(self.params['outtmpl'])
249 suitable_found = False
# Skip extractors that do not recognize this URL.
251 if not ie.suitable(url):
253 # Suitable InfoExtractor found
254 suitable_found = True
255 all_results = ie.extract(url)
# Extractors signal per-video failure with None entries; filter them out
# and record the trouble (which may raise DownloadError).
256 results = [x for x in all_results if x is not None]
257 if len(results) != len(all_results):
258 retcode = self.trouble()
260 if len(results) > 1 and self.fixed_template():
261 raise SameFileError(self.params['outtmpl'])
263 for result in results:
# Forced printing of title and/or URL (Python 2 print statements).
265 if self.params.get('forcetitle', False):
266 print result['title']
267 if self.params.get('forceurl', False):
270 # Do nothing else if in simulate mode
271 if self.params.get('simulate', False):
# Interpolate the output template with the result dictionary.
275 filename = self.params['outtmpl'] % result
276 self.report_destination(filename)
277 except (ValueError, KeyError), err:
278 retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
280 if self.params['nooverwrites'] and os.path.exists(filename):
281 self.to_stderr('WARNING: file exists: %s; skipping' % filename)
284 self.pmkdir(filename)
285 except (OSError, IOError), err:
286 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
289 outstream = open(filename, 'wb')
290 except (OSError, IOError), err:
291 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
294 self._do_download(outstream, result['url'])
296 except (OSError, IOError), err:
297 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
300 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
303 self.post_process(filename, result)
304 except (PostProcessingError), err:
305 retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
309 if not suitable_found:
310 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
314 def post_process(self, filename, ie_info):
315 """Run the postprocessing chain on the given file."""
# NOTE(review): the creation of `info` (presumably a copy of ie_info) and the
# loop that feeds it through self._pps, stopping when a PP returns None, fell
# in numbering gaps of this extract — TODO confirm against the full source.
317 info['filepath'] = filename
323 def _do_download(self, stream, url):
324 request = urllib2.Request(url, None, std_headers)
325 data = urllib2.urlopen(request)
# Content-length arrives as a *string* header value (or None); it is kept as
# a string, which is why the final check below compares str(byte_counter)
# against it.
326 data_len = data.info().get('Content-length', None)
327 data_len_str = self.format_bytes(data_len)
# NOTE(review): numbering gaps show the download loop's scaffolding is
# missing from this extract: the initialization of `byte_counter`,
# `block_size` and `start`, the loop header, the `before`/`after` timestamps
# bracketing each read, and the loop-exit `break`.
333 percent_str = self.calc_percent(byte_counter, data_len)
334 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
335 speed_str = self.calc_speed(start, time.time(), byte_counter)
336 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
340 data_block = data.read(block_size)
342 data_block_len = len(data_block)
# Zero-length read means EOF.
343 if data_block_len == 0:
345 byte_counter += data_block_len
346 stream.write(data_block)
# Adapt the next read size so each read takes roughly one second.
347 block_size = self.best_block_size(after - before, data_block_len)
# Apply the optional rate limit.
350 self.slow_down(start, byte_counter)
353 if data_len is not None and str(byte_counter) != data_len:
354 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
# NOTE(review): base class for all extractors. Numbering gaps show missing
# lines throughout: the end of the class docstring, the `suitable(url)` def
# line (only its docstring survives at line 390), the lazy-initialization
# guard in initialize()/extract() (presumably a `_ready` flag — TODO
# confirm), and the bodies of to_stdout/_real_initialize/_real_extract.
356 class InfoExtractor(object):
357 """Information Extractor class.
359 Information extractors are the classes that, given a URL, extract
360 information from the video (or videos) the URL refers to. This
361 information includes the real video URL, the video title and simplified
362 title, author and others. It is returned in a list of dictionaries when
363 calling its extract() method. It is a list because a URL can refer to
364 more than one video (think of playlists). The dictionaries must include
365 the following fields:
367 id: Video identifier.
368 url: Final video URL.
369 uploader: Nickname of the video uploader.
370 title: Literal title.
371 stitle: Simplified title.
372 ext: Video filename extension.
374 Subclasses of this one should re-define the _real_initialize() and
375 _real_extract() methods, as well as the suitable() static method.
376 Probably, they should also be instantiated and added to the main
383 def __init__(self, downloader=None):
384 """Constructor. Receives an optional downloader."""
386 self.set_downloader(downloader)
# NOTE(review): the `def suitable(url):` line for this docstring is missing.
390 """Receives a URL and returns True if suitable for this IE."""
393 def initialize(self):
394 """Initializes an instance (authentication, etc)."""
396 self._real_initialize()
399 def extract(self, url):
400 """Extracts URL information and returns it in list of dicts."""
402 return self._real_extract(url)
404 def set_downloader(self, downloader):
405 """Sets the downloader for this IE."""
406 self._downloader = downloader
408 def to_stdout(self, message):
409 """Print message to stdout if downloader is not in quiet mode."""
# NOTE(review): the print statement under this condition is missing.
410 if self._downloader is None or not self._downloader.params.get('quiet', False):
413 def to_stderr(self, message):
414 """Print message to stderr."""
# Python 2 "print chevron" syntax.
415 print >>sys.stderr, message
417 def _real_initialize(self):
418 """Real initialization process. Redefine in subclasses."""
421 def _real_extract(self, url):
422 """Real extraction process. Redefine in subclasses."""
# NOTE(review): YouTube extractor. Numbering gaps show missing lines
# throughout this class: `@staticmethod` decorators and def lines for
# suitable()/htmlentity_transform(), `try:` openers paired with the visible
# Python 2 `except X, err:` clauses, several `return`/`else:` branches, the
# netrc username/password unpacking, the format-code-to-extension dictionary
# opener (only its closing `}.get(...)` survives at line 573), and the final
# `return [ { ... } ]` brackets around the result dictionary.
425 class YoutubeIE(InfoExtractor):
426 """Information extractor for youtube.com."""
# Group 1 matches the URL prefix, group 2 the video id; the (?(1)...)
# conditional requires trailing text only when a prefix was present.
428 _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
429 _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
430 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
431 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
432 _NETRC_MACHINE = 'youtube'
# NOTE(review): the `def suitable(url):` line for this body is missing.
436 return (re.match(YoutubeIE._VALID_URL, url) is not None)
439 def htmlentity_transform(matchobj):
440 """Transforms an HTML entity to a Unicode character."""
441 entity = matchobj.group(1)
443 # Known non-numeric HTML entity
444 if entity in htmlentitydefs.name2codepoint:
445 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric character references: decimal (&#65;) or hexadecimal (&#x41;).
448 mobj = re.match(ur'(?u)#(x?\d+)', entity)
450 numstr = mobj.group(1)
451 if numstr.startswith(u'x'):
# Prefix with '0' so that e.g. 'x41' becomes '0x41' for long(..., 16).
453 numstr = u'0%s' % numstr
# NOTE(review): the assignments of `base` (16 vs 10) are missing here.
456 return unichr(long(numstr, base))
458 # Unknown entity in name, return its literal representation
459 return (u'&%s;' % entity)
461 def report_lang(self):
462 """Report attempt to set language."""
463 self.to_stdout(u'[youtube] Setting language')
465 def report_login(self):
466 """Report attempt to log in."""
467 self.to_stdout(u'[youtube] Logging in')
469 def report_age_confirmation(self):
470 """Report attempt to confirm age."""
471 self.to_stdout(u'[youtube] Confirming age')
473 def report_webpage_download(self, video_id):
474 """Report attempt to download webpage."""
475 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
477 def report_information_extraction(self, video_id):
478 """Report attempt to extract video information."""
479 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
481 def report_video_url(self, video_id, video_real_url):
482 """Report extracted video URL."""
483 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
# Sets the interface language and optionally logs in / confirms age before
# any extraction.
485 def _real_initialize(self):
486 if self._downloader is None:
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError), err:
506 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the English interface so the regexes below match.
510 request = urllib2.Request(self._LANG_URL, None, std_headers)
513 urllib2.urlopen(request).read()
514 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
515 self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
518 # No authentication to be performed
# NOTE(review): the `login_form = {` opener is missing above these entries.
524 'current_form': 'loginForm',
526 'action_login': 'Log In',
527 'username': username,
528 'password': password,
530 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
533 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
534 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
535 self.to_stderr(u'WARNING: unable to log in: bad username or password')
537 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
538 self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
544 'action_confirm': 'Confirm',
546 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
548 self.report_age_confirmation()
549 age_results = urllib2.urlopen(request).read()
550 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
551 self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
554 def _real_extract(self, url):
555 # Extract video id from URL
556 mobj = re.match(self._VALID_URL, url)
558 self.to_stderr(u'ERROR: invalid URL: %s' % url)
560 video_id = mobj.group(2)
562 # Downloader parameters
564 if self._downloader is not None:
565 params = self._downloader.params
566 format_param = params.get('format', None)
# NOTE(review): the format-code -> extension mapping dict opener is missing;
# only its closing `.get(format_param, 'flv')` fallback survives.
573 }.get(format_param, 'flv')
575 # Normalize URL, including format
576 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
577 if format_param is not None:
578 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
579 request = urllib2.Request(normalized_url, None, std_headers)
581 self.report_webpage_download(video_id)
582 video_webpage = urllib2.urlopen(request).read()
583 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
584 self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
586 self.report_information_extraction(video_id)
# The "t" token from the watch page is required by /get_video.
589 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
591 self.to_stderr(u'ERROR: unable to extract "t" parameter')
593 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
594 if format_param is not None:
595 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
596 self.report_video_url(video_id, video_real_url)
599 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
601 self.to_stderr(u'ERROR: unable to extract uploader nickname')
603 video_uploader = mobj.group(1)
606 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
608 self.to_stderr(u'ERROR: unable to extract video title')
610 video_title = mobj.group(1).decode('utf-8')
611 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
# os.sep cannot appear in a filename component; replace it.
612 video_title = video_title.replace(os.sep, u'%')
# Simplified title: runs of disallowed characters collapse to '_'.
615 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
616 simple_title = simple_title.strip(ur'_')
# NOTE(review): the `return [{` opener and `}]` closer around this result
# dictionary are missing from the extract.
620 'id': video_id.decode('utf-8'),
621 'url': video_real_url.decode('utf-8'),
622 'uploader': video_uploader.decode('utf-8'),
623 'title': video_title,
624 'stitle': simple_title,
625 'ext': video_extension.decode('utf-8'),
# NOTE(review): Metacafe extractor. Numbering gaps show missing lines: the
# `suitable()` def line, `try:` openers paired with the visible Python 2
# `except X, err:` clauses, the disclaimer_form dict opener, the
# `if mobj is None:` guards and `return None` statements around the error
# messages, and the `return [{` / `}]` brackets of the result dictionary.
628 class MetacafeIE(InfoExtractor):
629 """Information Extractor for metacafe.com."""
# Group 1 is the video id, group 2 the URL's simplified-title component.
631 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
632 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
# Delegates yt-prefixed videos to this YouTube extractor (see _real_extract).
635 def __init__(self, youtube_ie, downloader=None):
636 InfoExtractor.__init__(self, downloader)
637 self._youtube_ie = youtube_ie
641 return (re.match(MetacafeIE._VALID_URL, url) is not None)
643 def report_disclaimer(self):
644 """Report disclaimer retrieval."""
645 self.to_stdout(u'[metacafe] Retrieving disclaimer')
647 def report_age_confirmation(self):
648 """Report attempt to confirm age."""
649 self.to_stdout(u'[metacafe] Confirming age')
651 def report_download_webpage(self, video_id):
652 """Report webpage download."""
653 self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
655 def report_extraction(self, video_id):
656 """Report information extraction."""
657 self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
659 def _real_initialize(self):
660 # Retrieve disclaimer
661 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
663 self.report_disclaimer()
664 disclaimer = urllib2.urlopen(request).read()
665 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
666 self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# NOTE(review): the `disclaimer_form = {` opener is missing above this entry.
672 'submit': "Continue - I'm over 18",
674 request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
676 self.report_age_confirmation()
677 disclaimer = urllib2.urlopen(request).read()
678 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
679 self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
682 def _real_extract(self, url):
683 # Extract id and simplified title from URL
684 mobj = re.match(self._VALID_URL, url)
686 self.to_stderr(u'ERROR: invalid URL: %s' % url)
689 video_id = mobj.group(1)
691 # Check if video comes from YouTube
692 mobj2 = re.match(r'^yt-(.*)$', video_id)
693 if mobj2 is not None:
694 return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
696 simple_title = mobj.group(2).decode('utf-8')
697 video_extension = 'flv'
699 # Retrieve video webpage to extract further information
700 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
702 self.report_download_webpage(video_id)
703 webpage = urllib2.urlopen(request).read()
704 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
705 self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
708 # Extract URL, uploader and title from webpage
709 self.report_extraction(video_id)
710 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
712 self.to_stderr(u'ERROR: unable to extract media URL')
# The page embeds the URL with escaped slashes; strip the backslashes.
714 mediaURL = mobj.group(1).replace('\\', '')
716 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
718 self.to_stderr(u'ERROR: unable to extract gdaKey')
720 gdaKey = mobj.group(1)
722 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
724 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
726 self.to_stderr(u'ERROR: unable to extract title')
728 video_title = mobj.group(1).decode('utf-8')
730 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
732 self.to_stderr(u'ERROR: unable to extract uploader nickname')
# Strip any HTML tags from the submitter snippet.
734 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
738 'id': video_id.decode('utf-8'),
739 'url': video_url.decode('utf-8'),
740 'uploader': video_uploader.decode('utf-8'),
741 'title': video_title,
742 'stitle': simple_title,
743 'ext': video_extension.decode('utf-8'),
# NOTE(review): handles pseudo-URLs of the form "ytsearch:...",
# "ytsearchN:..." and "ytsearchall:...". Numbering gaps show missing lines:
# the `suitable()` def line, the `try: n = long(prefix)` around the numeric
# branch, the pagination loop header and accumulator initializations in
# _download_n_results, `return` statements, and `break`s.
747 class YoutubeSearchIE(InfoExtractor):
748 """Information Extractor for YouTube search queries."""
749 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
750 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
751 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
752 _MORE_PAGES_INDICATOR = r'>Next</a>'
# Cap honoured by the 'all'/large-N branches below.
754 _max_youtube_results = 1000
756 def __init__(self, youtube_ie, downloader=None):
757 InfoExtractor.__init__(self, downloader)
758 self._youtube_ie = youtube_ie
762 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
764 def report_download_page(self, query, pagenum):
765 """Report attempt to download playlist page with given number."""
766 self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
768 def _real_initialize(self):
769 self._youtube_ie.initialize()
771 def _real_extract(self, query):
772 mobj = re.match(self._VALID_QUERY, query)
774 self.to_stderr(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the actual query text.
777 prefix, query = query.split(':')
780 return self._download_n_results(query, 1)
781 elif prefix == 'all':
782 return self._download_n_results(query, self._max_youtube_results)
787 self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
789 elif n > self._max_youtube_results:
790 self.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
791 n = self._max_youtube_results
792 return self._download_n_results(query, n)
793 except ValueError: # parsing prefix as int fails
794 return self._download_n_results(query, 1)
796 def _download_n_results(self, query, n):
797 """Downloads a specified number of results for a query"""
804 self.report_download_page(query, pagenum)
805 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
806 request = urllib2.Request(result_url, None, std_headers)
808 page = urllib2.urlopen(request).read()
809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
810 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
813 # Extract video identifiers
814 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href out of the page, then pull the v= value (drop the
# trailing quote).
815 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
816 if video_id not in already_seen:
817 video_ids.append(video_id)
818 already_seen.add(video_id)
819 if len(video_ids) == n:
820 # Specified n videos reached
823 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
# No "Next" link: we have exhausted the result pages.
826 if self._MORE_PAGES_INDICATOR not in page:
829 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
832 pagenum = pagenum + 1
# NOTE(review): playlist extractor. Numbering gaps show missing lines: the
# `suitable()` def line, `return None` after the invalid-URL message, the
# pagination loop header and `video_ids`/`pagenum`/`information`
# initializations, `try:` openers, the `ids_in_page = []` reset, `break`,
# and the final `return information`.
834 class YoutubePlaylistIE(InfoExtractor):
835 """Information Extractor for YouTube playlists."""
837 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
838 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
839 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
# Plain substring (not regex) checked against the page; filled in with
# (playlist_id, next_page) below.
840 _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
843 def __init__(self, youtube_ie, downloader=None):
844 InfoExtractor.__init__(self, downloader)
845 self._youtube_ie = youtube_ie
849 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
851 def report_download_page(self, playlist_id, pagenum):
852 """Report attempt to download playlist page with given number."""
853 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
855 def _real_initialize(self):
856 self._youtube_ie.initialize()
858 def _real_extract(self, url):
859 # Extract playlist id
860 mobj = re.match(self._VALID_URL, url)
862 self.to_stderr(u'ERROR: invalid url: %s' % url)
865 # Download playlist pages
866 playlist_id = mobj.group(1)
871 self.report_download_page(playlist_id, pagenum)
872 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
874 page = urllib2.urlopen(request).read()
875 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
876 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
879 # Extract video identifiers
881 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Preserve playlist order while de-duplicating within the page.
882 if mobj.group(1) not in ids_in_page:
883 ids_in_page.append(mobj.group(1))
884 video_ids.extend(ids_in_page)
# Stop when the page carries no link to the next playlist page.
886 if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
888 pagenum = pagenum + 1
# Each id is delegated to the YouTube extractor for full information.
892 information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
# NOTE(review): numbering gaps show missing lines: parts of the class and
# run() docstrings (and their closing quotes), and the print statement under
# the quiet-mode condition in to_stdout.
895 class PostProcessor(object):
896 """Post Processor class.
898 PostProcessor objects can be added to downloaders with their
899 add_post_processor() method. When the downloader has finished a
900 successful download, it will take its internal chain of PostProcessors
901 and start calling the run() method on each one of them, first with
902 an initial argument and then with the returned value of the previous
905 The chain will be stopped if one of them ever returns None or the end
906 of the chain is reached.
908 PostProcessor objects follow a "mutual registration" process similar
909 to InfoExtractor objects.
914 def __init__(self, downloader=None):
915 self._downloader = downloader
917 def to_stdout(self, message):
918 """Print message to stdout if downloader is not in quiet mode."""
# NOTE(review): the print statement under this condition is missing.
919 if self._downloader is None or not self._downloader.params.get('quiet', False):
922 def to_stderr(self, message):
923 """Print message to stderr."""
# Python 2 "print chevron" syntax.
924 print >>sys.stderr, message
926 def set_downloader(self, downloader):
927 """Sets the downloader for this PP."""
928 self._downloader = downloader
930 def run(self, information):
931 """Run the PostProcessor.
933 The "information" argument is a dictionary like the ones
934 returned by InfoExtractors. The only difference is that this
935 one has an extra field called "filepath" that points to the
938 When this method returns None, the postprocessing chain is
939 stopped. However, this method may return an information
940 dictionary that will be passed to the next postprocessing
941 object in the chain. It can be the one it received after
942 changing some fields.
944 In addition, this method may raise a PostProcessingError
945 exception that will be taken into account by the downloader
948 return information # by default, do nothing
# NOTE(review): command-line entry point. Numbering gaps show missing lines:
# the main-only imports under the comment at line 953, the OptionParser
# version/add_help_option arguments, `all_urls = args` / `batchurls = []`
# defaults, the `try:` openers paired with the visible `except` clauses, the
# closing `})` of the FileDownloader params dict, and `sys.exit(retcode)`.
951 if __name__ == '__main__':
953 # Modules needed only when running the main program
957 # General configuration
# NOTE(review): the second install_opener replaces the first, so the
# explicitly-built ProxyHandler opener is discarded (build_opener adds a
# default ProxyHandler anyway, so this is presumably harmless — verify).
958 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
959 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
960 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
963 parser = optparse.OptionParser(
964 usage='Usage: %prog [options] url...',
966 conflict_handler='resolve',
968 parser.add_option('-h', '--help',
969 action='help', help='print this help text and exit')
970 parser.add_option('-v', '--version',
971 action='version', help='print program version and exit')
972 parser.add_option('-u', '--username',
973 dest='username', metavar='UN', help='account username')
974 parser.add_option('-p', '--password',
975 dest='password', metavar='PW', help='account password')
976 parser.add_option('-o', '--output',
977 dest='outtmpl', metavar='TPL', help='output filename template')
978 parser.add_option('-q', '--quiet',
979 action='store_true', dest='quiet', help='activates quiet mode', default=False)
980 parser.add_option('-s', '--simulate',
981 action='store_true', dest='simulate', help='do not download video', default=False)
982 parser.add_option('-t', '--title',
983 action='store_true', dest='usetitle', help='use title in file name', default=False)
984 parser.add_option('-l', '--literal',
985 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
986 parser.add_option('-n', '--netrc',
987 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
988 parser.add_option('-g', '--get-url',
989 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
990 parser.add_option('-e', '--get-title',
991 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
992 parser.add_option('-f', '--format',
993 dest='format', metavar='FMT', help='video format code')
994 parser.add_option('-m', '--mobile-version',
995 action='store_const', dest='format', help='alias for -f 17', const='17')
996 parser.add_option('-d', '--high-def',
997 action='store_const', dest='format', help='alias for -f 22', const='22')
998 parser.add_option('-i', '--ignore-errors',
999 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1000 parser.add_option('-r', '--rate-limit',
1001 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1002 parser.add_option('-a', '--batch-file',
1003 dest='batchfile', metavar='F', help='file containing URLs to download')
1004 parser.add_option('-w', '--no-overwrites',
1005 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1006 (opts, args) = parser.parse_args()
1008 # Batch file verification
1010 if opts.batchfile is not None:
1012 batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
1014 sys.exit(u'ERROR: batch file could not be read')
1015 all_urls = batchurls + args
1017 # Conflicting, missing and erroneous options
1018 if len(all_urls) < 1:
1019 sys.exit(u'ERROR: you must provide at least one URL')
1020 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1021 sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
1022 if opts.password is not None and opts.username is None:
1023 sys.exit(u'ERROR: account username missing')
1024 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1025 sys.exit(u'ERROR: using output template conflicts with using title or literal title')
1026 if opts.usetitle and opts.useliteral:
1027 sys.exit(u'ERROR: using title conflicts with using literal title')
1028 if opts.username is not None and opts.password is None:
1029 opts.password = getpass.getpass(u'Type account password and press return:')
1030 if opts.ratelimit is not None:
# parse_bytes turns '50k' / '44.6m' into a byte count; None means invalid.
1031 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1032 if numeric_limit is None:
1033 sys.exit(u'ERROR: invalid rate limit specified')
1034 opts.ratelimit = numeric_limit
1036 # Information extractors
1037 youtube_ie = YoutubeIE()
1038 metacafe_ie = MetacafeIE(youtube_ie)
1039 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1040 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1043 charset = locale.getdefaultlocale()[1]
1046 fd = FileDownloader({
1047 'usenetrc': opts.usenetrc,
1048 'username': opts.username,
1049 'password': opts.password,
1050 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1051 'forceurl': opts.geturl,
1052 'forcetitle': opts.gettitle,
1053 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1054 'format': opts.format,
# Python 2: str.decode(charset) converts the user-supplied template to
# unicode using the locale charset detected above.
1055 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
1056 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1057 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1058 or u'%(id)s.%(ext)s'),
1059 'ignoreerrors': opts.ignoreerrors,
1060 'ratelimit': opts.ratelimit,
1061 'nooverwrites': opts.nooverwrites,
# Order matters: most specific extractors first so that e.g. a playlist URL
# is not swallowed by the plain YouTube extractor.
1063 fd.add_info_extractor(youtube_search_ie)
1064 fd.add_info_extractor(youtube_pl_ie)
1065 fd.add_info_extractor(metacafe_ie)
1066 fd.add_info_extractor(youtube_ie)
1067 retcode = fd.download(all_urls)
1070 except DownloadError:
1072 except SameFileError:
1073 sys.exit(u'ERROR: fixed output name but more than one file to download')
1074 except KeyboardInterrupt:
1075 sys.exit(u'\nERROR: Interrupted by user')