2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # License: Public domain code
34 # parse_qs was moved from the cgi module to the urlparse module recently.
36 from urlparse import parse_qs
38 from cgi import parse_qs
41 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
42 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
43 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 'Accept-Encoding': 'gzip, deflate',
45 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in a "simplified" title: ASCII letters and digits,
# built as a unicode string (Python 2 str.decode). Used as a regex class
# by the extractors when deriving `stitle` from the real title.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	def yield_preferredencoding():
		pref = locale.getpreferredencoding()
		# NOTE(review): the try/except around the locale call and the
		# yield statement(s) are elided from this view -- presumably a
		# fallback encoding is yielded on locale errors; verify upstream.
	# Python 2 generator protocol: .next() (next(gen) in Python 3).
	return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference (&#160; or &#xA0;).
	# NOTE(review): \d does not match the hex digits a-f, so hexadecimal
	# entities containing letters (e.g. &#xe1;) never match this pattern
	# and fall through to the literal-representation branch -- looks like
	# a bug; verify against upstream history before changing.
	mobj = re.match(ur'(?u)#(x?\d+)', entity)
	# NOTE(review): the `if mobj is not None:` guard and the lines that
	# assign `base` (16 for the 'x' form, 10 otherwise) are elided here.
	numstr = mobj.group(1)
	if numstr.startswith(u'x'):
		# Prefix with '0' so long(numstr, 16) accepts the '0x...' spelling.
		numstr = u'0%s' % numstr
	return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
92 def sanitize_title(utitle):
93 """Sanitizes a video title so it could be used as part of a filename."""
94 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
95 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	if sys.platform == 'win32':
		# On Windows put stdout in binary mode so downloaded data is not
		# corrupted by CRLF translation.
		# NOTE(review): the surrounding `if filename == u'-':` / import of
		# msvcrt appear to be elided from this view -- verify.
		msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
		return (sys.stdout, filename)
	stream = open(filename, open_mode)
	return (stream, filename)
	# NOTE(review): the `try:` matching the handler below is elided.
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
def timeconvert(timestr):
	"""Convert an RFC 2822 date string into a Unix timestamp.

	Returns the timestamp as an integer, or None when the string cannot
	be parsed. (The visible original computed the timestamp but never
	returned it, so callers always got None.)
	"""
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		# mktime_tz honours the timezone offset parsed by parsedate_tz.
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when the file they downloaded is
	smaller than the size the server announced first, which indicates the
	connection was probably interrupted.
	"""

	def __init__(self, downloaded, expected):
		# Keep both byte counts so the caller can report the mismatch:
		# `expected` is the announced size, `downloaded` the actual one.
		self.expected = expected
		self.downloaded = downloaded
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	    http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	# NOTE(review): the `def deflate(data):` header and its try/except are
	# elided from this view; the two returns below are the raw-deflate
	# attempt (negative wbits) and the zlib-wrapped fallback.
	return zlib.decompress(data, -zlib.MAX_WBITS)
	return zlib.decompress(data)

	def addinfourl_wrapper(stream, headers, url, code):
		# Older urllib2.addinfourl lacks the `code` argument/getcode();
		# detect which variant is available and wrap accordingly.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		# NOTE(review): lines assigning ret.code and returning ret are
		# elided from this view.

	def http_request(self, req):
		# Add the standard headers to every outgoing request.
		# NOTE(review): a guard skipping headers the caller already set
		# appears to be elided between these two lines.
		for h in std_headers:
			req.add_header(h, std_headers[h])
		# Honor the internal no-compression marker and strip it before the
		# request goes on the wire (it is not a real HTTP header).
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']

	def http_response(self, req, resp):
		# NOTE(review): the `old_resp = resp` assignment is elided here.
		# gzip-encoded body: re-wrap the payload in a GzipFile reader.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate-encoded body: decompress with the helper above.
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:	Username for authentication purposes.
	password:	Password for authentication purposes.
	usenetrc:	Use netrc for authentication instead.
	quiet:		Do not print messages to stdout.
	forceurl:	Force printing final URL.
	forcetitle:	Force printing title.
	forcethumbnail:	Force printing thumbnail URL.
	forcedescription:	Force printing description.
	forcefilename:	Force printing final filename.
	simulate:	Do not download the video files.
	format:		Video format code.
	format_limit:	Highest quality format to try.
	outtmpl:	Template for output names.
	ignoreerrors:	Do not stop on download errors.
	ratelimit:	Download speed limit, in bytes/sec.
	nooverwrites:	Prevent overwriting files.
	retries:	Number of times to retry for HTTP error 5xx
	continuedl:	Try to continue downloads if possible.
	noprogress:	Do not print the progress bar.
	playliststart:	Playlist item to start at.
	playlistend:	Playlist item to end at.
	logtostderr:	Log messages to stderr instead of stdout.
	consoletitle:	Display progress in console window's titlebar.
	nopart:		Do not use temporary .part files.
	updatetime:	Use the Last-modified header to set output file timestamps.
	"""

	# Exit code accumulated across downloads; set to 1 by trouble() when an
	# ignored error occurs and returned by download().
	_download_retcode = None
	# Ordinal of the current download; feeds the %(autonumber)s template.
	_num_downloads = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		# NOTE(review): assignments of self.params / extractor and
		# post-processor lists appear to be elided from this view.
		self._download_retcode = 0
		self._num_downloads = 0
		# Screen messages go to stdout normally, stderr with 'logtostderr'.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build the list of progressively longer directory prefixes.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				# NOTE(review): the mkdir call is elided from this view.

	def format_bytes(bytes):
		"""Format a byte count as a short human-readable string (e.g. 1.25M)."""
		if type(bytes) is str:
			# NOTE(review): handling of string input is elided here.
		# Pick the largest power of 1024 that fits and its unit suffix.
		exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	def calc_percent(byte_counter, data_len):
		# Right-aligned percentage of data_len already downloaded.
		# NOTE(review): the guard for data_len being None is elided.
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	def calc_eta(start, now, total, current):
		# Estimated remaining time, formatted MM:SS.
		# NOTE(review): `dif = now - start` and the unknown-total guard are
		# elided from this view.
		if current == 0 or dif < 0.001: # One millisecond
			# NOTE(review): the "unknown ETA" return is elided here.
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		# NOTE(review): an overflow guard appears to be elided here.
		return '%02d:%02d' % (eta_mins, eta_secs)

	def calc_speed(start, now, bytes):
		# Average speed over the elapsed time, 10-char right-aligned field.
		# NOTE(review): `dif = now - start` is elided from this view.
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	def best_block_size(elapsed_time, bytes):
		# Adapt the next read size to the observed rate: at most double or
		# halve per step, clamped to [1 byte, 4 MB].
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			# NOTE(review): the fast-path return is elided here.
		rate = bytes / elapsed_time
		# NOTE(review): the comparison of `rate` against new_min/new_max
		# and the corresponding returns are elided from this view.

	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		# NOTE(review): the None check on matchobj is elided here.
		number = float(matchobj.group(1))
		# Empty suffix -> str.index('') == 0 -> multiplier 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		# NOTE(review): the append to the internal extractor list is elided
		# from this view; this is the "mutual registration" half:
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		# NOTE(review): the append to the post-processor chain is elided.
		pp.set_downloader(self)

	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		if not self.params.get('quiet', False):
			# Trailing comma on print suppresses the newline; the
			# terminator controls it explicitly instead.
			terminator = [u'\n', u''][skip_eol]
			print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		# NOTE(review): the `try:` matching this handler is elided.
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				# NOTE(review): the re-raise is elided from this view.

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			# NOTE(review): the early return is elided from this view.
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm title escape sequence: OSC 0 ; <title> BEL
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# "Fixed" = no %(field)s placeholders, i.e. every download would be
		# written to the same file (see download(), which raises
		# SameFileError in that case).
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Errors are ignored: remember the failure for the exit code.
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			# NOTE(review): the early return is elided from this view.
		# NOTE(review): `now = time.time()` and a non-positive-elapsed
		# guard are elided from this view.
		elapsed = now - start_time
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough to bring the average back under the
			# configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def temp_name(self, filename):
		"""Returns a temporary filename for the given filename."""
		# Write directly to the destination for stdout ('-'), when .part
		# files are disabled, or when the target exists and is not a
		# regular file (e.g. a FIFO).
		if self.params.get('nopart', False) or filename == u'-' or \
				(os.path.exists(filename) and not os.path.isfile(filename)):
			# NOTE(review): `return filename` is elided from this view.
		return filename + u'.part'
454 def undo_temp_name(self, filename):
455 if filename.endswith(u'.part'):
456 return filename[:-len(u'.part')]
	def try_rename(self, old_filename, new_filename):
		"""Rename the temporary file to its final name, reporting failures."""
		if old_filename == new_filename:
			# NOTE(review): the early return is elided from this view.
		# NOTE(review): the `try:` matching the handler below is elided.
		os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')

	def try_utime(self, filename, last_modified_hdr):
		"""Try to set the last-modified time of the given file."""
		if last_modified_hdr is None:
			# NOTE(review): the early return is elided from this view.
		if not os.path.isfile(filename):
			# NOTE(review): the early return is elided from this view.
		timestr = last_modified_hdr
		# NOTE(review): a type check on timestr appears to be elided here.
		filetime = timeconvert(timestr)
		# NOTE(review): the None check on filetime is elided here.
		# atime becomes "now", mtime the server-announced value.
		os.utime(filename,(time.time(), filetime))
	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			# NOTE(review): the early return is elided from this view.
		# Leading '\r' keeps the progress line in place on the terminal.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		# NOTE(review): the `try:` matching the handler below is elided.
		self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the unencodable filename.
			self.to_screen(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			# With the progress bar disabled, print an explicit line.
			self.to_screen(u'[download] Download completed')
		# NOTE(review): the else branch is elided from this view.

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1
	def prepare_filename(self, info_dict):
		"""Generate the output filename."""
		# NOTE(review): the `try:` matching the handler below is elided.
		template_dict = dict(info_dict)
		# Extra template fields: %(epoch)s (download time) and the
		# zero-padded %(autonumber)s ordinal.
		template_dict['epoch'] = unicode(long(time.time()))
		template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
		filename = self.params['outtmpl'] % template_dict
		# NOTE(review): `return filename` is elided from this view.
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			# NOTE(review): a `return None` appears to be elided here.
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings: machine-readable output for use as a
			# backend (URL/title/... go to real stdout).
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')
			# NOTE(review): the `return` ending simulate mode is elided.

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			# NOTE(review): the `return` is elided from this view.

		# NOTE(review): a `try:` is elided from this view.
		self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			# NOTE(review): the `return` is elided from this view.

		# NOTE(review): a `try:` is elided from this view.
		success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			# NOTE(review): the `return` is elided from this view.
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			# NOTE(review): the `return` is elided from this view.

		# NOTE(review): a success check and `try:` are elided before the
		# post-processing call.
		self.post_process(filename, info_dict)
		except (PostProcessingError), err:
			self.trouble(u'ERROR: postprocessing: %s' % str(err))
	def download(self, url_list):
		"""Download a given list of URLs."""
		if len(url_list) > 1 and self.fixed_template():
			# A template with no placeholders would write every URL to the
			# same file.
			raise SameFileError(self.params['outtmpl'])

		# NOTE(review): the `for url in url_list:` and the loop over the
		# registered InfoExtractors are elided from this view.
		suitable_found = False
		# Go to next InfoExtractor if not suitable
		if not ie.suitable(url):
			# NOTE(review): the `continue` is elided from this view.

		# Suitable InfoExtractor found
		suitable_found = True

		# Extract information from URL and process it
		# NOTE(review): the ie.extract(url) call is elided from this view.

		# Suitable InfoExtractor had been found; go to next URL
		# NOTE(review): the `break` is elided from this view.

		if not suitable_found:
			self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		# NOTE(review): the copy of ie_info into `info` and the loop over
		# the post-processor chain are elided from this view.
		info['filepath'] = filename
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to the rtmpdump binary."""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		# NOTE(review): a `try:` is elided from this view.
		subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			# NOTE(review): the failure return is elided from this view.

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# '-e' resumes; '-k 1' is added when the previous run ended
			# with exit code 1.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No progress was made on this retry.
				# NOTE(review): the loop-exit handling is elided here.
		# NOTE(review): the success check on retval is elided; the next two
		# lines are the success path.
		self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
		self.try_rename(tmpfilename, filename)
		# NOTE(review): the success return and the else branch introducing
		# the error path below are elided from this view.
		self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
		# NOTE(review): the failure return is elided from this view.
	def _do_download(self, filename, url, player_url):
		"""Download `url` to `filename` with resume, retry and rate-limit
		support; rtmp:// URLs are delegated to _download_with_rtmpdump."""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			# NOTE(review): the success return is elided from this view.

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		# NOTE(review): initialization of `stream` and `open_mode` is
		# elided from this view.

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		# NOTE(review): the else branch zeroing resume_len is elided.

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			# NOTE(review): switching open_mode to append is elided here.

		# NOTE(review): `count = 0` is elided from this view.
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			# NOTE(review): a `try:` is elided from this view.
			data = urllib2.urlopen(request)
			# NOTE(review): the success `break` is elided here.
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					# NOTE(review): the re-raise is elided from this view.
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					# NOTE(review): a `try:` is elided from this view.
					# Open the connection again without the range header
					data = urllib2.urlopen(basic_request)
					content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							# NOTE(review): the re-raise is elided here.
					# Examine the reported length
					if (content_length is not None and
							(resume_len - 100 < long(content_length) < resume_len + 100)):
						# The file had already been fully downloaded.
						# Explanation to the above condition: in issue #175 it was revealed that
						# YouTube sometimes adds or removes a few bytes from the end of the file,
						# changing the file size slightly and causing problems for some users. So
						# I decided to implement a suggested change and consider the file
						# completely downloaded if the file size differs less than 100 bytes from
						# the one in the hard drive.
						self.report_file_already_downloaded(filename)
						self.try_rename(tmpfilename, filename)
						# NOTE(review): the success return is elided here.
					# The length does not match, we start the download over
					self.report_unable_to_resume()
					# NOTE(review): resetting open_mode and the retry
					# bookkeeping (count increment) are elided here.
			self.report_retry(count, retries)

		# NOTE(review): the guard checking that all retries were exhausted
		# is elided before this error path.
		self.trouble(u'ERROR: giving up after %s retries' % retries)
		# NOTE(review): the failure return is elided from this view.

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			# Announced length covers only the remainder when resuming.
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		# NOTE(review): initial block_size, `start = time.time()`, the
		# read loop header and the per-iteration `before` timestamp are
		# elided from this view.

		# Download and write
		data_block = data.read(block_size)
		# NOTE(review): the `after` timestamp is elided here.
		if len(data_block) == 0:
			# End of stream.
			# NOTE(review): the loop `break` is elided from this view.
		byte_counter += len(data_block)

		# Open file just in time
		# NOTE(review): the `if stream is None:` lazy-open guard and its
		# `try:` are elided; opening lazily avoids creating empty files
		# for downloads that never produce data.
		(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
		filename = self.undo_temp_name(tmpfilename)
		self.report_destination(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
			# NOTE(review): the failure return is elided from this view.

		# NOTE(review): a `try:` is elided from this view.
		stream.write(data_block)
		except (IOError, OSError), err:
			self.trouble(u'\nERROR: unable to write data: %s' % str(err))
			# NOTE(review): the failure return is elided from this view.
		# Adapt the chunk size to the measured throughput.
		block_size = self.best_block_size(after - before, len(data_block))

		# Progress message
		percent_str = self.calc_percent(byte_counter, data_len)
		eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
		speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
		self.report_progress(percent_str, data_len_str, speed_str, eta_str)

		# Apply rate limit
		self.slow_down(start, byte_counter - resume_len)

		# NOTE(review): stream close / report_finish are elided here.
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		# NOTE(review): the success return is elided from this view.
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		# NOTE(review): initialization of the readiness flag is elided
		# from this view.
		self.set_downloader(downloader)

	# NOTE(review): the `def suitable(url)` header (a static method,
	# judging by its use as ie.suitable(url) elsewhere) is elided here.
		"""Receives a URL and returns True if suitable for this IE."""

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# NOTE(review): a run-once guard around this call is elided.
		self._real_initialize()

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		# NOTE(review): an initialization call appears to be elided here.
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Accepts youtu.be short links, /v|embed|e/ paths and watch URLs;
	# group 2 captures the video id (see _real_extract).
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps itag -> filename extension.
	_video_extensions = {
		# NOTE(review): most mappings and the dict close are elided from
		# this view.
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever

	# NOTE(review): the `def suitable(url)` header is elided here.
		return (re.match(YoutubeIE._VALID_URL, url) is not None)
	# Thin status-reporting helpers: each one prints a single progress
	# line through the registered downloader.

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')
	def _real_initialize(self):
		"""Set the site language, then log in and confirm age when
		credentials are available (from params or .netrc)."""
		if self._downloader is None:
			# NOTE(review): the early return is elided from this view.

		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			# NOTE(review): a `try:` is elided from this view.
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			# NOTE(review): unpacking of username/password from `info` and
			# the None check that guards the raise below are elided.
			raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				# NOTE(review): the `return` is elided from this view.

		# Set language
		request = urllib2.Request(self._LANG_URL)
		# NOTE(review): a `try:` and the report_lang() call are elided.
		urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			# Language failures are non-fatal: only a warning is printed.
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			# NOTE(review): the `return` is elided from this view.

		# No authentication to be performed
		# NOTE(review): the username-is-None early return is elided here.

		# Log in
		# NOTE(review): the `login_form = {` opener is elided.
			'current_form': 'loginForm',
			# NOTE(review): additional form fields are elided here.
			'action_login': 'Log In',
			'username': username,
			'password': password,
		# NOTE(review): the dict close is elided.
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		# NOTE(review): a `try:` and the report_login() call are elided.
		login_results = urllib2.urlopen(request).read()
		if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
			# The login form being served again means the credentials
			# were rejected.
			self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
			# NOTE(review): the `return` is elided from this view.
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			# NOTE(review): the `return` is elided from this view.

		# Confirm age
		# NOTE(review): the `age_form = {` opener is elided.
			'action_confirm': 'Confirm',
		# NOTE(review): the dict close is elided.
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		# NOTE(review): a `try:` is elided from this view.
		self.report_age_confirmation()
		age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			# Unlike login, failure to confirm age is treated as an error.
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			# NOTE(review): the `return` is elided from this view.
984 def _real_extract(self, url):
985 # Extract video id from URL
986 mobj = re.match(self._VALID_URL, url)
988 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
990 video_id = mobj.group(2)
993 self.report_video_webpage_download(video_id)
994 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
996 video_webpage = urllib2.urlopen(request).read()
997 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
998 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1001 # Attempt to extract SWF player URL
1002 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1003 if mobj is not None:
1004 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1009 self.report_video_info_webpage_download(video_id)
1010 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1011 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1012 % (video_id, el_type))
1013 request = urllib2.Request(video_info_url)
1015 video_info_webpage = urllib2.urlopen(request).read()
1016 video_info = parse_qs(video_info_webpage)
1017 if 'token' in video_info:
1019 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1020 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1022 if 'token' not in video_info:
1023 if 'reason' in video_info:
1024 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1026 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1029 # Start extracting information
1030 self.report_information_extraction(video_id)
1033 if 'author' not in video_info:
1034 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1036 video_uploader = urllib.unquote_plus(video_info['author'][0])
1039 if 'title' not in video_info:
1040 self._downloader.trouble(u'ERROR: unable to extract video title')
1042 video_title = urllib.unquote_plus(video_info['title'][0])
1043 video_title = video_title.decode('utf-8')
1044 video_title = sanitize_title(video_title)
1047 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1048 simple_title = simple_title.strip(ur'_')
1051 if 'thumbnail_url' not in video_info:
1052 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1053 video_thumbnail = ''
1054 else: # don't panic if we can't find it
1055 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1059 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1060 if mobj is not None:
1061 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1062 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1063 for expression in format_expressions:
1065 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1070 video_description = 'No description available.'
1071 if self._downloader.params.get('forcedescription', False):
1072 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1073 if mobj is not None:
1074 video_description = mobj.group(1)
1077 video_token = urllib.unquote_plus(video_info['token'][0])
1079 # Decide which formats to download
1080 req_format = self._downloader.params.get('format', None)
1082 if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1083 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1084 url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1085 url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1086 format_limit = self._downloader.params.get('format_limit', None)
1087 if format_limit is not None and format_limit in self._available_formats:
1088 format_list = self._available_formats[self._available_formats.index(format_limit):]
1090 format_list = self._available_formats
1091 existing_formats = [x for x in format_list if x in url_map]
1092 if len(existing_formats) == 0:
1093 self._downloader.trouble(u'ERROR: no known formats available for video')
1095 if req_format is None:
1096 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1097 elif req_format == '-1':
1098 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1101 if req_format not in url_map:
1102 self._downloader.trouble(u'ERROR: requested format not available')
1104 video_url_list = [(req_format, url_map[req_format])] # Specific format
1106 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1107 self.report_rtmp_download()
1108 video_url_list = [(None, video_info['conn'][0])]
1111 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1114 for format_param, video_real_url in video_url_list:
1115 # At this point we have a new video
1116 self._downloader.increment_downloads()
1119 video_extension = self._video_extensions.get(format_param, 'flv')
1121 # Find the video URL in fmt_url_map or conn paramters
1123 # Process video information
1124 self._downloader.process_info({
1125 'id': video_id.decode('utf-8'),
1126 'url': video_real_url.decode('utf-8'),
1127 'uploader': video_uploader.decode('utf-8'),
1128 'upload_date': upload_date,
1129 'title': video_title,
1130 'stitle': simple_title,
1131 'ext': video_extension.decode('utf-8'),
1132 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1133 'thumbnail': video_thumbnail.decode('utf-8'),
1134 'description': video_description.decode('utf-8'),
1135 'player_url': player_url,
1137 except UnavailableVideoError, err:
1138 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this excerpt has elided lines (the embedded numbering skips),
# so guards such as "if mobj is None:", "try:" and "return" are missing here.
1141 class MetacafeIE(InfoExtractor):
1142 """Information Extractor for metacafe.com."""
# group(1): video id; group(2): URL slug reused as the simplified title.
1144 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1145 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1146 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1149 def __init__(self, youtube_ie, downloader=None):
1150 InfoExtractor.__init__(self, downloader)
# Delegate extractor used for metacafe ids of the form "yt-<youtube id>".
1151 self._youtube_ie = youtube_ie
# (the enclosing 'suitable(url)' def line is elided from this view)
1155 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1157 def report_disclaimer(self):
1158 """Report disclaimer retrieval."""
1159 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1161 def report_age_confirmation(self):
1162 """Report attempt to confirm age."""
1163 self._downloader.to_screen(u'[metacafe] Confirming age')
1165 def report_download_webpage(self, video_id):
1166 """Report webpage download."""
1167 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1169 def report_extraction(self, video_id):
1170 """Report information extraction."""
1171 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1173 def _real_initialize(self):
1174 # Retrieve disclaimer
1175 request = urllib2.Request(self._DISCLAIMER)
1177 self.report_disclaimer()
1178 disclaimer = urllib2.urlopen(request).read()
# Python 2 comma-style except clause: err is the caught exception instance.
1179 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1180 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# POST the family-filter form to confirm age before any extraction.
1186 'submit': "Continue - I'm over 18",
1188 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1190 self.report_age_confirmation()
1191 disclaimer = urllib2.urlopen(request).read()
1192 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1193 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1196 def _real_extract(self, url):
1197 # Extract id and simplified title from URL
1198 mobj = re.match(self._VALID_URL, url)
1200 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1203 video_id = mobj.group(1)
1205 # Check if video comes from YouTube
1206 mobj2 = re.match(r'^yt-(.*)$', video_id)
1207 if mobj2 is not None:
# Hand embedded YouTube videos off to the dedicated YouTube extractor.
1208 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1211 # At this point we have a new video
1212 self._downloader.increment_downloads()
1214 simple_title = mobj.group(2).decode('utf-8')
1216 # Retrieve video webpage to extract further information
1217 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1219 self.report_download_webpage(video_id)
1220 webpage = urllib2.urlopen(request).read()
1221 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1222 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1225 # Extract URL, uploader and title from webpage
1226 self.report_extraction(video_id)
1227 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1228 if mobj is not None:
1229 mediaURL = urllib.unquote(mobj.group(1))
# Last three characters of the media URL are taken as the file extension.
1230 video_extension = mediaURL[-3:]
1232 # Extract gdaKey if available
1233 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1235 video_url = mediaURL
1237 gdaKey = mobj.group(1)
# Append the access key some metacafe media URLs require.
1238 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: no direct mediaURL — parse the flashvars query string.
1240 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1242 self._downloader.trouble(u'ERROR: unable to extract media URL')
1244 vardict = parse_qs(mobj.group(1))
1245 if 'mediaData' not in vardict:
1246 self._downloader.trouble(u'ERROR: unable to extract media URL')
1248 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1250 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape the JSON-style forward slashes.
1252 mediaURL = mobj.group(1).replace('\\/', '/')
1253 video_extension = mediaURL[-3:]
1254 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1256 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1258 self._downloader.trouble(u'ERROR: unable to extract title')
1260 video_title = mobj.group(1).decode('utf-8')
1261 video_title = sanitize_title(video_title)
1263 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1265 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1267 video_uploader = mobj.group(1)
1270 # Process video information
1271 self._downloader.process_info({
1272 'id': video_id.decode('utf-8'),
1273 'url': video_url.decode('utf-8'),
1274 'uploader': video_uploader.decode('utf-8'),
1275 'upload_date': u'NA',
1276 'title': video_title,
1277 'stitle': simple_title,
1278 'ext': video_extension.decode('utf-8'),
1282 except UnavailableVideoError:
1283 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt has elided lines (embedded numbering skips); guard
# lines such as "if mobj is None:" / "try:" / "return" are missing from view.
1286 class DailymotionIE(InfoExtractor):
1287 """Information Extractor for Dailymotion"""
# Matches any dailymotion TLD; group(1): video id, group(2): title slug.
1289 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1291 def __init__(self, downloader=None):
1292 InfoExtractor.__init__(self, downloader)
1296 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1298 def report_download_webpage(self, video_id):
1299 """Report webpage download."""
1300 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1302 def report_extraction(self, video_id):
1303 """Report information extraction."""
1304 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No initialization needed for this extractor.
1306 def _real_initialize(self):
1309 def _real_extract(self, url):
1310 # Extract id and simplified title from URL
1311 mobj = re.match(self._VALID_URL, url)
1313 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1316 # At this point we have a new video
1317 self._downloader.increment_downloads()
1318 video_id = mobj.group(1)
# The URL slug doubles as the sanitized ("simple") title.
1320 simple_title = mobj.group(2).decode('utf-8')
1321 video_extension = 'flv'
1323 # Retrieve video webpage to extract further information
1324 request = urllib2.Request(url)
1326 self.report_download_webpage(video_id)
1327 webpage = urllib2.urlopen(request).read()
1328 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1329 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1332 # Extract URL, uploader and title from webpage
1333 self.report_extraction(video_id)
# The Flash player receives the media URL via addVariable("video", ...).
1334 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1336 self._downloader.trouble(u'ERROR: unable to extract media URL')
1338 mediaURL = urllib.unquote(mobj.group(1))
1340 # if needed add http://www.dailymotion.com/ if relative URL
1342 video_url = mediaURL
1344 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1345 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1347 self._downloader.trouble(u'ERROR: unable to extract title')
1349 video_title = mobj.group(1).decode('utf-8')
1350 video_title = sanitize_title(video_title)
1352 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1354 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1356 video_uploader = mobj.group(1)
1359 # Process video information
1360 self._downloader.process_info({
1361 'id': video_id.decode('utf-8'),
1362 'url': video_url.decode('utf-8'),
1363 'uploader': video_uploader.decode('utf-8'),
1364 'upload_date': u'NA',
1365 'title': video_title,
1366 'stitle': simple_title,
1367 'ext': video_extension.decode('utf-8'),
1371 except UnavailableVideoError:
1372 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt has elided lines (embedded numbering skips); guard
# lines such as "if mobj is None:" / "try:" / "return" are missing from view.
1374 class GoogleIE(InfoExtractor):
1375 """Information extractor for video.google.com."""
# group(1): the docid query parameter; many national TLDs are accepted.
1377 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1379 def __init__(self, downloader=None):
1380 InfoExtractor.__init__(self, downloader)
1384 return (re.match(GoogleIE._VALID_URL, url) is not None)
1386 def report_download_webpage(self, video_id):
1387 """Report webpage download."""
1388 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1390 def report_extraction(self, video_id):
1391 """Report information extraction."""
1392 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No initialization needed for this extractor.
1394 def _real_initialize(self):
1397 def _real_extract(self, url):
1398 # Extract id from URL
1399 mobj = re.match(self._VALID_URL, url)
1401 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1404 # At this point we have a new video
1405 self._downloader.increment_downloads()
1406 video_id = mobj.group(1)
1408 video_extension = 'mp4'
1410 # Retrieve video webpage to extract further information
1411 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1413 self.report_download_webpage(video_id)
1414 webpage = urllib2.urlopen(request).read()
1415 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1416 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1419 # Extract URL, uploader, and title from webpage
1420 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the flv stream URL below.
1421 mobj = re.search(r"download_url:'([^']+)'", webpage)
1423 video_extension = 'flv'
1424 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1426 self._downloader.trouble(u'ERROR: unable to extract media URL')
1428 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JavaScript hex escapes for '=' (\x3d) and '&' (\x26).
1429 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1430 mediaURL = mediaURL.replace('\\x26', '\x26')
1432 video_url = mediaURL
1434 mobj = re.search(r'<title>(.*)</title>', webpage)
1436 self._downloader.trouble(u'ERROR: unable to extract title')
1438 video_title = mobj.group(1).decode('utf-8')
1439 video_title = sanitize_title(video_title)
1440 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1442 # Extract video description
1443 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1445 self._downloader.trouble(u'ERROR: unable to extract video description')
1447 video_description = mobj.group(1).decode('utf-8')
1448 if not video_description:
1449 video_description = 'No description available.'
1451 # Extract video thumbnail
1452 if self._downloader.params.get('forcethumbnail', False):
# Thumbnail is scraped from a search-results page for this docid;
# abs(int(...)) presumably normalizes a signed docid — TODO confirm.
1453 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1455 webpage = urllib2.urlopen(request).read()
1456 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1457 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1459 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1461 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1463 video_thumbnail = mobj.group(1)
1464 else: # we need something to pass to process_info
1465 video_thumbnail = ''
1469 # Process video information
1470 self._downloader.process_info({
1471 'id': video_id.decode('utf-8'),
1472 'url': video_url.decode('utf-8'),
# NOTE(review): the 'uploader' entry appears to be on an elided line here.
1474 'upload_date': u'NA',
1475 'title': video_title,
1476 'stitle': simple_title,
1477 'ext': video_extension.decode('utf-8'),
1481 except UnavailableVideoError:
1482 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt has elided lines (embedded numbering skips); guard
# lines such as "if mobj is None:" / "try:" / "return" are missing from view.
1485 class PhotobucketIE(InfoExtractor):
1486 """Information extractor for photobucket.com."""
# group(1): the .flv filename given in the 'current' query parameter.
1488 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1490 def __init__(self, downloader=None):
1491 InfoExtractor.__init__(self, downloader)
1495 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1497 def report_download_webpage(self, video_id):
1498 """Report webpage download."""
1499 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1501 def report_extraction(self, video_id):
1502 """Report information extraction."""
1503 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No initialization needed for this extractor.
1505 def _real_initialize(self):
1508 def _real_extract(self, url):
1509 # Extract id from URL
1510 mobj = re.match(self._VALID_URL, url)
1512 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1515 # At this point we have a new video
1516 self._downloader.increment_downloads()
1517 video_id = mobj.group(1)
1519 video_extension = 'flv'
1521 # Retrieve video webpage to extract further information
1522 request = urllib2.Request(url)
1524 self.report_download_webpage(video_id)
1525 webpage = urllib2.urlopen(request).read()
1526 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1527 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1530 # Extract URL, uploader, and title from webpage
1531 self.report_extraction(video_id)
# Media URL lives in the video_src <link> element's file= parameter.
1532 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1534 self._downloader.trouble(u'ERROR: unable to extract media URL')
1536 mediaURL = urllib.unquote(mobj.group(1))
1538 video_url = mediaURL
# The page <title> yields both the video title and the uploader name.
1540 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1542 self._downloader.trouble(u'ERROR: unable to extract title')
1544 video_title = mobj.group(1).decode('utf-8')
1545 video_title = sanitize_title(video_title)
1546 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1548 video_uploader = mobj.group(2).decode('utf-8')
1551 # Process video information
1552 self._downloader.process_info({
1553 'id': video_id.decode('utf-8'),
1554 'url': video_url.decode('utf-8'),
1555 'uploader': video_uploader,
1556 'upload_date': u'NA',
1557 'title': video_title,
1558 'stitle': simple_title,
1559 'ext': video_extension.decode('utf-8'),
1563 except UnavailableVideoError:
1564 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt has elided lines (embedded numbering skips); guard
# lines such as "if mobj is None:" / "try:" / "return" are missing from view.
1567 class YahooIE(InfoExtractor):
1568 """Information extractor for video.yahoo.com."""
1570 # _VALID_URL matches all Yahoo! Video URLs
1571 # _VPAGE_URL matches only the extractable '/watch/' URLs
# group(1): yahoo id, group(2): vid — see the rewrite step in _real_extract.
1572 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1573 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1575 def __init__(self, downloader=None):
1576 InfoExtractor.__init__(self, downloader)
1580 return (re.match(YahooIE._VALID_URL, url) is not None)
1582 def report_download_webpage(self, video_id):
1583 """Report webpage download."""
1584 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1586 def report_extraction(self, video_id):
1587 """Report information extraction."""
1588 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No initialization needed for this extractor.
1590 def _real_initialize(self):
# new_video=False marks the recursive call after URL rewriting below.
1593 def _real_extract(self, url, new_video=True):
1594 # Extract ID from URL
1595 mobj = re.match(self._VALID_URL, url)
1597 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1600 # At this point we have a new video
1601 self._downloader.increment_downloads()
1602 video_id = mobj.group(2)
1603 video_extension = 'flv'
1605 # Rewrite valid but non-extractable URLs as
1606 # extractable English language /watch/ URLs
1607 if re.match(self._VPAGE_URL, url) is None:
1608 request = urllib2.Request(url)
1610 webpage = urllib2.urlopen(request).read()
1611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Scrape the canonical id/vid pair out of the page's JavaScript.
1615 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1617 self._downloader.trouble(u'ERROR: Unable to extract id field')
1619 yahoo_id = mobj.group(1)
1621 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1623 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1625 yahoo_vid = mobj.group(1)
# Recurse once on the rewritten canonical /watch/ URL.
1627 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1628 return self._real_extract(url, new_video=False)
1630 # Retrieve video webpage to extract further information
1631 request = urllib2.Request(url)
1633 self.report_download_webpage(video_id)
1634 webpage = urllib2.urlopen(request).read()
1635 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1636 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1639 # Extract uploader and title from webpage
1640 self.report_extraction(video_id)
1641 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1643 self._downloader.trouble(u'ERROR: unable to extract video title')
1645 video_title = mobj.group(1).decode('utf-8')
1646 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1648 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1650 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) is the 'people|profile' alternation, not the
# display name — group(2) looks like the intended capture; confirm.
1652 video_uploader = mobj.group(1).decode('utf-8')
1654 # Extract video thumbnail
1655 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1657 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1659 video_thumbnail = mobj.group(1).decode('utf-8')
1661 # Extract video description
1662 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1664 self._downloader.trouble(u'ERROR: unable to extract video description')
1666 video_description = mobj.group(1).decode('utf-8')
1667 if not video_description: video_description = 'No description available.'
1669 # Extract video height and width
1670 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1672 self._downloader.trouble(u'ERROR: unable to extract video height')
1674 yv_video_height = mobj.group(1)
1676 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1678 self._downloader.trouble(u'ERROR: unable to extract video width')
1680 yv_video_width = mobj.group(1)
1682 # Retrieve video playlist to extract media URL
1683 # I'm not completely sure what all these options are, but we
1684 # seem to need most of them, otherwise the server sends a 401.
1685 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1686 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1687 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1688 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1689 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1691 self.report_download_webpage(video_id)
1692 webpage = urllib2.urlopen(request).read()
1693 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1694 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1697 # Extract media URL from playlist XML
1698 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1700 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1702 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Decode any HTML entities embedded in the playlist URL.
1703 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1706 # Process video information
1707 self._downloader.process_info({
1708 'id': video_id.decode('utf-8'),
1710 'uploader': video_uploader,
1711 'upload_date': u'NA',
1712 'title': video_title,
1713 'stitle': simple_title,
1714 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later (non-decoded) entries silently win in Python.
1715 'thumbnail': video_thumbnail.decode('utf-8'),
1716 'description': video_description,
1717 'thumbnail': video_thumbnail,
1718 'description': video_description,
1721 except UnavailableVideoError:
1722 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt has elided lines (embedded numbering skips); guard
# lines such as "if mobj is None:" / "try:" / "return" are missing from view.
1725 class VimeoIE(InfoExtractor):
1726 """Information extractor for vimeo.com."""
1728 # _VALID_URL matches Vimeo URLs
# group(1): numeric clip id. NOTE(review): the '.' after (?:www|player)
# is unescaped, so it matches any character — likely meant '\.'.
1729 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1731 def __init__(self, downloader=None):
1732 InfoExtractor.__init__(self, downloader)
1736 return (re.match(VimeoIE._VALID_URL, url) is not None)
1738 def report_download_webpage(self, video_id):
1739 """Report webpage download."""
1740 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1742 def report_extraction(self, video_id):
1743 """Report information extraction."""
1744 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# No initialization needed for this extractor.
1746 def _real_initialize(self):
1749 def _real_extract(self, url, new_video=True):
1750 # Extract ID from URL
1751 mobj = re.match(self._VALID_URL, url)
1753 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1756 # At this point we have a new video
1757 self._downloader.increment_downloads()
1758 video_id = mobj.group(1)
1760 # Retrieve video webpage to extract further information
# The moogaloop XML endpoint returns the clip metadata parsed below.
1761 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1763 self.report_download_webpage(video_id)
1764 webpage = urllib2.urlopen(request).read()
1765 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1766 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1769 # Now we begin extracting as much information as we can from what we
1770 # retrieved. First we extract the information common to all extractors,
1771 # and latter we extract those that are Vimeo specific.
1772 self.report_extraction(video_id)
1775 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1777 self._downloader.trouble(u'ERROR: unable to extract video title')
1779 video_title = mobj.group(1).decode('utf-8')
1780 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1783 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1785 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1787 video_uploader = mobj.group(1).decode('utf-8')
1789 # Extract video thumbnail
1790 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1792 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1794 video_thumbnail = mobj.group(1).decode('utf-8')
1796 # # Extract video description
1797 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1799 # self._downloader.trouble(u'ERROR: unable to extract video description')
1801 # video_description = mobj.group(1).decode('utf-8')
1802 # if not video_description: video_description = 'No description available.'
# Placeholder while the real description extraction above is disabled.
1803 video_description = 'Foo.'
1805 # Vimeo specific: extract request signature
1806 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1808 self._downloader.trouble(u'ERROR: unable to extract request signature')
1810 sig = mobj.group(1).decode('utf-8')
1812 # Vimeo specific: Extract request signature expiration
1813 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
1815 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
1817 sig_exp = mobj.group(1).decode('utf-8')
# Build the signed play URL from clip id + signature + expiry.
1819 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
1822 # Process video information
1823 self._downloader.process_info({
1824 'id': video_id.decode('utf-8'),
1826 'uploader': video_uploader,
1827 'upload_date': u'NA',
1828 'title': video_title,
1829 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later (non-decoded) entries silently win in Python.
1831 'thumbnail': video_thumbnail.decode('utf-8'),
1832 'description': video_description,
1833 'thumbnail': video_thumbnail,
1834 'description': video_description,
1837 except UnavailableVideoError:
1838 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): excerpt has elided lines (embedded numbering skips); guard
# lines such as "if mobj is None:" / "try:" / "return" are missing from view.
1841 class GenericIE(InfoExtractor):
1842 """Generic last-resort information extractor."""
1844 def __init__(self, downloader=None):
1845 InfoExtractor.__init__(self, downloader)
1851 def report_download_webpage(self, video_id):
1852 """Report webpage download."""
# Warn loudly: this extractor is only tried when nothing else matched.
1853 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1854 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1856 def report_extraction(self, video_id):
1857 """Report information extraction."""
1858 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# No initialization needed for this extractor.
1860 def _real_initialize(self):
1863 def _real_extract(self, url):
1864 # At this point we have a new video
1865 self._downloader.increment_downloads()
# Provisional id: last URL path component (refined after URL extraction).
1867 video_id = url.split('/')[-1]
1868 request = urllib2.Request(url)
1870 self.report_download_webpage(video_id)
1871 webpage = urllib2.urlopen(request).read()
1872 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1873 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1875 except ValueError, err:
1876 # since this is the last-resort InfoExtractor, if
1877 # this error is thrown, it'll be thrown here
1878 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1881 self.report_extraction(video_id)
1882 # Start with something easy: JW Player in SWFObject
1883 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1885 # Broaden the search a little bit
1886 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1888 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1891 # It's possible that one of the regexes
1892 # matched, but returned an empty group:
1893 if mobj.group(1) is None:
1894 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1897 video_url = urllib.unquote(mobj.group(1))
# Refine the id from the discovered media URL's basename.
1898 video_id = os.path.basename(video_url)
1900 # here's a fun little line of code for you:
1901 video_extension = os.path.splitext(video_id)[1][1:]
1902 video_id = os.path.splitext(video_id)[0]
1904 # it's tempting to parse this further, but you would
1905 # have to take into account all the variations like
1906 # Video Title - Site Name
1907 # Site Name | Video Title
1908 # Video Title - Tagline | Site Name
1909 # and so on and so forth; it's just not practical
1910 mobj = re.search(r'<title>(.*)</title>', webpage)
1912 self._downloader.trouble(u'ERROR: unable to extract title')
1914 video_title = mobj.group(1).decode('utf-8')
1915 video_title = sanitize_title(video_title)
1916 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1918 # video uploader is domain name
1919 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1921 self._downloader.trouble(u'ERROR: unable to extract title')
1923 video_uploader = mobj.group(1).decode('utf-8')
1926 # Process video information
1927 self._downloader.process_info({
1928 'id': video_id.decode('utf-8'),
1929 'url': video_url.decode('utf-8'),
1930 'uploader': video_uploader,
1931 'upload_date': u'NA',
1932 'title': video_title,
1933 'stitle': simple_title,
1934 'ext': video_extension.decode('utf-8'),
1938 except UnavailableVideoError, err:
1939 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): excerpt has elided lines (embedded numbering skips); guard
# lines such as "if mobj is None:" / "try:" / "return" are missing from view.
1942 class YoutubeSearchIE(InfoExtractor):
1943 """Information Extractor for YouTube search queries."""
# 'ytsearch:Q' downloads 1 result, 'ytsearchN:Q' N results, 'ytsearchall:Q'
# up to _max_youtube_results.
1944 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1945 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1946 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1947 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1949 _max_youtube_results = 1000
1951 def __init__(self, youtube_ie, downloader=None):
1952 InfoExtractor.__init__(self, downloader)
# Each search hit is handed to this extractor for the actual download.
1953 self._youtube_ie = youtube_ie
1957 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1959 def report_download_page(self, query, pagenum):
1960 """Report attempt to download playlist page with given number."""
1961 query = query.decode(preferredencoding())
1962 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1964 def _real_initialize(self):
1965 self._youtube_ie.initialize()
1967 def _real_extract(self, query):
1968 mobj = re.match(self._VALID_QUERY, query)
1970 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is 'ytsearch', 'ytsearchN' or 'ytsearchall'; query is the terms.
1973 prefix, query = query.split(':')
1975 query = query.encode('utf-8')
1977 self._download_n_results(query, 1)
1979 elif prefix == 'all':
1980 self._download_n_results(query, self._max_youtube_results)
1986 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1988 elif n > self._max_youtube_results:
# Clamp oversized requests instead of failing.
1989 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1990 n = self._max_youtube_results
1991 self._download_n_results(query, n)
1993 except ValueError: # parsing prefix as integer fails
1994 self._download_n_results(query, 1)
1997 def _download_n_results(self, query, n):
1998 """Downloads a specified number of results for a query"""
# Track ids already queued so repeated hits on later pages are skipped.
2001 already_seen = set()
2005 self.report_download_page(query, pagenum)
2006 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2007 request = urllib2.Request(result_url)
2009 page = urllib2.urlopen(request).read()
2010 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2011 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2014 # Extract video identifiers
2015 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Matched text is href="/watch?v=ID"; split on '=' and drop the
# trailing quote to recover the bare video id.
2016 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2017 if video_id not in already_seen:
2018 video_ids.append(video_id)
2019 already_seen.add(video_id)
2020 if len(video_ids) == n:
2021 # Specified n videos reached
2022 for id in video_ids:
2023 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link on this page: drain what we have and stop paging.
2026 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2027 for id in video_ids:
2028 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2031 pagenum = pagenum + 1
2033 class GoogleSearchIE(InfoExtractor):
2034 """Information Extractor for Google Video search queries."""
2035 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2036 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2037 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2038 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2040 _max_google_results = 1000
2042 def __init__(self, google_ie, downloader=None):
2043 InfoExtractor.__init__(self, downloader)
2044 self._google_ie = google_ie
2048 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2050 def report_download_page(self, query, pagenum):
2051 """Report attempt to download playlist page with given number."""
2052 query = query.decode(preferredencoding())
2053 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2055 def _real_initialize(self):
2056 self._google_ie.initialize()
2058 def _real_extract(self, query):
2059 mobj = re.match(self._VALID_QUERY, query)
2061 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2064 prefix, query = query.split(':')
2066 query = query.encode('utf-8')
2068 self._download_n_results(query, 1)
2070 elif prefix == 'all':
2071 self._download_n_results(query, self._max_google_results)
2077 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2079 elif n > self._max_google_results:
2080 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2081 n = self._max_google_results
2082 self._download_n_results(query, n)
2084 except ValueError: # parsing prefix as integer fails
2085 self._download_n_results(query, 1)
2088 def _download_n_results(self, query, n):
2089 """Downloads a specified number of results for a query"""
2092 already_seen = set()
2096 self.report_download_page(query, pagenum)
2097 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2098 request = urllib2.Request(result_url)
2100 page = urllib2.urlopen(request).read()
2101 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2102 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2105 # Extract video identifiers
2106 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2107 video_id = mobj.group(1)
2108 if video_id not in already_seen:
2109 video_ids.append(video_id)
2110 already_seen.add(video_id)
2111 if len(video_ids) == n:
2112 # Specified n videos reached
2113 for id in video_ids:
2114 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2117 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2118 for id in video_ids:
2119 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2122 pagenum = pagenum + 1
2124 class YahooSearchIE(InfoExtractor):
2125 """Information Extractor for Yahoo! Video search queries."""
2126 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2127 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2128 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2129 _MORE_PAGES_INDICATOR = r'\s*Next'
2131 _max_yahoo_results = 1000
2133 def __init__(self, yahoo_ie, downloader=None):
2134 InfoExtractor.__init__(self, downloader)
2135 self._yahoo_ie = yahoo_ie
2139 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2141 def report_download_page(self, query, pagenum):
2142 """Report attempt to download playlist page with given number."""
2143 query = query.decode(preferredencoding())
2144 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2146 def _real_initialize(self):
2147 self._yahoo_ie.initialize()
2149 def _real_extract(self, query):
2150 mobj = re.match(self._VALID_QUERY, query)
2152 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2155 prefix, query = query.split(':')
2157 query = query.encode('utf-8')
2159 self._download_n_results(query, 1)
2161 elif prefix == 'all':
2162 self._download_n_results(query, self._max_yahoo_results)
2168 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2170 elif n > self._max_yahoo_results:
2171 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2172 n = self._max_yahoo_results
2173 self._download_n_results(query, n)
2175 except ValueError: # parsing prefix as integer fails
2176 self._download_n_results(query, 1)
2179 def _download_n_results(self, query, n):
2180 """Downloads a specified number of results for a query"""
2183 already_seen = set()
2187 self.report_download_page(query, pagenum)
2188 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2189 request = urllib2.Request(result_url)
2191 page = urllib2.urlopen(request).read()
2192 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2193 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2196 # Extract video identifiers
2197 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2198 video_id = mobj.group(1)
2199 if video_id not in already_seen:
2200 video_ids.append(video_id)
2201 already_seen.add(video_id)
2202 if len(video_ids) == n:
2203 # Specified n videos reached
2204 for id in video_ids:
2205 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2208 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2209 for id in video_ids:
2210 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2213 pagenum = pagenum + 1
2215 class YoutubePlaylistIE(InfoExtractor):
2216 """Information Extractor for YouTube playlists."""
2218 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2219 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2220 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2221 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2224 def __init__(self, youtube_ie, downloader=None):
2225 InfoExtractor.__init__(self, downloader)
2226 self._youtube_ie = youtube_ie
2230 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2232 def report_download_page(self, playlist_id, pagenum):
2233 """Report attempt to download playlist page with given number."""
2234 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2236 def _real_initialize(self):
2237 self._youtube_ie.initialize()
2239 def _real_extract(self, url):
2240 # Extract playlist id
2241 mobj = re.match(self._VALID_URL, url)
2243 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2247 if mobj.group(3) is not None:
2248 self._youtube_ie.extract(mobj.group(3))
2251 # Download playlist pages
2252 # prefix is 'p' as default for playlists but there are other types that need extra care
2253 playlist_prefix = mobj.group(1)
2254 if playlist_prefix == 'a':
2255 playlist_access = 'artist'
2257 playlist_prefix = 'p'
2258 playlist_access = 'view_play_list'
2259 playlist_id = mobj.group(2)
2264 self.report_download_page(playlist_id, pagenum)
2265 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2267 page = urllib2.urlopen(request).read()
2268 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2269 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2272 # Extract video identifiers
2274 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2275 if mobj.group(1) not in ids_in_page:
2276 ids_in_page.append(mobj.group(1))
2277 video_ids.extend(ids_in_page)
2279 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2281 pagenum = pagenum + 1
2283 playliststart = self._downloader.params.get('playliststart', 1) - 1
2284 playlistend = self._downloader.params.get('playlistend', -1)
2285 video_ids = video_ids[playliststart:playlistend]
2287 for id in video_ids:
2288 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2291 class YoutubeUserIE(InfoExtractor):
2292 """Information Extractor for YouTube users."""
2294 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2295 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2296 _GDATA_PAGE_SIZE = 50
2297 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2298 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2301 def __init__(self, youtube_ie, downloader=None):
2302 InfoExtractor.__init__(self, downloader)
2303 self._youtube_ie = youtube_ie
2307 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2309 def report_download_page(self, username, start_index):
2310 """Report attempt to download user page."""
2311 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2312 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2314 def _real_initialize(self):
2315 self._youtube_ie.initialize()
2317 def _real_extract(self, url):
2319 mobj = re.match(self._VALID_URL, url)
2321 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2324 username = mobj.group(1)
2326 # Download video ids using YouTube Data API. Result size per
2327 # query is limited (currently to 50 videos) so we need to query
2328 # page by page until there are no video ids - it means we got
2335 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2336 self.report_download_page(username, start_index)
2338 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2341 page = urllib2.urlopen(request).read()
2342 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2343 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2346 # Extract video identifiers
2349 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2350 if mobj.group(1) not in ids_in_page:
2351 ids_in_page.append(mobj.group(1))
2353 video_ids.extend(ids_in_page)
2355 # A little optimization - if current page is not
2356 # "full", ie. does not contain PAGE_SIZE video ids then
2357 # we can assume that this page is the last one - there
2358 # are no more ids on further pages - no need to query
2361 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2366 all_ids_count = len(video_ids)
2367 playliststart = self._downloader.params.get('playliststart', 1) - 1
2368 playlistend = self._downloader.params.get('playlistend', -1)
2370 if playlistend == -1:
2371 video_ids = video_ids[playliststart:]
2373 video_ids = video_ids[playliststart:playlistend]
2375 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2376 (username, all_ids_count, len(video_ids)))
2378 for video_id in video_ids:
2379 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2382 class DepositFilesIE(InfoExtractor):
2383 """Information extractor for depositfiles.com"""
2385 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2387 def __init__(self, downloader=None):
2388 InfoExtractor.__init__(self, downloader)
2392 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2394 def report_download_webpage(self, file_id):
2395 """Report webpage download."""
2396 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2398 def report_extraction(self, file_id):
2399 """Report information extraction."""
2400 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2402 def _real_initialize(self):
2405 def _real_extract(self, url):
2406 # At this point we have a new file
2407 self._downloader.increment_downloads()
2409 file_id = url.split('/')[-1]
2410 # Rebuild url in english locale
2411 url = 'http://depositfiles.com/en/files/' + file_id
2413 # Retrieve file webpage with 'Free download' button pressed
2414 free_download_indication = { 'gateway_result' : '1' }
2415 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2417 self.report_download_webpage(file_id)
2418 webpage = urllib2.urlopen(request).read()
2419 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2420 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2423 # Search for the real file URL
2424 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2425 if (mobj is None) or (mobj.group(1) is None):
2426 # Try to figure out reason of the error.
2427 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2428 if (mobj is not None) and (mobj.group(1) is not None):
2429 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2430 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2432 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2435 file_url = mobj.group(1)
2436 file_extension = os.path.splitext(file_url)[1][1:]
2438 # Search for file title
2439 mobj = re.search(r'<b title="(.*?)">', webpage)
2441 self._downloader.trouble(u'ERROR: unable to extract title')
2443 file_title = mobj.group(1).decode('utf-8')
2446 # Process file information
2447 self._downloader.process_info({
2448 'id': file_id.decode('utf-8'),
2449 'url': file_url.decode('utf-8'),
2451 'upload_date': u'NA',
2452 'title': file_title,
2453 'stitle': file_title,
2454 'ext': file_extension.decode('utf-8'),
2458 except UnavailableVideoError, err:
2459 self._downloader.trouble(u'ERROR: unable to download file')
2461 class FacebookIE(InfoExtractor):
2462 """Information Extractor for Facebook"""
2464 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2465 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2466 _NETRC_MACHINE = 'facebook'
2467 _available_formats = ['highqual', 'lowqual']
2468 _video_extensions = {
2473 def __init__(self, downloader=None):
2474 InfoExtractor.__init__(self, downloader)
2478 return (re.match(FacebookIE._VALID_URL, url) is not None)
2480 def _reporter(self, message):
2481 """Add header and report message."""
2482 self._downloader.to_screen(u'[facebook] %s' % message)
2484 def report_login(self):
2485 """Report attempt to log in."""
2486 self._reporter(u'Logging in')
2488 def report_video_webpage_download(self, video_id):
2489 """Report attempt to download video webpage."""
2490 self._reporter(u'%s: Downloading video webpage' % video_id)
2492 def report_information_extraction(self, video_id):
2493 """Report attempt to extract video information."""
2494 self._reporter(u'%s: Extracting video information' % video_id)
2496 def _parse_page(self, video_webpage):
2497 """Extract video information from page"""
2499 data = {'title': r'class="video_title datawrap">(.*?)</',
2500 'description': r'<div class="datawrap">(.*?)</div>',
2501 'owner': r'\("video_owner_name", "(.*?)"\)',
2502 'upload_date': r'data-date="(.*?)"',
2503 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2506 for piece in data.keys():
2507 mobj = re.search(data[piece], video_webpage)
2508 if mobj is not None:
2509 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2513 for fmt in self._available_formats:
2514 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2515 if mobj is not None:
2516 # URL is in a Javascript segment inside an escaped Unicode format within
2517 # the generally utf-8 page
2518 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2519 video_info['video_urls'] = video_urls
2523 def _real_initialize(self):
2524 if self._downloader is None:
2529 downloader_params = self._downloader.params
2531 # Attempt to use provided username and password or .netrc data
2532 if downloader_params.get('username', None) is not None:
2533 useremail = downloader_params['username']
2534 password = downloader_params['password']
2535 elif downloader_params.get('usenetrc', False):
2537 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2538 if info is not None:
2542 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2543 except (IOError, netrc.NetrcParseError), err:
2544 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2547 if useremail is None:
2556 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2559 login_results = urllib2.urlopen(request).read()
2560 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2561 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2563 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2564 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2567 def _real_extract(self, url):
2568 mobj = re.match(self._VALID_URL, url)
2570 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2572 video_id = mobj.group('ID')
2575 self.report_video_webpage_download(video_id)
2576 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2578 page = urllib2.urlopen(request)
2579 video_webpage = page.read()
2580 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2581 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2584 # Start extracting information
2585 self.report_information_extraction(video_id)
2587 # Extract information
2588 video_info = self._parse_page(video_webpage)
2591 if 'owner' not in video_info:
2592 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2594 video_uploader = video_info['owner']
2597 if 'title' not in video_info:
2598 self._downloader.trouble(u'ERROR: unable to extract video title')
2600 video_title = video_info['title']
2601 video_title = video_title.decode('utf-8')
2602 video_title = sanitize_title(video_title)
2605 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2606 simple_title = simple_title.strip(ur'_')
2609 if 'thumbnail' not in video_info:
2610 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2611 video_thumbnail = ''
2613 video_thumbnail = video_info['thumbnail']
2617 if 'upload_date' in video_info:
2618 upload_time = video_info['upload_date']
2619 timetuple = email.utils.parsedate_tz(upload_time)
2620 if timetuple is not None:
2622 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2627 video_description = 'No description available.'
2628 if (self._downloader.params.get('forcedescription', False) and
2629 'description' in video_info):
2630 video_description = video_info['description']
2632 url_map = video_info['video_urls']
2633 if len(url_map.keys()) > 0:
2634 # Decide which formats to download
2635 req_format = self._downloader.params.get('format', None)
2636 format_limit = self._downloader.params.get('format_limit', None)
2638 if format_limit is not None and format_limit in self._available_formats:
2639 format_list = self._available_formats[self._available_formats.index(format_limit):]
2641 format_list = self._available_formats
2642 existing_formats = [x for x in format_list if x in url_map]
2643 if len(existing_formats) == 0:
2644 self._downloader.trouble(u'ERROR: no known formats available for video')
2646 if req_format is None:
2647 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2648 elif req_format == '-1':
2649 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2652 if req_format not in url_map:
2653 self._downloader.trouble(u'ERROR: requested format not available')
2655 video_url_list = [(req_format, url_map[req_format])] # Specific format
2657 for format_param, video_real_url in video_url_list:
2659 # At this point we have a new video
2660 self._downloader.increment_downloads()
2663 video_extension = self._video_extensions.get(format_param, 'mp4')
2665 # Find the video URL in fmt_url_map or conn paramters
2667 # Process video information
2668 self._downloader.process_info({
2669 'id': video_id.decode('utf-8'),
2670 'url': video_real_url.decode('utf-8'),
2671 'uploader': video_uploader.decode('utf-8'),
2672 'upload_date': upload_date,
2673 'title': video_title,
2674 'stitle': simple_title,
2675 'ext': video_extension.decode('utf-8'),
2676 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2677 'thumbnail': video_thumbnail.decode('utf-8'),
2678 'description': video_description.decode('utf-8'),
2681 except UnavailableVideoError, err:
2682 self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object):
    """Base class for post processors.

    A PostProcessor is attached to a downloader through the downloader's
    add_post_processor() method. Once a download finishes successfully,
    the downloader walks its chain of PostProcessors, calling run() on
    each: the first receives the download information, every later one
    receives whatever its predecessor returned. The chain stops as soon
    as a run() call returns None, or when the last processor has run.

    Like InfoExtractor objects, PostProcessors take part in a "mutual
    registration" scheme with their downloader.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary of the kind produced
        by InfoExtractors, extended with a "filepath" entry pointing at
        the downloaded file.

        Returning None halts the postprocessing chain; returning an
        information dictionary (possibly with some fields changed)
        forwards it to the next processor in the chain. A
        PostProcessingError raised here is handled by the downloader.
        """
        # Default behaviour: hand the information through untouched.
        return information
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that converts a downloaded video into an audio file
    by invoking the external ffmpeg/ffprobe binaries."""

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        # 'best' keeps the source codec when it is already aac/mp3.
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name reported by ffprobe, or None."""
        try:
            probe_cmd = ['ffprobe', '-show_streams', '--', path]
            proc = subprocess.Popen(probe_cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            probe_output = proc.communicate()[0]
            if proc.wait() != 0:
                return None
        except (IOError, OSError):
            # ffprobe missing or not runnable.
            return None
        audio_codec = None
        for probe_line in probe_output.split('\n'):
            # codec_name precedes codec_type within each stream section.
            if probe_line.startswith('codec_name='):
                audio_codec = probe_line.split('=')[1].strip()
            elif probe_line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to transcode path into out_path; True on success."""
        try:
            ffmpeg_cmd = (['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec]
                    + more_opts + ['--', out_path])
            retcode = subprocess.call(ffmpeg_cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
            return (retcode == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        if not self.run_ffmpeg(path, new_path, acodec, more_opts):
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
2812 ### MAIN PROGRAM ###
2813 if __name__ == '__main__':
2815 # Modules needed only when running the main program
2819 # Function to update the program file with the latest version from the repository.
2820 def update_self(downloader, filename):
2821 # Note: downloader only used for options
2822 if not os.access(filename, os.W_OK):
2823 sys.exit('ERROR: no write permissions on %s' % filename)
2825 downloader.to_screen('Updating to latest stable version...')
2827 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2828 latest_version = urllib.urlopen(latest_url).read().strip()
2829 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2830 newcontent = urllib.urlopen(prog_url).read()
2831 except (IOError, OSError), err:
2832 sys.exit('ERROR: unable to download latest version')
2834 stream = open(filename, 'w')
2835 stream.write(newcontent)
2837 except (IOError, OSError), err:
2838 sys.exit('ERROR: unable to overwrite current version')
2839 downloader.to_screen('Updated to version %s' % latest_version)
2841 # Parse command line
2842 parser = optparse.OptionParser(
2843 usage='Usage: %prog [options] url...',
2844 version='2011.08.04',
2845 conflict_handler='resolve',
2848 parser.add_option('-h', '--help',
2849 action='help', help='print this help text and exit')
2850 parser.add_option('-v', '--version',
2851 action='version', help='print program version and exit')
2852 parser.add_option('-U', '--update',
2853 action='store_true', dest='update_self', help='update this program to latest stable version')
2854 parser.add_option('-i', '--ignore-errors',
2855 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2856 parser.add_option('-r', '--rate-limit',
2857 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2858 parser.add_option('-R', '--retries',
2859 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2860 parser.add_option('--playlist-start',
2861 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2862 parser.add_option('--playlist-end',
2863 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2864 parser.add_option('--dump-user-agent',
2865 action='store_true', dest='dump_user_agent',
2866 help='display the current browser identification', default=False)
2868 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2869 authentication.add_option('-u', '--username',
2870 dest='username', metavar='USERNAME', help='account username')
2871 authentication.add_option('-p', '--password',
2872 dest='password', metavar='PASSWORD', help='account password')
2873 authentication.add_option('-n', '--netrc',
2874 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2875 parser.add_option_group(authentication)
2877 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2878 video_format.add_option('-f', '--format',
2879 action='store', dest='format', metavar='FORMAT', help='video format code')
2880 video_format.add_option('--all-formats',
2881 action='store_const', dest='format', help='download all available video formats', const='-1')
2882 video_format.add_option('--max-quality',
2883 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2884 parser.add_option_group(video_format)
2886 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2887 verbosity.add_option('-q', '--quiet',
2888 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2889 verbosity.add_option('-s', '--simulate',
2890 action='store_true', dest='simulate', help='do not download video', default=False)
2891 verbosity.add_option('-g', '--get-url',
2892 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2893 verbosity.add_option('-e', '--get-title',
2894 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2895 verbosity.add_option('--get-thumbnail',
2896 action='store_true', dest='getthumbnail',
2897 help='simulate, quiet but print thumbnail URL', default=False)
2898 verbosity.add_option('--get-description',
2899 action='store_true', dest='getdescription',
2900 help='simulate, quiet but print video description', default=False)
# --- Command-line interface definition (optparse; this script predates argparse) ---
# Tail of the "Verbosity / Simulation" OptionGroup (its construction is above this
# view), then the "Filesystem" and "Post-processing" groups, then parse_args().
# NOTE(review): the embedded original line numbers skip values (2910, 2935, 2942),
# so blank/structural lines are elided from this view.
2901 verbosity.add_option('--get-filename',
2902 action='store_true', dest='getfilename',
2903 help='simulate, quiet but print output filename', default=False)
2904 verbosity.add_option('--no-progress',
2905 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2906 verbosity.add_option('--console-title',
2907 action='store_true', dest='consoletitle',
2908 help='display progress in console titlebar', default=False)
2909 parser.add_option_group(verbosity)
# Filesystem options control how the output filename is built and how existing /
# partial files are treated.  -t/-l/-A are mutually constrained with -o below
# (see the "Conflicting, missing and erroneous options" validation section).
2911 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2912 filesystem.add_option('-t', '--title',
2913 action='store_true', dest='usetitle', help='use title in file name', default=False)
2914 filesystem.add_option('-l', '--literal',
2915 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2916 filesystem.add_option('-A', '--auto-number',
2917 action='store_true', dest='autonumber',
2918 help='number downloaded files starting from 00000', default=False)
2919 filesystem.add_option('-o', '--output',
2920 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2921 filesystem.add_option('-a', '--batch-file',
2922 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2923 filesystem.add_option('-w', '--no-overwrites',
2924 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2925 filesystem.add_option('-c', '--continue',
2926 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2927 filesystem.add_option('--cookies',
2928 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2929 filesystem.add_option('--no-part',
2930 action='store_true', dest='nopart', help='do not use .part files', default=False)
# --no-mtime uses store_false with default=True: updatetime stays True unless
# the flag is given, i.e. mtime is set from Last-modified by default.
2931 filesystem.add_option('--no-mtime',
2932 action='store_false', dest='updatetime',
2933 help='do not use the Last-modified header to set the file modification time', default=True)
2934 parser.add_option_group(filesystem)
2936 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2937 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2938 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2939 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2940 help='"best", "aac" or "mp3"; best by default')
2941 parser.add_option_group(postproc)
# Everything after this point consumes the parsed options/positional URLs.
2943 (opts, args) = parser.parse_args()
# Select an in-memory CookieJar when no --cookies file was given, otherwise a
# MozillaCookieJar bound to that file (loaded only if it exists and is readable,
# so a not-yet-created file is not an error).
# NOTE(review): the original numbering skips 2948-2949, 2952 and 2955-2956 here,
# so the `else:` / `try:` / `jar.load()` lines of this construct are elided from
# this view; the `except` below belongs to that elided `try:`.
2945 # Open appropriate CookieJar
2946 if opts.cookiefile is None:
2947 jar = cookielib.CookieJar()
2950 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2951 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
# Python 2 `except Exc, name:` syntax; any I/O failure loading cookies is fatal.
2953 except (IOError, OSError), err:
2954 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string this script sends (Python 2 `print`
# statement).  std_headers is the module-level default-header dict (see file head).
2957 if opts.dump_user_agent:
2958 print std_headers['User-Agent']
# Install a process-wide urllib2 opener: honors proxy environment variables
# (ProxyHandler with no args), shares the cookie jar chosen above, and routes
# through the custom YoutubeDLHandler (defined elsewhere in this file).
2961 # General configuration
2962 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2963 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
# Global socket timeout so a stalled connection cannot hang downloads forever.
2964 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# Collect URLs from --batch-file (or stdin when the filename is '-'), strip
# whitespace, and drop blank lines and comment lines starting with '#', '/' or
# ';'.  Batch URLs are prepended to the positional command-line URLs.
# NOTE(review): numbering skips 2967, 2969, 2971-2972 and 2977 — the `try:`,
# the stdin branch (presumably `batchfd = sys.stdin`) and the `except` opening
# are elided from this view; the sys.exit below is that error path.
2966 # Batch file verification
2968 if opts.batchfile is not None:
2970 if opts.batchfile == '-':
2973 batchfd = open(opts.batchfile, 'r')
2974 batchurls = batchfd.readlines()
2975 batchurls = [x.strip() for x in batchurls]
2976 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2978 sys.exit(u'ERROR: batch file could not be read')
# all_urls is the final work list consumed by fd.download() below.
2979 all_urls = batchurls + args
# Validate the parsed options: reject conflicting combinations via
# parser.error() (which exits), prompt for a password when only a username was
# given, and normalize numeric options (ratelimit, retries, playlist bounds).
# NOTE(review): numbering skips 2998, 3002, 3005, 3008 and 3011 — the `try:`
# lines wrapping the long() conversions (and the parser.error calls inside the
# playlist range checks) are elided from this view.
2981 # Conflicting, missing and erroneous options
2982 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2983 parser.error(u'using .netrc conflicts with giving username/password')
2984 if opts.password is not None and opts.username is None:
2985 parser.error(u'account username missing')
2986 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2987 parser.error(u'using output template conflicts with using title, literal title or auto number')
2988 if opts.usetitle and opts.useliteral:
2989 parser.error(u'using title conflicts with using literal title')
# Username without password: ask interactively without echoing.
2990 if opts.username is not None and opts.password is None:
2991 opts.password = getpass.getpass(u'Type account password and press return:')
# parse_bytes turns e.g. '50k' into a byte count; None signals a parse failure.
2992 if opts.ratelimit is not None:
2993 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2994 if numeric_limit is None:
2995 parser.error(u'invalid rate limit specified')
2996 opts.ratelimit = numeric_limit
# Python 2 long(); non-numeric input raises and becomes a parser.error.
2997 if opts.retries is not None:
2999 opts.retries = long(opts.retries)
3000 except (TypeError, ValueError), err:
3001 parser.error(u'invalid retry count specified')
# Playlist bounds: start must be >= 1; end is either -1 (no limit) or a value
# >= 1 that is not before the start.
3003 opts.playliststart = long(opts.playliststart)
3004 if opts.playliststart <= 0:
3006 except (TypeError, ValueError), err:
3007 parser.error(u'invalid playlist start number specified')
3009 opts.playlistend = long(opts.playlistend)
3010 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3012 except (TypeError, ValueError), err:
3013 parser.error(u'invalid playlist end number specified')
# Audio format whitelist must match what FFmpegExtractAudioPP supports.
3014 if opts.extractaudio:
3015 if opts.audioformat not in ['best', 'aac', 'mp3']:
3016 parser.error(u'invalid audio format specified')
# Instantiate one information extractor (IE) per supported site.  Several IEs
# wrap another: the Metacafe/playlist/user/search extractors delegate actual
# video extraction to youtube_ie (and likewise for the Google/Yahoo search IEs).
# Registration ORDER happens later, in the fd.add_info_extractor() calls.
3018 # Information extractors
3019 vimeo_ie = VimeoIE()
3020 youtube_ie = YoutubeIE()
3021 metacafe_ie = MetacafeIE(youtube_ie)
3022 dailymotion_ie = DailymotionIE()
3023 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3024 youtube_user_ie = YoutubeUserIE(youtube_ie)
3025 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3026 google_ie = GoogleIE()
3027 google_search_ie = GoogleSearchIE(google_ie)
3028 photobucket_ie = PhotobucketIE()
3029 yahoo_ie = YahooIE()
3030 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3031 deposit_files_ie = DepositFilesIE()
3032 facebook_ie = FacebookIE()
# GenericIE is the catch-all fallback; it must be registered last (see below).
3033 generic_ie = GenericIE()
# Build the FileDownloader with its configuration dict.  Any of the --get-*
# flags implies both 'quiet' and 'simulate' (print the requested field, do not
# actually download).
# NOTE(review): numbering skips 3071 — the closing `})` of this dict literal is
# elided from this view.
3036 fd = FileDownloader({
3037 'usenetrc': opts.usenetrc,
3038 'username': opts.username,
3039 'password': opts.password,
3040 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3041 'forceurl': opts.geturl,
3042 'forcetitle': opts.gettitle,
3043 'forcethumbnail': opts.getthumbnail,
3044 'forcedescription': opts.getdescription,
3045 'forcefilename': opts.getfilename,
3046 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3047 'format': opts.format,
3048 'format_limit': opts.format_limit,
# Pre-ternary Python 2 idiom: a chain of `and`/`or` picks the first applicable
# output template.  Priority: explicit -o template (decoded from the locale
# encoding), then format-qualified templates for --format -1 ("all formats"),
# then title/literal/autonumber combinations, then the bare '%(id)s.%(ext)s'.
# %(stitle)s is the sanitized title, %(title)s the literal one.
3049 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3050 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3051 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3052 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3053 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3054 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3055 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3056 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3057 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3058 or u'%(id)s.%(ext)s'),
3059 'ignoreerrors': opts.ignoreerrors,
3060 'ratelimit': opts.ratelimit,
3061 'nooverwrites': opts.nooverwrites,
3062 'retries': opts.retries,
3063 'continuedl': opts.continue_dl,
3064 'noprogress': opts.noprogress,
3065 'playliststart': opts.playliststart,
3066 'playlistend': opts.playlistend,
# '-o -' means output to stdout, so progress/log output must go to stderr.
3067 'logtostderr': opts.outtmpl == '-',
3068 'consoletitle': opts.consoletitle,
3069 'nopart': opts.nopart,
3070 'updatetime': opts.updatetime,
# Register extractors in match-priority order: the more specific ones
# (search/playlist/user, and Metacafe which may redirect to YouTube) come
# before the plain site extractors; GenericIE is appended last as the
# catch-all fallback.
3072 fd.add_info_extractor(vimeo_ie)
3073 fd.add_info_extractor(youtube_search_ie)
3074 fd.add_info_extractor(youtube_pl_ie)
3075 fd.add_info_extractor(youtube_user_ie)
3076 fd.add_info_extractor(metacafe_ie)
3077 fd.add_info_extractor(dailymotion_ie)
3078 fd.add_info_extractor(youtube_ie)
3079 fd.add_info_extractor(google_ie)
3080 fd.add_info_extractor(google_search_ie)
3081 fd.add_info_extractor(photobucket_ie)
3082 fd.add_info_extractor(yahoo_ie)
3083 fd.add_info_extractor(yahoo_search_ie)
3084 fd.add_info_extractor(deposit_files_ie)
3085 fd.add_info_extractor(facebook_ie)
3087 # This must come last since it's the
3088 # fallback if none of the others work
3089 fd.add_info_extractor(generic_ie)
# Optional ffmpeg-based audio extraction post-processor (--extract-audio).
3092 if opts.extractaudio:
3093 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update-self replaces this very script (sys.argv[0]) with the latest release.
3096 if opts.update_self:
3097 update_self(fd, sys.argv[0])
# With no URLs, exit with a usage error — unless the run was only a self-update.
3100 if len(all_urls) < 1:
3101 if not opts.update_self:
3102 parser.error(u'you must provide at least one URL')
# Run the downloads; retcode presumably becomes the process exit status
# (the sys.exit(retcode) line is past the end of this view — confirm).
3105 retcode = fd.download(all_urls)
# Persist cookies back to the --cookies file.
# NOTE(review): numbering skips 3109-3110 — the `try:` / `jar.save()` lines are
# elided from this view; the `except` below belongs to that elided `try:`.
3107 # Dump cookie jar if requested
3108 if opts.cookiefile is not None:
3111 except (IOError, OSError), err:
3112 sys.exit(u'ERROR: unable to save cookie jar')
# Handlers for the enclosing try: (opened above this view; skipped numbers
# 3113-3115 and 3117 hide its surrounding lines).  DownloadError's body is
# elided here; SameFileError and Ctrl-C abort with an error message.
3116 except DownloadError:
3118 except SameFileError:
3119 sys.exit(u'ERROR: fixed output name but more than one file to download')
3120 except KeyboardInterrupt:
3121 sys.exit(u'\nERROR: Interrupted by user')