2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
15 __license__ = 'Public Domain'
16 __version__ = '2011.08.28-phihag'
18 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
46 except ImportError: # Python 2.4
49 import cStringIO as StringIO
53 # parse_qs was moved from the cgi module to the urlparse module recently.
55 from urlparse import parse_qs
57 from cgi import parse_qs
65 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
66 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
67 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
68 'Accept-Encoding': 'gzip, deflate',
69 'Accept-Language': 'en-us,en;q=0.5',
# Whitelist of characters kept when building a "simplified" title.
# NOTE(review): str.decode('ascii') is Python 2 only — yields unicode here.
72 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# --- Fallback pure-Python JSON decoder ("trivialjson") used when the stdlib
# --- json module is not importable (Python < 2.6).
# NOTE(review): this listing is sampled — each line carries its original file
# line number and many interior lines are elided, so the defs below are
# fragmentary; do not infer missing control flow from what is visible.
76 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# raiseError: abort decoding with a ValueError that pinpoints the failure offset.
82 def raiseError(msg, i):
83 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
# skipSpace: advance index past JSON whitespace; errors out on premature end
# when more input is expected.
84 def skipSpace(i, expectMore=True):
85 while i < len(s) and s[i] in ' \t\r\n':
89 raiseError('Premature end', i)
# decodeEscape: turn a backslash escape (incl. \uXXXX and surrogate pairs)
# into the character it denotes.
91 def decodeEscape(match):
107 return unichr(int(esc[1:5], 16))
# len 11 == '\uHHHH\uHHHH': a UTF-16 surrogate pair encoded as two escapes.
108 if len(esc) == 5+6 and esc[5:7] == '\\u':
109 hi = int(esc[1:5], 16)
110 low = int(esc[7:11], 16)
# Standard surrogate-pair combination into a single code point.
111 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
112 raise ValueError('Unknown escape ' + str(esc))
# Count trailing backslashes to decide whether a quote is escaped.
119 while s[e-bslashes-1] == '\\':
121 if bslashes % 2 == 1:
# Regex matches surrogate-pair escapes first, then single \uHHHH, then any
# one-char escape.
125 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
126 stri = rexp.sub(decodeEscape, s[i:e])
132 if s[i] == '}': # Empty dictionary
136 raiseError('Expected a string object key', i)
137 i,key = parseString(i)
139 if i >= len(s) or s[i] != ':':
140 raiseError('Expected a colon', i)
147 raiseError('Expected comma or closing curly brace', i)
152 if s[i] == ']': # Empty array
157 i = skipSpace(i) # Raise exception if premature end
161 raiseError('Expected a comma or closing bracket', i)
# parseDiscrete: the three JSON keyword literals.
163 def parseDiscrete(i):
164 for k,v in {'true': True, 'false': False, 'null': None}.items():
165 if s.startswith(k, i):
167 raiseError('Not a boolean (or null)', i)
# Number grammar per JSON: optional sign, int part, optional fraction/exponent.
169 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
171 raiseError('Not a number', i)
173 if '.' in nums or 'e' in nums or 'E' in nums:
174 return (i+len(nums), float(nums))
175 return (i+len(nums), int(nums))
# Dispatch on the first character of a value; anything else is tried as a number.
176 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
179 i,res = CHARMAP.get(s[i], parseNumber)(i)
180 i = skipSpace(i, False)
184 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
# NOTE(review): fragmentary listing — interior lines elided; the generator
# body and the fallback path of preferredencoding() are not visible here.
187 def preferredencoding():
188 """Get preferred encoding.
190 Returns the best encoding scheme for the system, based on
191 locale.getpreferredencoding() and some further tweaks.
# Inner generator lets the locale lookup be retried/validated lazily.
193 def yield_preferredencoding():
195 pref = locale.getpreferredencoding()
# .next() is Python 2 generator protocol; pulls the first (cached) value.
201 return yield_preferredencoding().next()
# htmlentity_transform: re.sub() callback that maps one HTML entity match to
# the unicode character it names.
203 def htmlentity_transform(matchobj):
204 """Transforms an HTML entity to a Unicode character.
206 This function receives a match object and is intended to be used with
207 the re.sub() function.
209 entity = matchobj.group(1)
211 # Known non-numeric HTML entity
212 if entity in htmlentitydefs.name2codepoint:
213 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: decimal (#123) or hexadecimal (#x7B).
216 mobj = re.match(ur'(?u)#(x?\d+)', entity)
218 numstr = mobj.group(1)
219 if numstr.startswith(u'x'):
# Prefixing '0' turns 'x41' into '0x41' for int parsing with base 16.
# NOTE(review): the branch assigning `base` sits on elided lines.
221 numstr = u'0%s' % numstr
224 return unichr(long(numstr, base))
226 # Unknown entity in name, return its literal representation
227 return (u'&%s;' % entity)
229 def sanitize_title(utitle):
230 """Sanitizes a video title so it could be used as part of a filename."""
231 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
232 return utitle.replace(unicode(os.sep), u'%')
# NOTE(review): fragmentary listing — the try/else scaffolding of
# sanitize_open and parts of each class body below sit on elided lines.
234 def sanitize_open(filename, open_mode):
235 """Try to open the given filename, and slightly tweak it if this fails.
237 Attempts to open the given filename. If this fails, it tries to change
238 the filename slightly, step by step, until it's either able to open it
239 or it fails and raises a final exception, like the standard open()
242 It returns the tuple (stream, definitive_file_name).
# '-' convention: write to stdout; on Windows, switch stdout to binary mode
# first so video data is not newline-mangled.
246 if sys.platform == 'win32':
248 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
249 return (sys.stdout, filename)
250 stream = open(filename, open_mode)
251 return (stream, filename)
252 except (IOError, OSError), err:
253 # In case of error, try to remove win32 forbidden chars
254 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
256 # An exception here should be caught in the caller
257 stream = open(filename, open_mode)
258 return (stream, filename)
# timeconvert: RFC 2822 date string -> Unix timestamp (or None on failure —
# the initializer and return sit on elided lines; verify against full source).
260 def timeconvert(timestr):
261 """Convert RFC 2822 defined time string into system timestamp"""
263 timetuple = email.utils.parsedate_tz(timestr)
264 if timetuple is not None:
265 timestamp = email.utils.mktime_tz(timetuple)
# --- Exception hierarchy used throughout the downloader. ---
268 class DownloadError(Exception):
269 """Download Error exception.
271 This exception may be thrown by FileDownloader objects if they are not
272 configured to continue on errors. They will contain the appropriate
277 class SameFileError(Exception):
278 """Same File exception.
280 This exception will be thrown by FileDownloader objects if they detect
281 multiple files would have to be downloaded to the same file on disk.
285 class PostProcessingError(Exception):
286 """Post Processing exception.
288 This exception may be raised by PostProcessor's .run() method to
289 indicate an error in the postprocessing task.
293 class UnavailableVideoError(Exception):
294 """Unavailable Format exception.
296 This exception will be thrown when a video is requested
297 in a format that is not available for that video.
301 class ContentTooShortError(Exception):
302 """Content Too Short exception.
304 This exception may be raised by FileDownloader objects when a file they
305 download is too small for what the server announced first, indicating
306 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    """Record the byte counts: actually received vs. announced by the server."""
    self.downloaded, self.expected = downloaded, expected
# NOTE(review): fragmentary listing — method headers for deflate() and the
# try/except framing in http_response sit on elided lines.
316 class YoutubeDLHandler(urllib2.HTTPHandler):
317 """Handler for HTTP requests and responses.
319 This class, when installed with an OpenerDirector, automatically adds
320 the standard headers to every HTTP request and handles gzipped and
321 deflated responses from web servers. If compression is to be avoided in
322 a particular request, the original request in the program code only has
323 to include the HTTP header "Youtubedl-No-Compression", which will be
324 removed before making the real request.
326 Part of this code was copied from:
328 http://techknack.net/python-urllib2-handlers/
330 Andrew Rowls, the author of that code, agreed to release it to the
# Raw-deflate first (negative wbits = no zlib header), then zlib-wrapped.
337 return zlib.decompress(data, -zlib.MAX_WBITS)
339 return zlib.decompress(data)
# addinfourl_wrapper: build an addinfourl carrying the status code even on
# Pythons whose addinfourl constructor does not accept one.
342 def addinfourl_wrapper(stream, headers, url, code):
343 if hasattr(urllib2.addinfourl, 'getcode'):
344 return urllib2.addinfourl(stream, headers, url, code)
345 ret = urllib2.addinfourl(stream, headers, url)
# http_request: inject std_headers, and strip Accept-encoding when the
# sentinel Youtubedl-no-compression header is present.
349 def http_request(self, req):
350 for h in std_headers:
353 req.add_header(h, std_headers[h])
354 if 'Youtubedl-no-compression' in req.headers:
355 if 'Accept-encoding' in req.headers:
356 del req.headers['Accept-encoding']
357 del req.headers['Youtubedl-no-compression']
# http_response: transparently decompress gzip/deflate bodies, preserving
# the original response's headers, URL, code and msg.
360 def http_response(self, req, resp):
363 if resp.headers.get('Content-encoding', '') == 'gzip':
364 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
365 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
366 resp.msg = old_resp.msg
368 if resp.headers.get('Content-encoding', '') == 'deflate':
369 gz = StringIO.StringIO(self.deflate(resp.read()))
370 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
371 resp.msg = old_resp.msg
# NOTE(review): fragmentary listing — interior lines elided throughout;
# comments below annotate only what is visible.
374 class FileDownloader(object):
375 """File Downloader class.
377 File downloader objects are the ones responsible of downloading the
378 actual video file and writing it to disk if the user has requested
379 it, among some other tasks. In most cases there should be one per
380 program. As, given a video URL, the downloader doesn't know how to
381 extract all the needed information, task that InfoExtractors do, it
382 has to pass the URL to one of them.
384 For this, file downloader objects have a method that allows
385 InfoExtractors to be registered in a given order. When it is passed
386 a URL, the file downloader handles it to the first InfoExtractor it
387 finds that reports being able to handle it. The InfoExtractor extracts
388 all the information about the video or videos the URL refers to, and
389 asks the FileDownloader to process the video information, possibly
390 downloading the video.
392 File downloaders accept a lot of parameters. In order not to saturate
393 the object constructor with arguments, it receives a dictionary of
394 options instead. These options are available through the params
395 attribute for the InfoExtractors to use. The FileDownloader also
396 registers itself as the downloader in charge for the InfoExtractors
397 that are added to it, so this is a "mutual registration".
401 username: Username for authentication purposes.
402 password: Password for authentication purposes.
403 usenetrc: Use netrc for authentication instead.
404 quiet: Do not print messages to stdout.
405 forceurl: Force printing final URL.
406 forcetitle: Force printing title.
407 forcethumbnail: Force printing thumbnail URL.
408 forcedescription: Force printing description.
409 forcefilename: Force printing final filename.
410 simulate: Do not download the video files.
411 format: Video format code.
412 format_limit: Highest quality format to try.
413 outtmpl: Template for output names.
414 ignoreerrors: Do not stop on download errors.
415 ratelimit: Download speed limit, in bytes/sec.
416 nooverwrites: Prevent overwriting files.
417 retries: Number of times to retry for HTTP error 5xx
418 continuedl: Try to continue downloads if possible.
419 noprogress: Do not print the progress bar.
420 playliststart: Playlist item to start at.
421 playlistend: Playlist item to end at.
422 logtostderr: Log messages to stderr instead of stdout.
423 consoletitle: Display progress in console window's titlebar.
424 nopart: Do not use temporary .part files.
425 updatetime: Use the Last-modified header to set output file timestamps.
426 writedescription: Write the video description to a .description file
427 writeinfojson: Write the video description to a .info.json file
# Class-level defaults; real values are set per instance in __init__.
433 _download_retcode = None
434 _num_downloads = None
437 def __init__(self, params):
438 """Create a FileDownloader object with the given options."""
441 self._download_retcode = 0
442 self._num_downloads = 0
# Bool-as-index trick: False->stdout, True->stderr.
443 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
447 def pmkdir(filename):
448 """Create directory components in filename. Similar to Unix "mkdir -p"."""
449 components = filename.split(os.sep)
# Build every ancestor path prefix, shortest first.
450 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
451 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
452 for dir in aggregate:
453 if not os.path.exists(dir):
# format_bytes: human-readable size, e.g. '1.50M' (suffix chosen by log base 1024).
457 def format_bytes(bytes):
460 if type(bytes) is str:
465 exponent = long(math.log(bytes, 1024.0))
466 suffix = 'bkMGTPEZY'[exponent]
467 converted = float(bytes) / float(1024**exponent)
468 return '%.2f%s' % (converted, suffix)
# calc_percent: right-aligned percentage string for the progress bar.
471 def calc_percent(byte_counter, data_len):
474 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# calc_eta: remaining time as MM:SS based on average rate so far.
477 def calc_eta(start, now, total, current):
481 if current == 0 or dif < 0.001: # One millisecond
483 rate = float(current) / dif
484 eta = long((float(total) - float(current)) / rate)
485 (eta_mins, eta_secs) = divmod(eta, 60)
488 return '%02d:%02d' % (eta_mins, eta_secs)
# calc_speed: average transfer speed, formatted via format_bytes.
491 def calc_speed(start, now, bytes):
493 if bytes == 0 or dif < 0.001: # One millisecond
494 return '%10s' % '---b/s'
495 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# best_block_size: adapt the read size toward the observed rate, clamped
# between half and double the previous block and at most 4 MB.
498 def best_block_size(elapsed_time, bytes):
499 new_min = max(bytes / 2.0, 1.0)
500 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
501 if elapsed_time < 0.001:
503 rate = bytes / elapsed_time
511 def parse_bytes(bytestr):
512 """Parse a string indicating a byte quantity into a long integer."""
513 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
516 number = float(matchobj.group(1))
# Suffix position in 'bkmgtpezy' gives the power of 1024.
517 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
518 return long(round(number * multiplier))
# Mutual registration: the IE/PP is appended (on an elided line) and told
# about this downloader.
520 def add_info_extractor(self, ie):
521 """Add an InfoExtractor object to the end of the list."""
523 ie.set_downloader(self)
525 def add_post_processor(self, pp):
526 """Add a PostProcessor object to the end of the chain."""
528 pp.set_downloader(self)
530 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
531 """Print message to stdout if not in quiet mode."""
533 if not self.params.get('quiet', False):
534 terminator = [u'\n', u''][skip_eol]
# Trailing comma on print suppresses the extra newline (Python 2).
535 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
536 self._screen_file.flush()
537 except (UnicodeEncodeError), err:
538 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Emit *message* on stderr, encoded for the current locale."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
545 def to_cons_title(self, message):
546 """Set console/terminal window title to message."""
# Opt-in feature; early return (on an elided line) when disabled.
547 if not self.params.get('consoletitle', False):
549 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
550 # c_wchar_p() might not be necessary if `message` is
551 # already of type unicode()
552 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
553 elif 'TERM' in os.environ:
# xterm OSC 0 escape sequence sets the window/icon title.
554 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
556 def fixed_template(self):
557 """Checks if the output template is fixed."""
558 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
# NOTE(review): fragmentary listing — try/except framing and several returns
# in the methods below sit on elided lines.
560 def trouble(self, message=None):
561 """Determine action to take when a download problem appears.
563 Depending on if the downloader has been configured to ignore
564 download errors or not, this method may throw an exception or
565 not when errors are found, after printing the message.
567 if message is not None:
568 self.to_stderr(message)
569 if not self.params.get('ignoreerrors', False):
570 raise DownloadError(message)
# ignoreerrors: record failure in the process return code and carry on.
571 self._download_retcode = 1
573 def slow_down(self, start_time, byte_counter):
574 """Sleep if the download speed is over the rate limit."""
575 rate_limit = self.params.get('ratelimit', None)
576 if rate_limit is None or byte_counter == 0:
579 elapsed = now - start_time
582 speed = float(byte_counter) / elapsed
583 if speed > rate_limit:
# Sleep exactly long enough that the average speed falls to the limit.
584 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
586 def temp_name(self, filename):
587 """Returns a temporary filename for the given filename."""
# No .part file for stdout, when disabled, or when the target exists but
# is not a regular file (the plain name is returned on an elided line).
588 if self.params.get('nopart', False) or filename == u'-' or \
589 (os.path.exists(filename) and not os.path.isfile(filename)):
591 return filename + u'.part'
# undo_temp_name: strip a trailing u'.part' if present.
593 def undo_temp_name(self, filename):
594 if filename.endswith(u'.part'):
595 return filename[:-len(u'.part')]
598 def try_rename(self, old_filename, new_filename):
600 if old_filename == new_filename:
602 os.rename(old_filename, new_filename)
603 except (IOError, OSError), err:
604 self.trouble(u'ERROR: unable to rename file')
606 def try_utime(self, filename, last_modified_hdr):
607 """Try to set the last-modified time of the given file."""
608 if last_modified_hdr is None:
610 if not os.path.isfile(filename):
612 timestr = last_modified_hdr
615 filetime = timeconvert(timestr)
# Keep atime current, set mtime from the server's Last-modified header.
619 os.utime(filename,(time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce that the video description file is being written."""
    msg = u'[info] Writing video description to: %s' % descfn
    self.to_screen(msg, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """Announce that the .info.json metadata file is being written."""
    msg = u'[info] Video description metadata as JSON to: %s' % infofn
    self.to_screen(msg, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Announce the file the download will be written to."""
    msg = u'[download] Destination: %s' % filename
    self.to_screen(msg, ignore_encoding_errors=True)
635 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
636 """Report download progress."""
# noprogress: skip the bar entirely (the return sits on an elided line).
637 if self.params.get('noprogress', False):
# '\r' + skip_eol redraws the same console line on every update.
639 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
640 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
641 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
642 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce that the download resumes at byte offset *resume_len*."""
    msg = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(msg)
def report_retry(self, count, retries):
    """Announce a retry after a server-side (5xx) HTTP error."""
    msg = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(msg)
652 def report_file_already_downloaded(self, file_name):
653 """Report file has already been fully downloaded."""
# try: sits on an elided line — the fallback message below handles names
# that cannot be encoded for the console.
655 self.to_screen(u'[download] %s has already been downloaded' % file_name)
656 except (UnicodeEncodeError), err:
657 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that resuming the partial download was not possible."""
    notice = u'[download] Unable to resume'
    self.to_screen(notice)
663 def report_finish(self):
664 """Report download finished."""
# With noprogress, print the one-line summary; the non-noprogress branch
# sits on elided lines.
665 if self.params.get('noprogress', False):
666 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Advance the per-run ordinal used for %(autonumber)s in filenames."""
    self._num_downloads = self._num_downloads + 1
# NOTE(review): fragmentary listing — try/except framing, returns, and loop
# headers throughout the methods below sit on elided lines.
674 def prepare_filename(self, info_dict):
675 """Generate the output filename."""
677 template_dict = dict(info_dict)
# Extra template fields available beyond what the IE supplied.
678 template_dict['epoch'] = unicode(long(time.time()))
679 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
680 filename = self.params['outtmpl'] % template_dict
682 except (ValueError, KeyError), err:
683 self.trouble(u'ERROR: invalid system charset or erroneous output template')
686 def process_info(self, info_dict):
687 """Process a single dictionary returned by an InfoExtractor."""
688 filename = self.prepare_filename(info_dict)
689 # Do nothing else if in simulate mode
690 if self.params.get('simulate', False):
# force* options print the requested field(s) instead of downloading.
692 if self.params.get('forcetitle', False):
693 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
694 if self.params.get('forceurl', False):
695 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
696 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
697 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
698 if self.params.get('forcedescription', False) and 'description' in info_dict:
699 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
700 if self.params.get('forcefilename', False) and filename is not None:
701 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('nooverwrites', False) and os.path.exists(filename):
708 self.to_stderr(u'WARNING: file exists and will be skipped')
712 self.pmkdir(filename)
713 except (OSError, IOError), err:
714 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
# Optional sidecar: plain-text description file.
717 if self.params.get('writedescription', False):
719 descfn = filename + '.description'
720 self.report_writedescription(descfn)
721 descfile = open(descfn, 'wb')
723 descfile.write(info_dict['description'].encode('utf-8'))
726 except (OSError, IOError):
727 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
# Optional sidecar: JSON metadata file (needs a working json module).
730 if self.params.get('writeinfojson', False):
731 infofn = filename + '.info.json'
732 self.report_writeinfojson(infofn)
735 except (NameError,AttributeError):
736 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
739 infof = open(infofn, 'wb')
741 json.dump(info_dict, infof)
744 except (OSError, IOError):
745 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
# Actual download; URL is byte-encoded for urllib2.
749 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
750 except (OSError, IOError), err:
751 raise UnavailableVideoError
752 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
753 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
755 except (ContentTooShortError, ), err:
756 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
761 self.post_process(filename, info_dict)
762 except (PostProcessingError), err:
763 self.trouble(u'ERROR: postprocessing: %s' % str(err))
766 def download(self, url_list):
767 """Download a given list of URLs."""
# A fixed (placeholder-free) template cannot hold more than one download.
768 if len(url_list) > 1 and self.fixed_template():
769 raise SameFileError(self.params['outtmpl'])
772 suitable_found = False
774 # Go to next InfoExtractor if not suitable
775 if not ie.suitable(url):
778 # Suitable InfoExtractor found
779 suitable_found = True
781 # Extract information from URL and process it
784 # Suitable InfoExtractor had been found; go to next URL
787 if not suitable_found:
788 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
790 return self._download_retcode
792 def post_process(self, filename, ie_info):
793 """Run the postprocessing chain on the given file."""
795 info['filepath'] = filename
# _download_with_rtmpdump: delegate rtmp:// URLs to the external rtmpdump
# binary, resuming with -e/-k as long as it reports a resumable exit code.
801 def _download_with_rtmpdump(self, filename, url, player_url):
802 self.report_destination(filename)
803 tmpfilename = self.temp_name(filename)
805 # Check for rtmpdump first
807 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
808 except (OSError, IOError):
809 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
812 # Download using rtmpdump. rtmpdump returns exit code 2 when
813 # the connection was interrumpted and resuming appears to be
814 # possible. This is part of rtmpdump's normal usage, AFAIK.
815 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
816 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
817 while retval == 2 or retval == 1:
818 prevsize = os.path.getsize(tmpfilename)
819 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
820 time.sleep(5.0) # This seems to be needed
821 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
822 cursize = os.path.getsize(tmpfilename)
# No progress between retries while rtmpdump still claims success: stop.
823 if prevsize == cursize and retval == 1:
826 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
827 self.try_rename(tmpfilename, filename)
830 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
833 def _do_download(self, filename, url, player_url):
834 # Check file already present
835 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
836 self.report_file_already_downloaded(filename)
839 # Attempt to download using rtmpdump
840 if url.startswith('rtmp'):
841 return self._download_with_rtmpdump(filename, url, player_url)
843 tmpfilename = self.temp_name(filename)
847 # Do not include the Accept-Encoding header
# Sentinel header stripped by YoutubeDLHandler.http_request.
848 headers = {'Youtubedl-no-compression': 'True'}
849 basic_request = urllib2.Request(url, None, headers)
850 request = urllib2.Request(url, None, headers)
852 # Establish possible resume length
853 if os.path.isfile(tmpfilename):
854 resume_len = os.path.getsize(tmpfilename)
858 # Request parameters in case of being able to resume
859 if self.params.get('continuedl', False) and resume_len != 0:
860 self.report_resuming_byte(resume_len)
861 request.add_header('Range','bytes=%d-' % resume_len)
865 retries = self.params.get('retries', 0)
866 while count <= retries:
867 # Establish connection
869 data = urllib2.urlopen(request)
871 except (urllib2.HTTPError, ), err:
872 if (err.code < 500 or err.code >= 600) and err.code != 416:
873 # Unexpected HTTP error
875 elif err.code == 416:
876 # Unable to resume (requested range not satisfiable)
878 # Open the connection again without the range header
879 data = urllib2.urlopen(basic_request)
880 content_length = data.info()['Content-Length']
881 except (urllib2.HTTPError, ), err:
882 if err.code < 500 or err.code >= 600:
885 # Examine the reported length
886 if (content_length is not None and
887 (resume_len - 100 < long(content_length) < resume_len + 100)):
888 # The file had already been fully downloaded.
889 # Explanation to the above condition: in issue #175 it was revealed that
890 # YouTube sometimes adds or removes a few bytes from the end of the file,
891 # changing the file size slightly and causing problems for some users. So
892 # I decided to implement a suggested change and consider the file
893 # completely downloaded if the file size differs less than 100 bytes from
894 # the one in the hard drive.
895 self.report_file_already_downloaded(filename)
896 self.try_rename(tmpfilename, filename)
899 # The length does not match, we start the download over
900 self.report_unable_to_resume()
906 self.report_retry(count, retries)
909 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Content-length from the (possibly ranged) response, plus what is on disk.
912 data_len = data.info().get('Content-length', None)
913 if data_len is not None:
914 data_len = long(data_len) + resume_len
915 data_len_str = self.format_bytes(data_len)
916 byte_counter = 0 + resume_len
922 data_block = data.read(block_size)
924 if len(data_block) == 0:
926 byte_counter += len(data_block)
928 # Open file just in time
931 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
932 filename = self.undo_temp_name(tmpfilename)
933 self.report_destination(filename)
934 except (OSError, IOError), err:
935 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
938 stream.write(data_block)
939 except (IOError, OSError), err:
940 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
942 block_size = self.best_block_size(after - before, len(data_block))
# Progress display; resumed bytes are excluded from speed/ETA.
945 percent_str = self.calc_percent(byte_counter, data_len)
946 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
947 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
948 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
951 self.slow_down(start, byte_counter - resume_len)
955 if data_len is not None and byte_counter != data_len:
956 raise ContentTooShortError(byte_counter, long(data_len))
957 self.try_rename(tmpfilename, filename)
959 # Update file modification time
960 if self.params.get('updatetime', True):
961 self.try_utime(filename, data.info().get('last-modified', None))
# NOTE(review): fragmentary listing — the suitable() def line and the
# _ready-guard lines in initialize() sit on elided lines.
965 class InfoExtractor(object):
966 """Information Extractor class.
968 Information extractors are the classes that, given a URL, extract
969 information from the video (or videos) the URL refers to. This
970 information includes the real video URL, the video title and simplified
971 title, author and others. The information is stored in a dictionary
972 which is then passed to the FileDownloader. The FileDownloader
973 processes this information possibly downloading the video to the file
974 system, among other possible outcomes. The dictionaries must include
975 the following fields:
977 id: Video identifier.
978 url: Final video URL.
979 uploader: Nickname of the video uploader.
980 title: Literal title.
981 stitle: Simplified title.
982 ext: Video filename extension.
983 format: Video format.
984 player_url: SWF Player URL (may be None).
986 The following fields are optional. Their primary purpose is to allow
987 youtube-dl to serve as the backend for a video search function, such
988 as the one in youtube2mp3. They are only used when their respective
989 forced printing functions are called:
991 thumbnail: Full URL to a video thumbnail image.
992 description: One-line video description.
994 Subclasses of this one should re-define the _real_initialize() and
995 _real_extract() methods, as well as the suitable() static method.
996 Probably, they should also be instantiated and added to the main
1003 def __init__(self, downloader=None):
1004 """Constructor. Receives an optional downloader."""
1006 self.set_downloader(downloader)
1010 """Receives a URL and returns True if suitable for this IE."""
# Template method: public initialize() delegates to subclass hook.
1013 def initialize(self):
1014 """Initializes an instance (authentication, etc)."""
1016 self._real_initialize()
1019 def extract(self, url):
1020 """Extracts URL information and returns it in list of dicts."""
1022 return self._real_extract(url)
def set_downloader(self, downloader):
    """Remember *downloader* as the FileDownloader this extractor reports to."""
    self._downloader = downloader
# Subclass hooks; their bodies (pass) sit on elided lines.
1028 def _real_initialize(self):
1029 """Real initialization process. Redefine in subclasses."""
1032 def _real_extract(self, url):
1033 """Real extraction process. Redefine in subclasses."""
1036 class YoutubeIE(InfoExtractor):
1037 """Information extractor for youtube.com."""
# Group 2 of this pattern is the 11-char-style video id; the (?(1)...)
# conditional requires extra text only when the URL prefix matched.
1039 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1040 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1041 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1042 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1043 _NETRC_MACHINE = 'youtube'
1044 # Listed in order of quality
1045 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1046 _video_extensions = {
1052 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# suitable(): def line elided in this listing.
1059 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
    """Log the interface-language-setting step."""
    msg = u'[youtube] Setting language'
    self._downloader.to_screen(msg)
def report_login(self):
    """Log the login attempt."""
    msg = u'[youtube] Logging in'
    self._downloader.to_screen(msg)
def report_age_confirmation(self):
    """Log the age-confirmation step."""
    msg = u'[youtube] Confirming age'
    self._downloader.to_screen(msg)
def report_video_webpage_download(self, video_id):
    """Log the start of the watch-page download for *video_id*."""
    msg = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(msg)
def report_video_info_webpage_download(self, video_id):
    """Log the start of the video-info page download for *video_id*."""
    msg = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(msg)
def report_information_extraction(self, video_id):
    """Log the start of metadata extraction for *video_id*."""
    msg = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(msg)
def report_unavailable_format(self, video_id, format):
    """Log that the requested format is not offered for this video."""
    msg = u'[youtube] %s: Format %s not available' % (video_id, format)
    self._downloader.to_screen(msg)
def report_rtmp_download(self):
    """Log that the download will go through the RTMP protocol."""
    msg = u'[youtube] RTMP download detected'
    self._downloader.to_screen(msg)
# NOTE(review): fragmentary listing — the username/password defaults, form
# dict openers and several try/return lines below sit on elided lines.
1093 def _real_initialize(self):
# Without a downloader there is nowhere to read params from (early return
# on an elided line).
1094 if self._downloader is None:
1099 downloader_params = self._downloader.params
1101 # Attempt to use provided username and password or .netrc data
1102 if downloader_params.get('username', None) is not None:
1103 username = downloader_params['username']
1104 password = downloader_params['password']
1105 elif downloader_params.get('usenetrc', False):
1107 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1108 if info is not None:
1112 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1113 except (IOError, netrc.NetrcParseError), err:
1114 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the interface language to English so later regexes match.
1118 request = urllib2.Request(self._LANG_URL)
1121 urllib2.urlopen(request).read()
1122 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1123 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1126 # No authentication to be performed
1127 if username is None:
1132 'current_form': 'loginForm',
1134 'action_login': 'Log In',
1135 'username': username,
1136 'password': password,
1138 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1141 login_results = urllib2.urlopen(request).read()
# The login form reappearing in the response means authentication failed.
1142 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1143 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1145 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1146 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age so age-restricted videos are reachable.
1152 'action_confirm': 'Confirm',
1154 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1156 self.report_age_confirmation()
1157 age_results = urllib2.urlopen(request).read()
1158 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1159 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Extract metadata + direct media URL(s) for one YouTube video and hand each
# chosen format to the downloader via process_info().
# NOTE(review): this numbered listing elides guard clauses / try / return
# lines from the original; the code lines below are kept byte-identical.
1162 def _real_extract(self, url):
1163 # Extract video id from URL
1164 mobj = re.match(self._VALID_URL, url)
1166 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1168 video_id = mobj.group(2)
1171 self.report_video_webpage_download(video_id)
1172 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1174 video_webpage = urllib2.urlopen(request).read()
1175 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1176 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1179 # Attempt to extract SWF player URL
1180 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1181 if mobj is not None:
# The page embeds the player URL with escaped slashes; unescape them.
1182 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' query values; stop as soon as a response carries 'token'.
1187 self.report_video_info_webpage_download(video_id)
1188 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1189 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1190 % (video_id, el_type))
1191 request = urllib2.Request(video_info_url)
1193 video_info_webpage = urllib2.urlopen(request).read()
1194 video_info = parse_qs(video_info_webpage)
1195 if 'token' in video_info:
1197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1198 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1200 if 'token' not in video_info:
1201 if 'reason' in video_info:
1202 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1204 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1207 # Start extracting information
1208 self.report_information_extraction(video_id)
1211 if 'author' not in video_info:
1212 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1214 video_uploader = urllib.unquote_plus(video_info['author'][0])
1217 if 'title' not in video_info:
1218 self._downloader.trouble(u'ERROR: unable to extract video title')
1220 video_title = urllib.unquote_plus(video_info['title'][0])
1221 video_title = video_title.decode('utf-8')
1222 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse anything outside simple_title_chars to '_'.
1225 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1226 simple_title = simple_title.strip(ur'_')
1229 if 'thumbnail_url' not in video_info:
1230 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1231 video_thumbnail = ''
1232 else: # don't panic if we can't find it
1233 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date scraped from the watch page. NOTE(review): the strptime
# patterns assume English month names — presumably fine after the hl=en
# request above, but locale-dependent; verify.
1237 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1238 if mobj is not None:
1239 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1240 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1241 for expression in format_expressions:
1243 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1251 video_description = u'No description available.'
1252 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1253 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1254 if mobj is not None:
1255 video_description = mobj.group(1).decode('utf-8')
1257 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1258 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1259 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1260 # TODO use another parser
1263 video_token = urllib.unquote_plus(video_info['token'][0])
1265 # Decide which formats to download
1266 req_format = self._downloader.params.get('format', None)
1268 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1269 self.report_rtmp_download()
1270 video_url_list = [(None, video_info['conn'][0])]
1271 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1272 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1273 url_data = [parse_qs(uds) for uds in url_data_strs]
1274 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1275 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# _available_formats is ordered best-first; a format_limit truncates the
# preference list from that point on.
1277 format_limit = self._downloader.params.get('format_limit', None)
1278 if format_limit is not None and format_limit in self._available_formats:
1279 format_list = self._available_formats[self._available_formats.index(format_limit):]
1281 format_list = self._available_formats
1282 existing_formats = [x for x in format_list if x in url_map]
1283 if len(existing_formats) == 0:
1284 self._downloader.trouble(u'ERROR: no known formats available for video')
1286 if req_format is None:
1287 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1288 elif req_format == '-1':
1289 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1292 if req_format not in url_map:
1293 self._downloader.trouble(u'ERROR: requested format not available')
1295 video_url_list = [(req_format, url_map[req_format])] # Specific format
1297 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1300 for format_param, video_real_url in video_url_list:
1301 # At this point we have a new video
1302 self._downloader.increment_downloads()
1305 video_extension = self._video_extensions.get(format_param, 'flv')
1308 # Process video information
1309 self._downloader.process_info({
1310 'id': video_id.decode('utf-8'),
1311 'url': video_real_url.decode('utf-8'),
1312 'uploader': video_uploader.decode('utf-8'),
1313 'upload_date': upload_date,
1314 'title': video_title,
1315 'stitle': simple_title,
1316 'ext': video_extension.decode('utf-8'),
# 'a and b or c' idiom: safe here only because u'NA' is truthy.
1317 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1318 'thumbnail': video_thumbnail.decode('utf-8'),
1319 'description': video_description,
1320 'player_url': player_url,
1322 except UnavailableVideoError, err:
1323 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered listing — digits at line heads are dump artifacts.
1326 class MetacafeIE(InfoExtractor):
1327 """Information Extractor for metacafe.com."""
# Watch pages look like http://www.metacafe.com/watch/<id>/<slug>/
1329 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Family-filter endpoints used during initialization to clear the
# content disclaimer / age gate.
1330 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1331 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
    """Set up the extractor, keeping a YouTube extractor for delegation."""
    self._youtube_ie = youtube_ie
    InfoExtractor.__init__(self, downloader)
1340 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Announce that the family-filter disclaimer page is being fetched."""
    message = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Announce the age-confirmation step."""
    to_screen = self._downloader.to_screen
    to_screen(u'[metacafe] Confirming age')
def report_download_webpage(self, video_id):
    """Announce that the watch page for *video_id* is being fetched."""
    announce = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(announce)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    to_screen = self._downloader.to_screen
    to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Clear Metacafe's content disclaimer / age gate before extraction.
# NOTE(review): this numbered listing elides several original lines
# (try statements, form dict header); code lines are kept byte-identical.
1358 def _real_initialize(self):
1359 # Retrieve disclaimer
1360 request = urllib2.Request(self._DISCLAIMER)
1362 self.report_disclaimer()
1363 disclaimer = urllib2.urlopen(request).read()
1364 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1365 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Post the "over 18" confirmation to the family-filter endpoint.
1371 'submit': "Continue - I'm over 18",
1373 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1375 self.report_age_confirmation()
1376 disclaimer = urllib2.urlopen(request).read()
1377 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1378 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Extract one Metacafe video; yt-prefixed ids are delegated to the YouTube
# extractor held in self._youtube_ie.
# NOTE(review): this numbered listing elides guard clauses / try / return
# lines from the original; code lines are kept byte-identical.
1381 def _real_extract(self, url):
1382 # Extract id and simplified title from URL
1383 mobj = re.match(self._VALID_URL, url)
1385 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1388 video_id = mobj.group(1)
1390 # Check if video comes from YouTube
1391 mobj2 = re.match(r'^yt-(.*)$', video_id)
1392 if mobj2 is not None:
# Hand off to the YouTube extractor and (presumably) stop here — the
# return after this call is elided in this listing; verify.
1393 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1396 # At this point we have a new video
1397 self._downloader.increment_downloads()
1399 simple_title = mobj.group(2).decode('utf-8')
1401 # Retrieve video webpage to extract further information
1402 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1404 self.report_download_webpage(video_id)
1405 webpage = urllib2.urlopen(request).read()
1406 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1407 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1410 # Extract URL, uploader and title from webpage
1411 self.report_extraction(video_id)
# Primary path: plain &mediaURL= parameter, optionally signed with gdaKey.
1412 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1413 if mobj is not None:
1414 mediaURL = urllib.unquote(mobj.group(1))
1415 video_extension = mediaURL[-3:]
1417 # Extract gdaKey if available
1418 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1420 video_url = mediaURL
1422 gdaKey = mobj.group(1)
1423 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaData JSON-ish fields.
1425 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1427 self._downloader.trouble(u'ERROR: unable to extract media URL')
1429 vardict = parse_qs(mobj.group(1))
1430 if 'mediaData' not in vardict:
1431 self._downloader.trouble(u'ERROR: unable to extract media URL')
1433 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1435 self._downloader.trouble(u'ERROR: unable to extract media URL')
1437 mediaURL = mobj.group(1).replace('\\/', '/')
1438 video_extension = mediaURL[-3:]
1439 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1441 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1443 self._downloader.trouble(u'ERROR: unable to extract title')
1445 video_title = mobj.group(1).decode('utf-8')
1446 video_title = sanitize_title(video_title)
1448 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1450 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1452 video_uploader = mobj.group(1)
1455 # Process video information
1456 self._downloader.process_info({
1457 'id': video_id.decode('utf-8'),
1458 'url': video_url.decode('utf-8'),
1459 'uploader': video_uploader.decode('utf-8'),
1460 'upload_date': u'NA',
1461 'title': video_title,
1462 'stitle': simple_title,
1463 'ext': video_extension.decode('utf-8'),
1467 except UnavailableVideoError:
1468 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered listing — digits at line heads are dump artifacts.
1471 class DailymotionIE(InfoExtractor):
1472 """Information Extractor for Dailymotion"""
# Matches e.g. http://www.dailymotion.com/video/<id>_<slug>; group(1) is the
# id, group(2) the slug used as the simplified title.
1474 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
def __init__(self, downloader=None):
    """Initialize the extractor with an optional downloader."""
    InfoExtractor.__init__(self, downloader)
1481 return (re.match(DailymotionIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the page for *video_id* is being fetched."""
    message = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    to_screen = self._downloader.to_screen
    to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1491 def _real_initialize(self):
# Extract one Dailymotion video: media URL from the player's addVariable
# call, title from <title>, uploader from the owner Attribute element.
# NOTE(review): this numbered listing elides guard clauses / try / return
# lines from the original; code lines are kept byte-identical.
1494 def _real_extract(self, url):
1495 # Extract id and simplified title from URL
1496 mobj = re.match(self._VALID_URL, url)
1498 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1501 # At this point we have a new video
1502 self._downloader.increment_downloads()
1503 video_id = mobj.group(1)
1505 simple_title = mobj.group(2).decode('utf-8')
1506 video_extension = 'flv'
1508 # Retrieve video webpage to extract further information
1509 request = urllib2.Request(url)
1511 self.report_download_webpage(video_id)
1512 webpage = urllib2.urlopen(request).read()
1513 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1514 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1517 # Extract URL, uploader and title from webpage
1518 self.report_extraction(video_id)
1519 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1521 self._downloader.trouble(u'ERROR: unable to extract media URL')
1523 mediaURL = urllib.unquote(mobj.group(1))
1525 # if needed add http://www.dailymotion.com/ if relative URL
1527 video_url = mediaURL
1529 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1530 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1532 self._downloader.trouble(u'ERROR: unable to extract title')
1534 video_title = mobj.group(1).decode('utf-8')
1535 video_title = sanitize_title(video_title)
1537 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1539 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1541 video_uploader = mobj.group(1)
1544 # Process video information
1545 self._downloader.process_info({
1546 'id': video_id.decode('utf-8'),
1547 'url': video_url.decode('utf-8'),
1548 'uploader': video_uploader.decode('utf-8'),
1549 'upload_date': u'NA',
1550 'title': video_title,
1551 'stitle': simple_title,
1552 'ext': video_extension.decode('utf-8'),
1556 except UnavailableVideoError:
1557 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered listing — digits at line heads are dump artifacts.
1560 class GoogleIE(InfoExtractor):
1561 """Information extractor for video.google.com."""
# Accepts the many Google Video country TLDs; group(1) is the docid.
1562 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
    """Initialize the extractor with an optional downloader."""
    InfoExtractor.__init__(self, downloader)
1569 return (re.match(GoogleIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the page for *video_id* is being fetched."""
    announce = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(announce)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    to_screen = self._downloader.to_screen
    to_screen(u'[video.google] %s: Extracting information' % video_id)
1579 def _real_initialize(self):
# Extract one Google Video entry: prefer the mp4 download_url, falling back
# to the flv videoUrl embedded with \x-escapes.
# NOTE(review): this numbered listing elides guard clauses / try / return
# lines from the original; code lines are kept byte-identical.
1582 def _real_extract(self, url):
1583 # Extract id from URL
1584 mobj = re.match(self._VALID_URL, url)
1586 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1589 # At this point we have a new video
1590 self._downloader.increment_downloads()
1591 video_id = mobj.group(1)
1593 video_extension = 'mp4'
1595 # Retrieve video webpage to extract further information
1596 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1598 self.report_download_webpage(video_id)
1599 webpage = urllib2.urlopen(request).read()
1600 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1601 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1604 # Extract URL, uploader, and title from webpage
1605 self.report_extraction(video_id)
1606 mobj = re.search(r"download_url:'([^']+)'", webpage)
# Fallback path when no mp4 download_url is present: flv player URL.
1608 video_extension = 'flv'
1609 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1611 self._downloader.trouble(u'ERROR: unable to extract media URL')
1613 mediaURL = urllib.unquote(mobj.group(1))
# Undo the page's JavaScript \x3d / \x26 escaping ('=' and '&').
1614 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1615 mediaURL = mediaURL.replace('\\x26', '\x26')
1617 video_url = mediaURL
1619 mobj = re.search(r'<title>(.*)</title>', webpage)
1621 self._downloader.trouble(u'ERROR: unable to extract title')
1623 video_title = mobj.group(1).decode('utf-8')
1624 video_title = sanitize_title(video_title)
1625 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1627 # Extract video description
1628 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1630 self._downloader.trouble(u'ERROR: unable to extract video description')
1632 video_description = mobj.group(1).decode('utf-8')
1633 if not video_description:
1634 video_description = 'No description available.'
1636 # Extract video thumbnail
1637 if self._downloader.params.get('forcethumbnail', False):
1638 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1640 webpage = urllib2.urlopen(request).read()
1641 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1642 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1644 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1646 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1648 video_thumbnail = mobj.group(1)
1649 else: # we need something to pass to process_info
1650 video_thumbnail = ''
1654 # Process video information
1655 self._downloader.process_info({
1656 'id': video_id.decode('utf-8'),
1657 'url': video_url.decode('utf-8'),
1659 'upload_date': u'NA',
1660 'title': video_title,
1661 'stitle': simple_title,
1662 'ext': video_extension.decode('utf-8'),
1666 except UnavailableVideoError:
1667 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered listing — digits at line heads are dump artifacts.
1670 class PhotobucketIE(InfoExtractor):
1671 """Information extractor for photobucket.com."""
# Only .flv media reachable via a ?current= query parameter are matched.
1673 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
def __init__(self, downloader=None):
    """Initialize the extractor with an optional downloader."""
    InfoExtractor.__init__(self, downloader)
1680 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the page for *video_id* is being fetched."""
    message = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    to_screen = self._downloader.to_screen
    to_screen(u'[photobucket] %s: Extracting information' % video_id)
1690 def _real_initialize(self):
# Extract one Photobucket flv: media URL from the video_src link element,
# title and uploader from the page <title>.
# NOTE(review): this numbered listing elides guard clauses / try / return
# lines from the original; code lines are kept byte-identical.
1693 def _real_extract(self, url):
1694 # Extract id from URL
1695 mobj = re.match(self._VALID_URL, url)
1697 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1700 # At this point we have a new video
1701 self._downloader.increment_downloads()
1702 video_id = mobj.group(1)
1704 video_extension = 'flv'
1706 # Retrieve video webpage to extract further information
1707 request = urllib2.Request(url)
1709 self.report_download_webpage(video_id)
1710 webpage = urllib2.urlopen(request).read()
1711 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1712 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1715 # Extract URL, uploader, and title from webpage
1716 self.report_extraction(video_id)
1717 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1719 self._downloader.trouble(u'ERROR: unable to extract media URL')
1721 mediaURL = urllib.unquote(mobj.group(1))
1723 video_url = mediaURL
1725 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1727 self._downloader.trouble(u'ERROR: unable to extract title')
1729 video_title = mobj.group(1).decode('utf-8')
1730 video_title = sanitize_title(video_title)
1731 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# Uploader is the second capture group of the same <title> regex.
1733 video_uploader = mobj.group(2).decode('utf-8')
1736 # Process video information
1737 self._downloader.process_info({
1738 'id': video_id.decode('utf-8'),
1739 'url': video_url.decode('utf-8'),
1740 'uploader': video_uploader,
1741 'upload_date': u'NA',
1742 'title': video_title,
1743 'stitle': simple_title,
1744 'ext': video_extension.decode('utf-8'),
1748 except UnavailableVideoError:
1749 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered listing — digits at line heads are dump artifacts.
1752 class YahooIE(InfoExtractor):
1753 """Information extractor for video.yahoo.com."""
1755 # _VALID_URL matches all Yahoo! Video URLs
1756 # _VPAGE_URL matches only the extractable '/watch/' URLs
1757 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1758 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
def __init__(self, downloader=None):
    """Initialize the extractor with an optional downloader."""
    InfoExtractor.__init__(self, downloader)
1765 return (re.match(YahooIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the page for *video_id* is being fetched."""
    announce = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(announce)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    to_screen = self._downloader.to_screen
    to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1775 def _real_initialize(self):
# Extract one Yahoo! Video entry. Non-/watch/ URLs are first rewritten to
# the canonical /watch/ form and re-extracted recursively (new_video=False).
# NOTE(review): this numbered listing elides guard clauses / try / return
# lines from the original; code lines are kept byte-identical.
1778 def _real_extract(self, url, new_video=True):
1779 # Extract ID from URL
1780 mobj = re.match(self._VALID_URL, url)
1782 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1785 # At this point we have a new video
1786 self._downloader.increment_downloads()
1787 video_id = mobj.group(2)
1788 video_extension = 'flv'
1790 # Rewrite valid but non-extractable URLs as
1791 # extractable English language /watch/ URLs
1792 if re.match(self._VPAGE_URL, url) is None:
1793 request = urllib2.Request(url)
1795 webpage = urllib2.urlopen(request).read()
1796 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1797 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1800 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1802 self._downloader.trouble(u'ERROR: Unable to extract id field')
1804 yahoo_id = mobj.group(1)
1806 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1808 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1810 yahoo_vid = mobj.group(1)
1812 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1813 return self._real_extract(url, new_video=False)
1815 # Retrieve video webpage to extract further information
1816 request = urllib2.Request(url)
1818 self.report_download_webpage(video_id)
1819 webpage = urllib2.urlopen(request).read()
1820 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1821 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1824 # Extract uploader and title from webpage
1825 self.report_extraction(video_id)
1826 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1828 self._downloader.trouble(u'ERROR: unable to extract video title')
1830 video_title = mobj.group(1).decode('utf-8')
1831 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1833 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1835 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the (people|profile) alternation, while the
# display name is group(2) — this looks like a bug; verify before relying on
# the 'uploader' field.
1837 video_uploader = mobj.group(1).decode('utf-8')
1839 # Extract video thumbnail
1840 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1842 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1844 video_thumbnail = mobj.group(1).decode('utf-8')
1846 # Extract video description
1847 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1849 self._downloader.trouble(u'ERROR: unable to extract video description')
1851 video_description = mobj.group(1).decode('utf-8')
1852 if not video_description: video_description = 'No description available.'
1854 # Extract video height and width
1855 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1857 self._downloader.trouble(u'ERROR: unable to extract video height')
1859 yv_video_height = mobj.group(1)
1861 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1863 self._downloader.trouble(u'ERROR: unable to extract video width')
1865 yv_video_width = mobj.group(1)
1867 # Retrieve video playlist to extract media URL
1868 # I'm not completely sure what all these options are, but we
1869 # seem to need most of them, otherwise the server sends a 401.
1870 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1871 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1872 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1873 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1874 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1876 self.report_download_webpage(video_id)
1877 webpage = urllib2.urlopen(request).read()
1878 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1879 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1882 # Extract media URL from playlist XML
1883 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1885 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1887 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1888 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1891 # Process video information
1892 self._downloader.process_info({
1893 'id': video_id.decode('utf-8'),
1895 'uploader': video_uploader,
1896 'upload_date': u'NA',
1897 'title': video_title,
1898 'stitle': simple_title,
1899 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later (undecoded) entries win in Python.
1900 'thumbnail': video_thumbnail.decode('utf-8'),
1901 'description': video_description,
1902 'thumbnail': video_thumbnail,
1903 'description': video_description,
1906 except UnavailableVideoError:
1907 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered listing — digits at line heads are dump artifacts.
1910 class VimeoIE(InfoExtractor):
1911 """Information extractor for vimeo.com."""
1913 # _VALID_URL matches Vimeo URLs
# group(1) is the numeric clip id; group/player subdomains are accepted.
1914 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
def __init__(self, downloader=None):
    """Initialize the extractor with an optional downloader."""
    InfoExtractor.__init__(self, downloader)
1921 return (re.match(VimeoIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the page for *video_id* is being fetched."""
    message = u'[vimeo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    to_screen = self._downloader.to_screen
    to_screen(u'[vimeo] %s: Extracting information' % video_id)
1931 def _real_initialize(self):
# Extract one Vimeo clip via the moogaloop XML endpoint, building the final
# play URL from the clip id plus request signature and expiry.
# NOTE(review): this numbered listing elides guard clauses / try / return
# lines from the original; code lines are kept byte-identical.
1934 def _real_extract(self, url, new_video=True):
1935 # Extract ID from URL
1936 mobj = re.match(self._VALID_URL, url)
1938 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1941 # At this point we have a new video
1942 self._downloader.increment_downloads()
1943 video_id = mobj.group(1)
1945 # Retrieve video webpage to extract further information
1946 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1948 self.report_download_webpage(video_id)
1949 webpage = urllib2.urlopen(request).read()
1950 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1951 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1954 # Now we begin extracting as much information as we can from what we
1955 # retrieved. First we extract the information common to all extractors,
1956 # and latter we extract those that are Vimeo specific.
1957 self.report_extraction(video_id)
1960 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1962 self._downloader.trouble(u'ERROR: unable to extract video title')
1964 video_title = mobj.group(1).decode('utf-8')
1965 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1968 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1970 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1972 video_uploader = mobj.group(1).decode('utf-8')
1974 # Extract video thumbnail
1975 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1977 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1979 video_thumbnail = mobj.group(1).decode('utf-8')
1981 # # Extract video description
1982 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1984 # self._downloader.trouble(u'ERROR: unable to extract video description')
1986 # video_description = mobj.group(1).decode('utf-8')
1987 # if not video_description: video_description = 'No description available.'
# Placeholder while the description extraction above stays commented out.
1988 video_description = 'Foo.'
1990 # Vimeo specific: extract request signature
1991 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1993 self._downloader.trouble(u'ERROR: unable to extract request signature')
1995 sig = mobj.group(1).decode('utf-8')
1997 # Vimeo specific: Extract request signature expiration
1998 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2000 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2002 sig_exp = mobj.group(1).decode('utf-8')
2004 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2007 # Process video information
2008 self._downloader.process_info({
2009 'id': video_id.decode('utf-8'),
2011 'uploader': video_uploader,
2012 'upload_date': u'NA',
2013 'title': video_title,
2014 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later (undecoded) entries win in Python.
2016 'thumbnail': video_thumbnail.decode('utf-8'),
2017 'description': video_description,
2018 'thumbnail': video_thumbnail,
2019 'description': video_description,
2022 except UnavailableVideoError:
2023 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): numbered listing — digits at line heads are dump artifacts.
2026 class GenericIE(InfoExtractor):
2027 """Generic last-resort information extractor."""
def __init__(self, downloader=None):
    """Initialize the extractor with an optional downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Warn that the generic extractor is in use, then report the download."""
    for line in (u'WARNING: Falling back on generic information extractor.',
                 u'[generic] %s: Downloading webpage' % video_id):
        self._downloader.to_screen(line)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    announce = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(announce)
2045 def _real_initialize(self):
# Last-resort extraction: scrape any page for a JW-Player-style file=http...
# parameter and treat the page's <title> and domain as title/uploader.
# NOTE(review): this numbered listing elides guard clauses / try / return
# lines from the original; code lines are kept byte-identical.
2048 def _real_extract(self, url):
2049 # At this point we have a new video
2050 self._downloader.increment_downloads()
2052 video_id = url.split('/')[-1]
2053 request = urllib2.Request(url)
2055 self.report_download_webpage(video_id)
2056 webpage = urllib2.urlopen(request).read()
2057 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2058 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2060 except ValueError, err:
2061 # since this is the last-resort InfoExtractor, if
2062 # this error is thrown, it'll be thrown here
2063 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2066 self.report_extraction(video_id)
2067 # Start with something easy: JW Player in SWFObject
2068 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2070 # Broaden the search a little bit
2071 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2073 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2076 # It's possible that one of the regexes
2077 # matched, but returned an empty group:
2078 if mobj.group(1) is None:
2079 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2082 video_url = urllib.unquote(mobj.group(1))
2083 video_id = os.path.basename(video_url)
2085 # here's a fun little line of code for you:
# Split the basename into extension (without the dot) and bare id.
2086 video_extension = os.path.splitext(video_id)[1][1:]
2087 video_id = os.path.splitext(video_id)[0]
2089 # it's tempting to parse this further, but you would
2090 # have to take into account all the variations like
2091 # Video Title - Site Name
2092 # Site Name | Video Title
2093 # Video Title - Tagline | Site Name
2094 # and so on and so forth; it's just not practical
2095 mobj = re.search(r'<title>(.*)</title>', webpage)
2097 self._downloader.trouble(u'ERROR: unable to extract title')
2099 video_title = mobj.group(1).decode('utf-8')
2100 video_title = sanitize_title(video_title)
2101 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2103 # video uploader is domain name
2104 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2106 self._downloader.trouble(u'ERROR: unable to extract title')
2108 video_uploader = mobj.group(1).decode('utf-8')
2111 # Process video information
2112 self._downloader.process_info({
2113 'id': video_id.decode('utf-8'),
2114 'url': video_url.decode('utf-8'),
2115 'uploader': video_uploader,
2116 'upload_date': u'NA',
2117 'title': video_title,
2118 'stitle': simple_title,
2119 'ext': video_extension.decode('utf-8'),
2123 except UnavailableVideoError, err:
2124 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles "ytsearchN:query" / "ytsearchall:query" pseudo-URLs by scraping
# YouTube result pages and delegating each hit to the wrapped YoutubeIE.
# NOTE(review): elided listing — guards/returns/try lines are missing between
# the numbered lines; confirm against the full source before editing.
2127 class YoutubeSearchIE(InfoExtractor):
2128 """Information Extractor for YouTube search queries."""
2129 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2130 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2131 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2132 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on results a single search may yield.
2134 _max_youtube_results = 1000
2136 def __init__(self, youtube_ie, downloader=None):
2137 InfoExtractor.__init__(self, downloader)
# Delegate extractor that performs the actual per-video download.
2138 self._youtube_ie = youtube_ie
2142 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2144 def report_download_page(self, query, pagenum):
2145 """Report attempt to download playlist page with given number."""
2146 query = query.decode(preferredencoding())
2147 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2149 def _real_initialize(self):
2150 self._youtube_ie.initialize()
2152 def _real_extract(self, query):
2153 mobj = re.match(self._VALID_QUERY, query)
2155 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the search terms.
2158 prefix, query = query.split(':')
2160 query = query.encode('utf-8')
# Empty prefix ("ytsearch:") means a single result.
2162 self._download_n_results(query, 1)
2164 elif prefix == 'all':
2165 self._download_n_results(query, self._max_youtube_results)
2171 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2173 elif n > self._max_youtube_results:
2174 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2175 n = self._max_youtube_results
2176 self._download_n_results(query, n)
2178 except ValueError: # parsing prefix as integer fails
2179 self._download_n_results(query, 1)
2182 def _download_n_results(self, query, n):
2183 """Downloads a specified number of results for a query"""
# Dedup set: result pages may repeat ids.
2186 already_seen = set()
2190 self.report_download_page(query, pagenum)
2191 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2192 request = urllib2.Request(result_url)
2194 page = urllib2.urlopen(request).read()
2195 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2196 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2199 # Extract video identifiers
2200 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href and pull the v= value, dropping the closing quote.
2201 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2202 if video_id not in already_seen:
2203 video_ids.append(video_id)
2204 already_seen.add(video_id)
2205 if len(video_ids) == n:
2206 # Specified n videos reached
2207 for id in video_ids:
2208 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: last page reached — flush what we have.
2211 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2212 for id in video_ids:
2213 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2216 pagenum = pagenum + 1
# Handles "gvsearchN:query" pseudo-URLs for Google Video; structurally a
# near-duplicate of YoutubeSearchIE with a different site template and
# delegate extractor. Same elision caveat: guard/return/try lines missing.
2218 class GoogleSearchIE(InfoExtractor):
2219 """Information Extractor for Google Video search queries."""
2220 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2221 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2222 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2223 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2225 _max_google_results = 1000
2227 def __init__(self, google_ie, downloader=None):
2228 InfoExtractor.__init__(self, downloader)
# Delegate extractor for individual Google Video pages.
2229 self._google_ie = google_ie
2233 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2235 def report_download_page(self, query, pagenum):
2236 """Report attempt to download playlist page with given number."""
2237 query = query.decode(preferredencoding())
2238 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2240 def _real_initialize(self):
2241 self._google_ie.initialize()
2243 def _real_extract(self, query):
2244 mobj = re.match(self._VALID_QUERY, query)
2246 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2249 prefix, query = query.split(':')
2251 query = query.encode('utf-8')
# Empty prefix: single result.
2253 self._download_n_results(query, 1)
2255 elif prefix == 'all':
2256 self._download_n_results(query, self._max_google_results)
2262 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2264 elif n > self._max_google_results:
2265 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2266 n = self._max_google_results
2267 self._download_n_results(query, n)
2269 except ValueError: # parsing prefix as integer fails
2270 self._download_n_results(query, 1)
2273 def _download_n_results(self, query, n):
2274 """Downloads a specified number of results for a query"""
2277 already_seen = set()
2281 self.report_download_page(query, pagenum)
2282 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2283 request = urllib2.Request(result_url)
2285 page = urllib2.urlopen(request).read()
2286 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2287 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2290 # Extract video identifiers
2291 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Unlike YoutubeSearchIE, the id comes straight from the capture group.
2292 video_id = mobj.group(1)
2293 if video_id not in already_seen:
2294 video_ids.append(video_id)
2295 already_seen.add(video_id)
2296 if len(video_ids) == n:
2297 # Specified n videos reached
2298 for id in video_ids:
2299 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker: last page — flush collected ids.
2302 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2303 for id in video_ids:
2304 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2307 pagenum = pagenum + 1
# Handles "yvsearchN:query" pseudo-URLs for Yahoo! Video; third copy of the
# search-IE template. Same elision caveat as the siblings above.
2309 class YahooSearchIE(InfoExtractor):
2310 """Information Extractor for Yahoo! Video search queries."""
2311 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2312 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2313 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
# NOTE(review): this pattern is very permissive — it matches any "Next"
# preceded by whitespace, anywhere on the page; presumably intentional,
# but verify against current markup.
2314 _MORE_PAGES_INDICATOR = r'\s*Next'
2316 _max_yahoo_results = 1000
2318 def __init__(self, yahoo_ie, downloader=None):
2319 InfoExtractor.__init__(self, downloader)
# Delegate extractor for individual Yahoo! Video pages.
2320 self._yahoo_ie = yahoo_ie
2324 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2326 def report_download_page(self, query, pagenum):
2327 """Report attempt to download playlist page with given number."""
2328 query = query.decode(preferredencoding())
2329 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2331 def _real_initialize(self):
2332 self._yahoo_ie.initialize()
2334 def _real_extract(self, query):
2335 mobj = re.match(self._VALID_QUERY, query)
2337 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2340 prefix, query = query.split(':')
2342 query = query.encode('utf-8')
# Empty prefix: single result.
2344 self._download_n_results(query, 1)
2346 elif prefix == 'all':
2347 self._download_n_results(query, self._max_yahoo_results)
2353 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2355 elif n > self._max_yahoo_results:
2356 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2357 n = self._max_yahoo_results
2358 self._download_n_results(query, n)
2360 except ValueError: # parsing prefix as integer fails
2361 self._download_n_results(query, 1)
2364 def _download_n_results(self, query, n):
2365 """Downloads a specified number of results for a query"""
2368 already_seen = set()
2372 self.report_download_page(query, pagenum)
2373 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2374 request = urllib2.Request(result_url)
2376 page = urllib2.urlopen(request).read()
2377 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2378 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2381 # Extract video identifiers
2382 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Yahoo ids are "num/num" path fragments captured by the indicator regex.
2383 video_id = mobj.group(1)
2384 if video_id not in already_seen:
2385 video_ids.append(video_id)
2386 already_seen.add(video_id)
2387 if len(video_ids) == n:
2388 # Specified n videos reached
2389 for id in video_ids:
2390 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No more pages: flush collected ids.
2393 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2394 for id in video_ids:
2395 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2398 pagenum = pagenum + 1
# Expands a YouTube playlist/artist/user-page URL into individual watch URLs
# and hands each to the wrapped YoutubeIE.
# NOTE(review): elided listing — guards/returns/try/else lines are missing
# between the numbered lines; confirm against the full source before editing.
2400 class YoutubePlaylistIE(InfoExtractor):
2401 """Information Extractor for YouTube playlists."""
# Group 1: playlist-type prefix ('p' or 'a'); group 2: playlist id;
# group 3 (optional): a single video id inside the playlist URL.
2403 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2404 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2405 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2406 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2409 def __init__(self, youtube_ie, downloader=None):
2410 InfoExtractor.__init__(self, downloader)
2411 self._youtube_ie = youtube_ie
2415 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2417 def report_download_page(self, playlist_id, pagenum):
2418 """Report attempt to download playlist page with given number."""
2419 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2421 def _real_initialize(self):
2422 self._youtube_ie.initialize()
2424 def _real_extract(self, url):
2425 # Extract playlist id
2426 mobj = re.match(self._VALID_URL, url)
2428 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL pointed at a single video within the playlist: extract just that one.
2432 if mobj.group(3) is not None:
2433 self._youtube_ie.extract(mobj.group(3))
2436 # Download playlist pages
2437 # prefix is 'p' as default for playlists but there are other types that need extra care
2438 playlist_prefix = mobj.group(1)
2439 if playlist_prefix == 'a':
2440 playlist_access = 'artist'
2442 playlist_prefix = 'p'
2443 playlist_access = 'view_play_list'
2444 playlist_id = mobj.group(2)
2449 self.report_download_page(playlist_id, pagenum)
2450 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2452 page = urllib2.urlopen(request).read()
2453 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2454 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2457 # Extract video identifiers
2459 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2460 if mobj.group(1) not in ids_in_page:
2461 ids_in_page.append(mobj.group(1))
2462 video_ids.extend(ids_in_page)
2464 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2466 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end (1-based options, 0-based slice).
2468 playliststart = self._downloader.params.get('playliststart', 1) - 1
2469 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend of -1 this slice drops the
# LAST video (xs[a:-1]); YoutubeUserIE below special-cases -1 — this code
# does not. Looks like a bug; left untouched in this doc-only pass.
2470 video_ids = video_ids[playliststart:playlistend]
2472 for id in video_ids:
2473 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Expands a YouTube user page (or "ytuser:name") into the user's uploads via
# the GData API, then delegates each video to the wrapped YoutubeIE.
# NOTE(review): elided listing — loop headers, guards and try lines are
# missing between the numbered lines; confirm against the full source.
2476 class YoutubeUserIE(InfoExtractor):
2477 """Information Extractor for YouTube users."""
2479 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2480 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; we page through in chunks of this size.
2481 _GDATA_PAGE_SIZE = 50
2482 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2483 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2486 def __init__(self, youtube_ie, downloader=None):
2487 InfoExtractor.__init__(self, downloader)
2488 self._youtube_ie = youtube_ie
2492 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2494 def report_download_page(self, username, start_index):
2495 """Report attempt to download user page."""
2496 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2497 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2499 def _real_initialize(self):
2500 self._youtube_ie.initialize()
2502 def _real_extract(self, url):
2504 mobj = re.match(self._VALID_URL, url)
2506 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2509 username = mobj.group(1)
2511 # Download video ids using YouTube Data API. Result size per
2512 # query is limited (currently to 50 videos) so we need to query
2513 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2520 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2521 self.report_download_page(username, start_index)
2523 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2526 page = urllib2.urlopen(request).read()
2527 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2528 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2531 # Extract video identifiers
2534 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2535 if mobj.group(1) not in ids_in_page:
2536 ids_in_page.append(mobj.group(1))
2538 video_ids.extend(ids_in_page)
2540 # A little optimization - if current page is not
2541 # "full", ie. does not contain PAGE_SIZE video ids then
2542 # we can assume that this page is the last one - there
2543 # are no more ids on further pages - no need to query
2546 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2551 all_ids_count = len(video_ids)
# Apply --playlist-start/--playlist-end; -1 (the default) means "to the end"
# and is handled explicitly so the last video is not sliced off.
2552 playliststart = self._downloader.params.get('playliststart', 1) - 1
2553 playlistend = self._downloader.params.get('playlistend', -1)
2555 if playlistend == -1:
2556 video_ids = video_ids[playliststart:]
2558 video_ids = video_ids[playliststart:playlistend]
2560 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2561 (username, all_ids_count, len(video_ids)))
2563 for video_id in video_ids:
2564 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extractor for depositfiles.com file-hosting links (arbitrary files, not
# just video). NOTE(review): elided listing — guards/returns/try lines are
# missing between the numbered lines; confirm against the full source.
2567 class DepositFilesIE(InfoExtractor):
2568 """Information extractor for depositfiles.com"""
# The (?#locale) is a regex comment; "../" skips a two-letter locale segment.
2570 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2572 def __init__(self, downloader=None):
2573 InfoExtractor.__init__(self, downloader)
2577 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2579 def report_download_webpage(self, file_id):
2580 """Report webpage download."""
2581 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2583 def report_extraction(self, file_id):
2584 """Report information extraction."""
2585 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2587 def _real_initialize(self):
2590 def _real_extract(self, url):
2591 # At this point we have a new file
2592 self._downloader.increment_downloads()
2594 file_id = url.split('/')[-1]
2595 # Rebuild url in english locale
2596 url = 'http://depositfiles.com/en/files/' + file_id
2598 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
2599 free_download_indication = { 'gateway_result' : '1' }
2600 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2602 self.report_download_webpage(file_id)
2603 webpage = urllib2.urlopen(request).read()
2604 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2605 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2608 # Search for the real file URL
2609 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2610 if (mobj is None) or (mobj.group(1) is None):
2611 # Try to figure out reason of the error.
2612 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2613 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse the site's multi-line restriction notice to one line.
2614 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2615 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2617 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2620 file_url = mobj.group(1)
2621 file_extension = os.path.splitext(file_url)[1][1:]
2623 # Search for file title
2624 mobj = re.search(r'<b title="(.*?)">', webpage)
2626 self._downloader.trouble(u'ERROR: unable to extract title')
2628 file_title = mobj.group(1).decode('utf-8')
2631 # Process file information
2632 self._downloader.process_info({
2633 'id': file_id.decode('utf-8'),
2634 'url': file_url.decode('utf-8'),
2636 'upload_date': u'NA',
2637 'title': file_title,
# 'stitle' reuses the raw title here (no simple-chars sanitization,
# unlike the video extractors).
2638 'stitle': file_title,
2639 'ext': file_extension.decode('utf-8'),
2643 except UnavailableVideoError, err:
2644 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for facebook.com video pages; optionally logs in with
# username/password or .netrc credentials before fetching.
# NOTE(review): elided listing — guards/returns/try/else lines are missing
# between the numbered lines; confirm against the full source before editing.
2646 class FacebookIE(InfoExtractor):
2647 """Information Extractor for Facebook"""
2649 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2650 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2651 _NETRC_MACHINE = 'facebook'
# Preference order: first entry is treated as best quality below.
2652 _available_formats = ['highqual', 'lowqual']
2653 _video_extensions = {
2658 def __init__(self, downloader=None):
2659 InfoExtractor.__init__(self, downloader)
2663 return (re.match(FacebookIE._VALID_URL, url) is not None)
2665 def _reporter(self, message):
2666 """Add header and report message."""
2667 self._downloader.to_screen(u'[facebook] %s' % message)
2669 def report_login(self):
2670 """Report attempt to log in."""
2671 self._reporter(u'Logging in')
2673 def report_video_webpage_download(self, video_id):
2674 """Report attempt to download video webpage."""
2675 self._reporter(u'%s: Downloading video webpage' % video_id)
2677 def report_information_extraction(self, video_id):
2678 """Report attempt to extract video information."""
2679 self._reporter(u'%s: Extracting video information' % video_id)
2681 def _parse_page(self, video_webpage):
2682 """Extract video information from page"""
# Map of info-dict key -> regex that captures it from the page HTML.
2684 data = {'title': r'class="video_title datawrap">(.*?)</',
2685 'description': r'<div class="datawrap">(.*?)</div>',
2686 'owner': r'\("video_owner_name", "(.*?)"\)',
2687 'upload_date': r'data-date="(.*?)"',
2688 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# Missing fields are simply absent from video_info; callers check membership.
2691 for piece in data.keys():
2692 mobj = re.search(data[piece], video_webpage)
2693 if mobj is not None:
2694 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one direct media URL per available quality level.
2698 for fmt in self._available_formats:
2699 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2700 if mobj is not None:
2701 # URL is in a Javascript segment inside an escaped Unicode format within
2702 # the generally utf-8 page
2703 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2704 video_info['video_urls'] = video_urls
2708 def _real_initialize(self):
2709 if self._downloader is None:
2714 downloader_params = self._downloader.params
2716 # Attempt to use provided username and password or .netrc data
2717 if downloader_params.get('username', None) is not None:
2718 useremail = downloader_params['username']
2719 password = downloader_params['password']
2720 elif downloader_params.get('usenetrc', False):
2722 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2723 if info is not None:
2727 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2728 except (IOError, netrc.NetrcParseError), err:
2729 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials at all: proceed without logging in (public videos only,
# presumably — elided lines return here; confirm against full source).
2732 if useremail is None:
2741 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2744 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means the login failed.
2745 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2746 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2748 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2749 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2752 def _real_extract(self, url):
2753 mobj = re.match(self._VALID_URL, url)
2755 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2757 video_id = mobj.group('ID')
2760 self.report_video_webpage_download(video_id)
2761 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2763 page = urllib2.urlopen(request)
2764 video_webpage = page.read()
2765 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2766 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2769 # Start extracting information
2770 self.report_information_extraction(video_id)
2772 # Extract information
2773 video_info = self._parse_page(video_webpage)
# uploader is mandatory
2776 if 'owner' not in video_info:
2777 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2779 video_uploader = video_info['owner']
# title is mandatory
2782 if 'title' not in video_info:
2783 self._downloader.trouble(u'ERROR: unable to extract video title')
2785 video_title = video_info['title']
2786 video_title = video_title.decode('utf-8')
2787 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse disallowed chars to '_' and trim.
2790 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2791 simple_title = simple_title.strip(ur'_')
# thumbnail is optional: warn and fall back to empty string.
2794 if 'thumbnail' not in video_info:
2795 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2796 video_thumbnail = ''
2798 video_thumbnail = video_info['thumbnail']
# upload date: parse RFC-2822-style date into YYYYMMDD when possible.
2802 if 'upload_date' in video_info:
2803 upload_time = video_info['upload_date']
2804 timetuple = email.utils.parsedate_tz(upload_time)
2805 if timetuple is not None:
2807 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2812 video_description = video_info.get('description', 'No description available.')
2814 url_map = video_info['video_urls']
2815 if len(url_map.keys()) > 0:
2816 # Decide which formats to download
2817 req_format = self._downloader.params.get('format', None)
2818 format_limit = self._downloader.params.get('format_limit', None)
# --max-quality: restrict the candidate list to formats at or below it.
2820 if format_limit is not None and format_limit in self._available_formats:
2821 format_list = self._available_formats[self._available_formats.index(format_limit):]
2823 format_list = self._available_formats
2824 existing_formats = [x for x in format_list if x in url_map]
2825 if len(existing_formats) == 0:
2826 self._downloader.trouble(u'ERROR: no known formats available for video')
2828 if req_format is None:
2829 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2830 elif req_format == '-1':
2831 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2834 if req_format not in url_map:
2835 self._downloader.trouble(u'ERROR: requested format not available')
2837 video_url_list = [(req_format, url_map[req_format])] # Specific format
2839 for format_param, video_real_url in video_url_list:
2841 # At this point we have a new video
2842 self._downloader.increment_downloads()
# Extension depends on the chosen format; default to mp4.
2845 video_extension = self._video_extensions.get(format_param, 'mp4')
2848 # Process video information
2849 self._downloader.process_info({
2850 'id': video_id.decode('utf-8'),
2851 'url': video_real_url.decode('utf-8'),
2852 'uploader': video_uploader.decode('utf-8'),
2853 'upload_date': upload_date,
2854 'title': video_title,
2855 'stitle': simple_title,
2856 'ext': video_extension.decode('utf-8'),
# Old-style "cond and a or b" conditional; works here because
# format_param.decode(...) is never falsy when format_param is not None.
2857 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2858 'thumbnail': video_thumbnail.decode('utf-8'),
2859 'description': video_description.decode('utf-8'),
2862 except UnavailableVideoError, err:
2863 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for blip.tv: fetches the JSON API representation of a page and
# reads all metadata from it (no HTML scraping).
# NOTE(review): elided listing — guards/returns/try lines are missing between
# the numbered lines; confirm against the full source before editing.
2865 class BlipTVIE(InfoExtractor):
2866 """Information extractor for blip.tv"""
2868 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Pulls the filename extension off the end of the media URL.
2869 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2873 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2875 def report_extraction(self, file_id):
2876 """Report information extraction."""
2877 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2879 def _simplify_title(self, title):
# Same filesystem-safe transformation used by the other extractors.
2880 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2881 res = res.strip(ur'_')
2884 def _real_extract(self, url):
2885 mobj = re.match(self._VALID_URL, url)
2887 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') is chosen by elided lines depending on whether the
# URL already has a query string.
2894 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2895 request = urllib2.Request(json_url)
2896 self.report_extraction(mobj.group(1))
2898 json_code = urllib2.urlopen(request).read()
2899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2900 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2903 json_data = json.loads(json_code)
# Some responses wrap the record in a 'Post' envelope.
2904 if 'Post' in json_data:
2905 data = json_data['Post']
# blip.tv datestamp example format: "04-21-11 02:15PM" -> YYYYMMDD.
2909 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2910 video_url = data['media']['url']
2911 umobj = re.match(self._URL_EXT, video_url)
2913 raise ValueError('Can not determine filename extension')
2914 ext = umobj.group(1)
2916 self._downloader.increment_downloads()
2919 'id': data['item_id'],
2921 'uploader': data['display_name'],
2922 'upload_date': upload_date,
2923 'title': data['title'],
2924 'stitle': self._simplify_title(data['title']),
2926 'format': data['media']['mimeType'],
2927 'thumbnail': data['thumbnailUrl'],
2928 'description': data['description'],
2929 'player_url': data['embedUrl']
# Both JSON and schema errors surface as one "unable to parse" message.
2931 except (ValueError,KeyError), err:
2932 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2936 self._downloader.process_info(info)
2937 except UnavailableVideoError, err:
2938 self._downloader.trouble(u'\nERROR: unable to download video')
# Base class for post-download processing steps (chain-of-responsibility).
2941 class PostProcessor(object):
2942 """Post Processor class.
2944 PostProcessor objects can be added to downloaders with their
2945 add_post_processor() method. When the downloader has finished a
2946 successful download, it will take its internal chain of PostProcessors
2947 and start calling the run() method on each one of them, first with
2948 an initial argument and then with the returned value of the previous
2951 The chain will be stopped if one of them ever returns None or the end
2952 of the chain is reached.
2954 PostProcessor objects follow a "mutual registration" process similar
2955 to InfoExtractor objects.
2960 def __init__(self, downloader=None):
2961 self._downloader = downloader
2963 def set_downloader(self, downloader):
2964 """Sets the downloader for this PP."""
2965 self._downloader = downloader
2967 def run(self, information):
2968 """Run the PostProcessor.
2970 The "information" argument is a dictionary like the ones
2971 composed by InfoExtractors. The only difference is that this
2972 one has an extra field called "filepath" that points to the
2975 When this method returns None, the postprocessing chain is
2976 stopped. However, this method may return an information
2977 dictionary that will be passed to the next postprocessing
2978 object in the chain. It can be the one it received after
2979 changing some fields.
2981 In addition, this method may raise a PostProcessingError
2982 exception that will be taken into account by the downloader
# Base implementation: identity pass-through.
2985 return information # by default, do nothing
# Post-processor that extracts the audio track of a downloaded video with
# ffmpeg/ffprobe, optionally transcoding to a preferred codec.
# NOTE(review): elided listing — try/return/else lines are missing between
# the numbered lines in get_audio_codec/run_ffmpeg/run; confirm against the
# full source before editing.
2987 class FFmpegExtractAudioPP(PostProcessor):
2989 def __init__(self, downloader=None, preferredcodec=None):
2990 PostProcessor.__init__(self, downloader)
# 'best' means: keep the source codec losslessly when it is aac/mp3.
2991 if preferredcodec is None:
2992 preferredcodec = 'best'
2993 self._preferredcodec = preferredcodec
2996 def get_audio_codec(path):
# Probe the file with ffprobe and return the audio stream's codec name
# (the elided lines return None on failure).
2998 cmd = ['ffprobe', '-show_streams', '--', path]
2999 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3000 output = handle.communicate()[0]
3001 if handle.wait() != 0:
3003 except (IOError, OSError):
# Scan ffprobe output: remember the last codec_name, accept it once a
# codec_type=audio line confirms it belongs to an audio stream.
3006 for line in output.split('\n'):
3007 if line.startswith('codec_name='):
3008 audio_codec = line.split('=')[1].strip()
3009 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3014 def run_ffmpeg(path, out_path, codec, more_opts):
# -vn drops video; '--' guards against option-like filenames.
3016 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3017 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3019 except (IOError, OSError):
3022 def run(self, information):
3023 path = information['filepath']
3025 filecodec = self.get_audio_codec(path)
3026 if filecodec is None:
3027 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3031 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3032 if filecodec == 'aac' or filecodec == 'mp3':
3033 # Lossless if possible
3035 extension = filecodec
3036 if filecodec == 'aac':
# Raw AAC needs an ADTS container to be playable standalone.
3037 more_opts = ['-f', 'adts']
3040 acodec = 'libmp3lame'
3042 more_opts = ['-ab', '128k']
3044 # We convert the audio (lossy)
3045 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3046 extension = self._preferredcodec
3047 more_opts = ['-ab', '128k']
3048 if self._preferredcodec == 'aac':
3049 more_opts += ['-f', 'adts']
3051 (prefix, ext) = os.path.splitext(path)
3052 new_path = prefix + '.' + extension
3053 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3054 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3057 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Best-effort removal of the original video file; failure only warns.
3062 except (IOError, OSError):
3063 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the extracted audio for the rest of the chain.
3066 information['filepath'] = new_path
# Self-update: overwrite this script in place with the copy at UPDATE_URL.
# NOTE(review): elided listing — the surrounding try/close lines are missing
# between the numbered lines; confirm against the full source before editing.
3070 def updateSelf(downloader, filename):
3071 ''' Update the program file with the latest version from the repository '''
3072 # Note: downloader only used for options
# Fail fast if we cannot write our own file.
3073 if not os.access(filename, os.W_OK):
3074 sys.exit('ERROR: no write permissions on %s' % filename)
3076 downloader.to_screen('Updating to latest version...')
3080 urlh = urllib.urlopen(UPDATE_URL)
3081 newcontent = urlh.read()
3084 except (IOError, OSError), err:
3085 sys.exit('ERROR: unable to download latest version')
# 'wb' keeps the downloaded bytes as-is (no newline translation).
3088 outf = open(filename, 'wb')
3090 outf.write(newcontent)
3093 except (IOError, OSError), err:
3094 sys.exit('ERROR: unable to overwrite current version')
3096 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
3103 def _format_option_string(option):
3104 ''' ('-o', '--option') -> -o, --format METAVAR'''
3108 if option._short_opts: opts.append(option._short_opts[0])
3109 if option._long_opts: opts.append(option._long_opts[0])
3110 if len(opts) > 1: opts.insert(1, ', ')
3112 if option.takes_value(): opts.append(' %s' % option.metavar)
3114 return "".join(opts)
3116 def _find_term_columns():
3117 columns = os.environ.get('COLUMNS', None)
3122 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3123 out,err = sp.communicate()
3124 return int(out.split()[1])
3130 max_help_position = 80
3132 # No need to wrap help messages if we're on a wide console
3133 columns = _find_term_columns()
3134 if columns: max_width = columns
3136 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3137 fmt.format_option_strings = _format_option_string
3140 'version' : __version__,
3142 'usage' : '%prog [options] url...',
3143 'conflict_handler' : 'resolve',
3146 parser = optparse.OptionParser(**kw)
3149 general = optparse.OptionGroup(parser, 'General Options')
3150 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3151 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3152 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3153 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3154 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3156 general.add_option('-h', '--help',
3157 action='help', help='print this help text and exit')
3158 general.add_option('-v', '--version',
3159 action='version', help='print program version and exit')
3160 general.add_option('-U', '--update',
3161 action='store_true', dest='update_self', help='update this program to latest stable version')
3162 general.add_option('-i', '--ignore-errors',
3163 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3164 general.add_option('-r', '--rate-limit',
3165 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3166 general.add_option('-R', '--retries',
3167 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3168 general.add_option('--playlist-start',
3169 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3170 general.add_option('--playlist-end',
3171 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3172 general.add_option('--dump-user-agent',
3173 action='store_true', dest='dump_user_agent',
3174 help='display the current browser identification', default=False)
3176 authentication.add_option('-u', '--username',
3177 dest='username', metavar='USERNAME', help='account username')
3178 authentication.add_option('-p', '--password',
3179 dest='password', metavar='PASSWORD', help='account password')
3180 authentication.add_option('-n', '--netrc',
3181 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3184 video_format.add_option('-f', '--format',
3185 action='store', dest='format', metavar='FORMAT', help='video format code')
3186 video_format.add_option('--all-formats',
3187 action='store_const', dest='format', help='download all available video formats', const='-1')
3188 video_format.add_option('--max-quality',
3189 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3192 verbosity.add_option('-q', '--quiet',
3193 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3194 verbosity.add_option('-s', '--simulate',
3195 action='store_true', dest='simulate', help='do not download video', default=False)
3196 verbosity.add_option('-g', '--get-url',
3197 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3198 verbosity.add_option('-e', '--get-title',
3199 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3200 verbosity.add_option('--get-thumbnail',
3201 action='store_true', dest='getthumbnail',
3202 help='simulate, quiet but print thumbnail URL', default=False)
3203 verbosity.add_option('--get-description',
3204 action='store_true', dest='getdescription',
3205 help='simulate, quiet but print video description', default=False)
3206 verbosity.add_option('--get-filename',
3207 action='store_true', dest='getfilename',
3208 help='simulate, quiet but print output filename', default=False)
3209 verbosity.add_option('--no-progress',
3210 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3211 verbosity.add_option('--console-title',
3212 action='store_true', dest='consoletitle',
3213 help='display progress in console titlebar', default=False)
3216 filesystem.add_option('-t', '--title',
3217 action='store_true', dest='usetitle', help='use title in file name', default=False)
3218 filesystem.add_option('-l', '--literal',
3219 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3220 filesystem.add_option('-A', '--auto-number',
3221 action='store_true', dest='autonumber',
3222 help='number downloaded files starting from 00000', default=False)
3223 filesystem.add_option('-o', '--output',
3224 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3225 filesystem.add_option('-a', '--batch-file',
3226 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3227 filesystem.add_option('-w', '--no-overwrites',
3228 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3229 filesystem.add_option('-c', '--continue',
3230 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3231 filesystem.add_option('--cookies',
3232 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3233 filesystem.add_option('--no-part',
3234 action='store_true', dest='nopart', help='do not use .part files', default=False)
3235 filesystem.add_option('--no-mtime',
3236 action='store_false', dest='updatetime',
3237 help='do not use the Last-modified header to set the file modification time', default=True)
3238 filesystem.add_option('--write-description',
3239 action='store_true', dest='writedescription',
3240 help='write video description to a .description file', default=False)
3241 filesystem.add_option('--write-info-json',
3242 action='store_true', dest='writeinfojson',
3243 help='write video metadata to a .info.json file', default=False)
3246 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3247 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3248 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3249 help='"best", "aac" or "mp3"; best by default')
3252 parser.add_option_group(general)
3253 parser.add_option_group(filesystem)
3254 parser.add_option_group(verbosity)
3255 parser.add_option_group(video_format)
3256 parser.add_option_group(authentication)
3257 parser.add_option_group(postproc)
3259 opts, args = parser.parse_args()
3261 return parser, opts, args
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
else:
	try:
		jar = cookielib.MozillaCookieJar(opts.cookiefile)
		# Only try to load an existing, readable cookie file.
		if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
			jar.load()
	except (IOError, OSError):
		sys.exit(u'ERROR: unable to open cookie file')

# Dump user agent
if opts.dump_user_agent:
	print(std_headers['User-Agent'])
	sys.exit(0)

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# Batch file verification
batchurls = []
if opts.batchfile is not None:
	try:
		if opts.batchfile == '-':
			batchfd = sys.stdin
		else:
			batchfd = open(opts.batchfile, 'r')
		batchurls = batchfd.readlines()
		batchurls = [x.strip() for x in batchurls]
		# Skip blank lines and comment lines (starting with #, / or ;).
		batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
	except IOError:
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
	try:
		opts.retries = long(opts.retries)
	except (TypeError, ValueError):
		parser.error(u'invalid retry count specified')
try:
	opts.playliststart = int(opts.playliststart)
	if opts.playliststart <= 0:
		raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError):
	parser.error(u'invalid playlist start number specified')
try:
	opts.playlistend = int(opts.playlistend)
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
		raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError):
	parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
	if opts.audioformat not in ['best', 'aac', 'mp3']:
		parser.error(u'invalid audio format specified')

# Information extractors
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
bliptv_ie = BlipTVIE()
vimeo_ie = VimeoIE()
generic_ie = GenericIE()

# File downloader
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'forcefilename': opts.getfilename,
	'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'format': opts.format,
	'format_limit': opts.format_limit,
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	'logtostderr': opts.outtmpl == '-',
	'consoletitle': opts.consoletitle,
	'nopart': opts.nopart,
	'updatetime': opts.updatetime,
	'writedescription': opts.writedescription,
	'writeinfojson': opts.writeinfojson,
	})
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)
fd.add_info_extractor(bliptv_ie)
fd.add_info_extractor(vimeo_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# PostProcessors
if opts.extractaudio:
	fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

# Update version
if opts.update_self:
	updateSelf(fd, sys.argv[0])

# Maybe do nothing
if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')
	else:
		sys.exit()

retcode = fd.download(all_urls)

# Dump cookie jar if requested
if opts.cookiefile is not None:
	try:
		jar.save()
	except (IOError, OSError):
		sys.exit(u'ERROR: unable to save cookie jar')
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# FileDownloader already reported the error; just signal failure.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3452 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: