2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
15 __license__ = 'Public Domain'
16 __version__ = '2011.08.28-phihag'
44 except ImportError: # Python 2.4
47 import cStringIO as StringIO
51 # parse_qs was moved from the cgi module to the urlparse module recently.
53 from urlparse import parse_qs
55 from cgi import parse_qs
63 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
64 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
65 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
66 'Accept-Encoding': 'gzip, deflate',
67 'Accept-Language': 'en-us,en;q=0.5',
70 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
74 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
80 def raiseError(msg, i):
81 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
82 def skipSpace(i, expectMore=True):
83 while i < len(s) and s[i] in ' \t\r\n':
87 raiseError('Premature end', i)
89 def decodeEscape(match):
105 return unichr(int(esc[1:5], 16))
106 if len(esc) == 5+6 and esc[5:7] == '\\u':
107 hi = int(esc[1:5], 16)
108 low = int(esc[7:11], 16)
109 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
110 raise ValueError('Unknown escape ' + str(esc))
117 while s[e-bslashes-1] == '\\':
119 if bslashes % 2 == 1:
123 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
124 stri = rexp.sub(decodeEscape, s[i:e])
130 if s[i] == '}': # Empty dictionary
134 raiseError('Expected a string object key', i)
135 i,key = parseString(i)
137 if i >= len(s) or s[i] != ':':
138 raiseError('Expected a colon', i)
145 raiseError('Expected comma or closing curly brace', i)
150 if s[i] == ']': # Empty array
155 i = skipSpace(i) # Raise exception if premature end
159 raiseError('Expected a comma or closing bracket', i)
161 def parseDiscrete(i):
162 for k,v in {'true': True, 'false': False, 'null': None}.items():
163 if s.startswith(k, i):
165 raiseError('Not a boolean (or null)', i)
167 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
169 raiseError('Not a number', i)
171 if '.' in nums or 'e' in nums or 'E' in nums:
172 return (i+len(nums), float(nums))
173 return (i+len(nums), int(nums))
174 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
177 i,res = CHARMAP.get(s[i], parseNumber)(i)
178 i = skipSpace(i, False)
182 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
185 def preferredencoding():
186 """Get preferred encoding.
188 Returns the best encoding scheme for the system, based on
189 locale.getpreferredencoding() and some further tweaks.
191 def yield_preferredencoding():
193 pref = locale.getpreferredencoding()
199 return yield_preferredencoding().next()
201 def htmlentity_transform(matchobj):
202 """Transforms an HTML entity to a Unicode character.
204 This function receives a match object and is intended to be used with
205 the re.sub() function.
207 entity = matchobj.group(1)
209 # Known non-numeric HTML entity
210 if entity in htmlentitydefs.name2codepoint:
211 return unichr(htmlentitydefs.name2codepoint[entity])
214 mobj = re.match(ur'(?u)#(x?\d+)', entity)
216 numstr = mobj.group(1)
217 if numstr.startswith(u'x'):
219 numstr = u'0%s' % numstr
222 return unichr(long(numstr, base))
224 # Unknown entity in name, return its literal representation
225 return (u'&%s;' % entity)
227 def sanitize_title(utitle):
228 """Sanitizes a video title so it could be used as part of a filename."""
229 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
230 return utitle.replace(unicode(os.sep), u'%')
232 def sanitize_open(filename, open_mode):
233 """Try to open the given filename, and slightly tweak it if this fails.
235 Attempts to open the given filename. If this fails, it tries to change
236 the filename slightly, step by step, until it's either able to open it
237 or it fails and raises a final exception, like the standard open()
240 It returns the tuple (stream, definitive_file_name).
244 if sys.platform == 'win32':
246 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
247 return (sys.stdout, filename)
248 stream = open(filename, open_mode)
249 return (stream, filename)
250 except (IOError, OSError), err:
251 # In case of error, try to remove win32 forbidden chars
252 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
254 # An exception here should be caught in the caller
255 stream = open(filename, open_mode)
256 return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns the POSIX timestamp as an integer-like number, or None when
	the string cannot be parsed as an RFC 2822 date.
	"""
	timestamp = None
	# parsedate_tz returns None on unparseable input instead of raising.
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""

	def __init__(self, downloaded, expected):
		# Both attributes are byte counts kept for the caller's error report.
		self.downloaded = downloaded
		self.expected = expected
314 class YoutubeDLHandler(urllib2.HTTPHandler):
315 """Handler for HTTP requests and responses.
317 This class, when installed with an OpenerDirector, automatically adds
318 the standard headers to every HTTP request and handles gzipped and
319 deflated responses from web servers. If compression is to be avoided in
320 a particular request, the original request in the program code only has
321 to include the HTTP header "Youtubedl-No-Compression", which will be
322 removed before making the real request.
324 Part of this code was copied from:
326 http://techknack.net/python-urllib2-handlers/
328 Andrew Rowls, the author of that code, agreed to release it to the
335 return zlib.decompress(data, -zlib.MAX_WBITS)
337 return zlib.decompress(data)
340 def addinfourl_wrapper(stream, headers, url, code):
341 if hasattr(urllib2.addinfourl, 'getcode'):
342 return urllib2.addinfourl(stream, headers, url, code)
343 ret = urllib2.addinfourl(stream, headers, url)
347 def http_request(self, req):
348 for h in std_headers:
351 req.add_header(h, std_headers[h])
352 if 'Youtubedl-no-compression' in req.headers:
353 if 'Accept-encoding' in req.headers:
354 del req.headers['Accept-encoding']
355 del req.headers['Youtubedl-no-compression']
358 def http_response(self, req, resp):
361 if resp.headers.get('Content-encoding', '') == 'gzip':
362 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
363 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
364 resp.msg = old_resp.msg
366 if resp.headers.get('Content-encoding', '') == 'deflate':
367 gz = StringIO.StringIO(self.deflate(resp.read()))
368 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
369 resp.msg = old_resp.msg
372 class FileDownloader(object):
373 """File Downloader class.
375 File downloader objects are the ones responsible of downloading the
376 actual video file and writing it to disk if the user has requested
377 it, among some other tasks. In most cases there should be one per
378 program. As, given a video URL, the downloader doesn't know how to
379 extract all the needed information, task that InfoExtractors do, it
380 has to pass the URL to one of them.
382 For this, file downloader objects have a method that allows
383 InfoExtractors to be registered in a given order. When it is passed
384 a URL, the file downloader handles it to the first InfoExtractor it
385 finds that reports being able to handle it. The InfoExtractor extracts
386 all the information about the video or videos the URL refers to, and
387 asks the FileDownloader to process the video information, possibly
388 downloading the video.
390 File downloaders accept a lot of parameters. In order not to saturate
391 the object constructor with arguments, it receives a dictionary of
392 options instead. These options are available through the params
393 attribute for the InfoExtractors to use. The FileDownloader also
394 registers itself as the downloader in charge for the InfoExtractors
395 that are added to it, so this is a "mutual registration".
399 username: Username for authentication purposes.
400 password: Password for authentication purposes.
401 usenetrc: Use netrc for authentication instead.
402 quiet: Do not print messages to stdout.
403 forceurl: Force printing final URL.
404 forcetitle: Force printing title.
405 forcethumbnail: Force printing thumbnail URL.
406 forcedescription: Force printing description.
407 forcefilename: Force printing final filename.
408 simulate: Do not download the video files.
409 format: Video format code.
410 format_limit: Highest quality format to try.
411 outtmpl: Template for output names.
412 ignoreerrors: Do not stop on download errors.
413 ratelimit: Download speed limit, in bytes/sec.
414 nooverwrites: Prevent overwriting files.
415 retries: Number of times to retry for HTTP error 5xx
416 continuedl: Try to continue downloads if possible.
417 noprogress: Do not print the progress bar.
418 playliststart: Playlist item to start at.
419 playlistend: Playlist item to end at.
420 logtostderr: Log messages to stderr instead of stdout.
421 consoletitle: Display progress in console window's titlebar.
422 nopart: Do not use temporary .part files.
423 updatetime: Use the Last-modified header to set output file timestamps.
424 writedescription: Write the video description to a .description file
425 writeinfojson: Write the video description to a .info.json file
431 _download_retcode = None
432 _num_downloads = None
435 def __init__(self, params):
436 """Create a FileDownloader object with the given options."""
439 self._download_retcode = 0
440 self._num_downloads = 0
441 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
445 def pmkdir(filename):
446 """Create directory components in filename. Similar to Unix "mkdir -p"."""
447 components = filename.split(os.sep)
448 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
449 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
450 for dir in aggregate:
451 if not os.path.exists(dir):
455 def format_bytes(bytes):
458 if type(bytes) is str:
463 exponent = long(math.log(bytes, 1024.0))
464 suffix = 'bkMGTPEZY'[exponent]
465 converted = float(bytes) / float(1024**exponent)
466 return '%.2f%s' % (converted, suffix)
469 def calc_percent(byte_counter, data_len):
472 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
475 def calc_eta(start, now, total, current):
479 if current == 0 or dif < 0.001: # One millisecond
481 rate = float(current) / dif
482 eta = long((float(total) - float(current)) / rate)
483 (eta_mins, eta_secs) = divmod(eta, 60)
486 return '%02d:%02d' % (eta_mins, eta_secs)
489 def calc_speed(start, now, bytes):
491 if bytes == 0 or dif < 0.001: # One millisecond
492 return '%10s' % '---b/s'
493 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
496 def best_block_size(elapsed_time, bytes):
497 new_min = max(bytes / 2.0, 1.0)
498 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
499 if elapsed_time < 0.001:
501 rate = bytes / elapsed_time
509 def parse_bytes(bytestr):
510 """Parse a string indicating a byte quantity into a long integer."""
511 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
514 number = float(matchobj.group(1))
515 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
516 return long(round(number * multiplier))
518 def add_info_extractor(self, ie):
519 """Add an InfoExtractor object to the end of the list."""
521 ie.set_downloader(self)
523 def add_post_processor(self, pp):
524 """Add a PostProcessor object to the end of the chain."""
526 pp.set_downloader(self)
528 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
529 """Print message to stdout if not in quiet mode."""
531 if not self.params.get('quiet', False):
532 terminator = [u'\n', u''][skip_eol]
533 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
534 self._screen_file.flush()
535 except (UnicodeEncodeError), err:
536 if not ignore_encoding_errors:
539 def to_stderr(self, message):
540 """Print message to stderr."""
541 print >>sys.stderr, message.encode(preferredencoding())
543 def to_cons_title(self, message):
544 """Set console/terminal window title to message."""
545 if not self.params.get('consoletitle', False):
547 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
548 # c_wchar_p() might not be necessary if `message` is
549 # already of type unicode()
550 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
551 elif 'TERM' in os.environ:
552 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
554 def fixed_template(self):
555 """Checks if the output template is fixed."""
556 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
558 def trouble(self, message=None):
559 """Determine action to take when a download problem appears.
561 Depending on if the downloader has been configured to ignore
562 download errors or not, this method may throw an exception or
563 not when errors are found, after printing the message.
565 if message is not None:
566 self.to_stderr(message)
567 if not self.params.get('ignoreerrors', False):
568 raise DownloadError(message)
569 self._download_retcode = 1
571 def slow_down(self, start_time, byte_counter):
572 """Sleep if the download speed is over the rate limit."""
573 rate_limit = self.params.get('ratelimit', None)
574 if rate_limit is None or byte_counter == 0:
577 elapsed = now - start_time
580 speed = float(byte_counter) / elapsed
581 if speed > rate_limit:
582 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
584 def temp_name(self, filename):
585 """Returns a temporary filename for the given filename."""
586 if self.params.get('nopart', False) or filename == u'-' or \
587 (os.path.exists(filename) and not os.path.isfile(filename)):
589 return filename + u'.part'
591 def undo_temp_name(self, filename):
592 if filename.endswith(u'.part'):
593 return filename[:-len(u'.part')]
596 def try_rename(self, old_filename, new_filename):
598 if old_filename == new_filename:
600 os.rename(old_filename, new_filename)
601 except (IOError, OSError), err:
602 self.trouble(u'ERROR: unable to rename file')
604 def try_utime(self, filename, last_modified_hdr):
605 """Try to set the last-modified time of the given file."""
606 if last_modified_hdr is None:
608 if not os.path.isfile(filename):
610 timestr = last_modified_hdr
613 filetime = timeconvert(timestr)
617 os.utime(filename,(time.time(), filetime))
621 def report_writedescription(self, descfn):
622 """ Report that the description file is being written """
623 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
625 def report_writeinfojson(self, infofn):
626 """ Report that the metadata file has been written """
627 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
629 def report_destination(self, filename):
630 """Report destination filename."""
631 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
633 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
634 """Report download progress."""
635 if self.params.get('noprogress', False):
637 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
638 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
639 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
640 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
642 def report_resuming_byte(self, resume_len):
643 """Report attempt to resume at given byte."""
644 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
646 def report_retry(self, count, retries):
647 """Report retry in case of HTTP error 5xx"""
648 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
650 def report_file_already_downloaded(self, file_name):
651 """Report file has already been fully downloaded."""
653 self.to_screen(u'[download] %s has already been downloaded' % file_name)
654 except (UnicodeEncodeError), err:
655 self.to_screen(u'[download] The file has already been downloaded')
657 def report_unable_to_resume(self):
658 """Report it was impossible to resume download."""
659 self.to_screen(u'[download] Unable to resume')
661 def report_finish(self):
662 """Report download finished."""
663 if self.params.get('noprogress', False):
664 self.to_screen(u'[download] Download completed')
668 def increment_downloads(self):
669 """Increment the ordinal that assigns a number to each file."""
670 self._num_downloads += 1
672 def prepare_filename(self, info_dict):
673 """Generate the output filename."""
675 template_dict = dict(info_dict)
676 template_dict['epoch'] = unicode(long(time.time()))
677 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
678 filename = self.params['outtmpl'] % template_dict
680 except (ValueError, KeyError), err:
681 self.trouble(u'ERROR: invalid system charset or erroneous output template')
684 def process_info(self, info_dict):
685 """Process a single dictionary returned by an InfoExtractor."""
686 filename = self.prepare_filename(info_dict)
687 # Do nothing else if in simulate mode
688 if self.params.get('simulate', False):
690 if self.params.get('forcetitle', False):
691 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
692 if self.params.get('forceurl', False):
693 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
694 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
695 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
696 if self.params.get('forcedescription', False) and 'description' in info_dict:
697 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
698 if self.params.get('forcefilename', False) and filename is not None:
699 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('nooverwrites', False) and os.path.exists(filename):
706 self.to_stderr(u'WARNING: file exists and will be skipped')
710 self.pmkdir(filename)
711 except (OSError, IOError), err:
712 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
715 if self.params.get('writedescription', False):
717 descfn = filename + '.description'
718 self.report_writedescription(descfn)
719 descfile = open(descfn, 'wb')
721 descfile.write(info_dict['description'].encode('utf-8'))
724 except (OSError, IOError):
725 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
728 if self.params.get('writeinfojson', False):
729 infofn = filename + '.info.json'
730 self.report_writeinfojson(infofn)
733 except (NameError,AttributeError):
734 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
737 infof = open(infofn, 'wb')
739 json.dump(info_dict, infof)
742 except (OSError, IOError):
743 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
747 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
748 except (OSError, IOError), err:
749 raise UnavailableVideoError
750 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
751 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
753 except (ContentTooShortError, ), err:
754 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
759 self.post_process(filename, info_dict)
760 except (PostProcessingError), err:
761 self.trouble(u'ERROR: postprocessing: %s' % str(err))
764 def download(self, url_list):
765 """Download a given list of URLs."""
766 if len(url_list) > 1 and self.fixed_template():
767 raise SameFileError(self.params['outtmpl'])
770 suitable_found = False
772 # Go to next InfoExtractor if not suitable
773 if not ie.suitable(url):
776 # Suitable InfoExtractor found
777 suitable_found = True
779 # Extract information from URL and process it
782 # Suitable InfoExtractor had been found; go to next URL
785 if not suitable_found:
786 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
788 return self._download_retcode
790 def post_process(self, filename, ie_info):
791 """Run the postprocessing chain on the given file."""
793 info['filepath'] = filename
799 def _download_with_rtmpdump(self, filename, url, player_url):
800 self.report_destination(filename)
801 tmpfilename = self.temp_name(filename)
803 # Check for rtmpdump first
805 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
806 except (OSError, IOError):
807 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
810 # Download using rtmpdump. rtmpdump returns exit code 2 when
811 # the connection was interrumpted and resuming appears to be
812 # possible. This is part of rtmpdump's normal usage, AFAIK.
813 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
814 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
815 while retval == 2 or retval == 1:
816 prevsize = os.path.getsize(tmpfilename)
817 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
818 time.sleep(5.0) # This seems to be needed
819 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
820 cursize = os.path.getsize(tmpfilename)
821 if prevsize == cursize and retval == 1:
824 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
825 self.try_rename(tmpfilename, filename)
828 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
831 def _do_download(self, filename, url, player_url):
832 # Check file already present
833 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
834 self.report_file_already_downloaded(filename)
837 # Attempt to download using rtmpdump
838 if url.startswith('rtmp'):
839 return self._download_with_rtmpdump(filename, url, player_url)
841 tmpfilename = self.temp_name(filename)
845 # Do not include the Accept-Encoding header
846 headers = {'Youtubedl-no-compression': 'True'}
847 basic_request = urllib2.Request(url, None, headers)
848 request = urllib2.Request(url, None, headers)
850 # Establish possible resume length
851 if os.path.isfile(tmpfilename):
852 resume_len = os.path.getsize(tmpfilename)
856 # Request parameters in case of being able to resume
857 if self.params.get('continuedl', False) and resume_len != 0:
858 self.report_resuming_byte(resume_len)
859 request.add_header('Range','bytes=%d-' % resume_len)
863 retries = self.params.get('retries', 0)
864 while count <= retries:
865 # Establish connection
867 data = urllib2.urlopen(request)
869 except (urllib2.HTTPError, ), err:
870 if (err.code < 500 or err.code >= 600) and err.code != 416:
871 # Unexpected HTTP error
873 elif err.code == 416:
874 # Unable to resume (requested range not satisfiable)
876 # Open the connection again without the range header
877 data = urllib2.urlopen(basic_request)
878 content_length = data.info()['Content-Length']
879 except (urllib2.HTTPError, ), err:
880 if err.code < 500 or err.code >= 600:
883 # Examine the reported length
884 if (content_length is not None and
885 (resume_len - 100 < long(content_length) < resume_len + 100)):
886 # The file had already been fully downloaded.
887 # Explanation to the above condition: in issue #175 it was revealed that
888 # YouTube sometimes adds or removes a few bytes from the end of the file,
889 # changing the file size slightly and causing problems for some users. So
890 # I decided to implement a suggested change and consider the file
891 # completely downloaded if the file size differs less than 100 bytes from
892 # the one in the hard drive.
893 self.report_file_already_downloaded(filename)
894 self.try_rename(tmpfilename, filename)
897 # The length does not match, we start the download over
898 self.report_unable_to_resume()
904 self.report_retry(count, retries)
907 self.trouble(u'ERROR: giving up after %s retries' % retries)
910 data_len = data.info().get('Content-length', None)
911 if data_len is not None:
912 data_len = long(data_len) + resume_len
913 data_len_str = self.format_bytes(data_len)
914 byte_counter = 0 + resume_len
920 data_block = data.read(block_size)
922 if len(data_block) == 0:
924 byte_counter += len(data_block)
926 # Open file just in time
929 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
930 filename = self.undo_temp_name(tmpfilename)
931 self.report_destination(filename)
932 except (OSError, IOError), err:
933 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
936 stream.write(data_block)
937 except (IOError, OSError), err:
938 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
940 block_size = self.best_block_size(after - before, len(data_block))
943 percent_str = self.calc_percent(byte_counter, data_len)
944 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
945 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
946 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
949 self.slow_down(start, byte_counter - resume_len)
953 if data_len is not None and byte_counter != data_len:
954 raise ContentTooShortError(byte_counter, long(data_len))
955 self.try_rename(tmpfilename, filename)
957 # Update file modification time
958 if self.params.get('updatetime', True):
959 self.try_utime(filename, data.info().get('last-modified', None))
963 class InfoExtractor(object):
964 """Information Extractor class.
966 Information extractors are the classes that, given a URL, extract
967 information from the video (or videos) the URL refers to. This
968 information includes the real video URL, the video title and simplified
969 title, author and others. The information is stored in a dictionary
970 which is then passed to the FileDownloader. The FileDownloader
971 processes this information possibly downloading the video to the file
972 system, among other possible outcomes. The dictionaries must include
973 the following fields:
975 id: Video identifier.
976 url: Final video URL.
977 uploader: Nickname of the video uploader.
978 title: Literal title.
979 stitle: Simplified title.
980 ext: Video filename extension.
981 format: Video format.
982 player_url: SWF Player URL (may be None).
984 The following fields are optional. Their primary purpose is to allow
985 youtube-dl to serve as the backend for a video search function, such
986 as the one in youtube2mp3. They are only used when their respective
987 forced printing functions are called:
989 thumbnail: Full URL to a video thumbnail image.
990 description: One-line video description.
992 Subclasses of this one should re-define the _real_initialize() and
993 _real_extract() methods, as well as the suitable() static method.
994 Probably, they should also be instantiated and added to the main
1001 def __init__(self, downloader=None):
1002 """Constructor. Receives an optional downloader."""
1004 self.set_downloader(downloader)
1008 """Receives a URL and returns True if suitable for this IE."""
1011 def initialize(self):
1012 """Initializes an instance (authentication, etc)."""
1014 self._real_initialize()
1017 def extract(self, url):
1018 """Extracts URL information and returns it in list of dicts."""
1020 return self._real_extract(url)
1022 def set_downloader(self, downloader):
1023 """Sets the downloader for this IE."""
1024 self._downloader = downloader
1026 def _real_initialize(self):
1027 """Real initialization process. Redefine in subclasses."""
1030 def _real_extract(self, url):
1031 """Real extraction process. Redefine in subclasses."""
1034 class YoutubeIE(InfoExtractor):
1035 """Information extractor for youtube.com."""
1037 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1038 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1039 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1040 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1041 _NETRC_MACHINE = 'youtube'
1042 # Listed in order of quality
1043 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
1044 _video_extensions = {
1050 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1057 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1059 def report_lang(self):
1060 """Report attempt to set language."""
1061 self._downloader.to_screen(u'[youtube] Setting language')
1063 def report_login(self):
1064 """Report attempt to log in."""
1065 self._downloader.to_screen(u'[youtube] Logging in')
1067 def report_age_confirmation(self):
1068 """Report attempt to confirm age."""
1069 self._downloader.to_screen(u'[youtube] Confirming age')
1071 def report_video_webpage_download(self, video_id):
1072 """Report attempt to download video webpage."""
1073 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1075 def report_video_info_webpage_download(self, video_id):
1076 """Report attempt to download video info webpage."""
1077 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1079 def report_information_extraction(self, video_id):
1080 """Report attempt to extract video information."""
1081 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1083 def report_unavailable_format(self, video_id, format):
1084 """Report extracted video URL."""
1085 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1087 def report_rtmp_download(self):
1088 """Indicate the download will use the RTMP protocol."""
1089 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _real_initialize(self):
        """Set YouTube's interface language and, if credentials are available
        (explicit params or ~/.netrc), log in and confirm age.

        NOTE(review): several structural lines (try:/else:/return and the
        initial username/password setup) appear elided from this chunk; the
        statements below are reproduced verbatim.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # Fall back to credentials stored for _NETRC_MACHINE in ~/.netrc
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): this raise presumably belongs to an elided
                # else-branch (no .netrc entry found) -- confirm against the
                # full source before relying on this structure.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # A broken .netrc is only a warning; continue unauthenticated
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language so pages come back in English for the regexes below;
        # failure is non-fatal.
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        if username is None:

        # POST the login form with the collected credentials
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                # Being served the login form again means the login failed
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age by POSTing the age-gate form; unlike the steps above,
        # failure here is treated as an error.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract metadata and download URL(s) for one YouTube video and
        hand each selected format to the downloader via process_info().

        NOTE(review): several structural lines (try:/else:/return/break and
        `if mobj is None:` guards) appear elided from this chunk; the
        statements below are reproduced verbatim.
        """
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (gl/hl pin US-English; has_verified skips
        # some age-interstitial pages)
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo JavaScript backslash-escaping (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several get_video_info variants until one yields a 'token'
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname (get_video_info 'author' field)
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: collapse runs of non-simple chars into '_'
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Thumbnail image (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the watch page and normalized to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description (only fetched when the user asked for it)
        video_description = u'No description available.'
        if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1).decode('utf-8')
                # NOTE(review): the lxml path below looks like an alternative
                # branch (writedescription) whose if/else lines are elided.
                html_parser = lxml.etree.HTMLParser(encoding='utf-8')
                vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
                video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
                # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: a single (format=None, url) pair
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> url map from the comma-separated stream map
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            # Optionally cap quality at format_limit
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension: mapped from the itag, defaulting to flv
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): structural lines (try:/else:/return/`if mobj is None:`
    # guards and the `def suitable` header) appear elided from this chunk;
    # all statements are reproduced verbatim.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        # youtube_ie: delegate extractor used for metacafe 'yt-…' ids that
        # are really YouTube videos
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form so
        age-restricted videos become accessible."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age by submitting the filter form
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract download info for one metacafe video (or delegate to the
        YouTube extractor for embedded YouTube ids)."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Direct media URL present in the page
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                video_url = mediaURL
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: parse the flashvars blob for mediaData
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Undo JSON escaping of slashes
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): structural lines (try:/return/`if mobj is None:` guards
    # and the `def suitable` header) appear elided from this chunk; all
    # statements are reproduced verbatim.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed for Dailymotion (elided pass/return body)

    def _real_extract(self, url):
        """Extract download info for one Dailymotion video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # NOTE(review): structural lines (try:/`if mobj is None:` guards and the
    # `def suitable` header) appear elided from this chunk; all statements
    # are reproduced verbatim.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url):
        """Extract download info for one Google Video entry."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
            # mp4 link missing: fall back to the flv stream URL
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = urllib.unquote(mobj.group(1))
            # Undo the JS hex-escapes for '=' and '&'
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail (requires a second request, so only done
        # when the user asked for it)
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else:   # we need something to pass to process_info
            video_thumbnail = ''

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): structural lines (try:/`if mobj is None:` guards and the
    # `def suitable` header) appear elided from this chunk; all statements
    # are reproduced verbatim.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url):
        """Extract download info for one Photobucket flv."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> pattern
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): structural lines (try:/return/`if mobj is None:` guards
    # and the `def suitable` header) appear elided from this chunk; all
    # statements are reproduced verbatim.
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(YahooIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url, new_video=True):
        """Extract download info for one Yahoo! video; non-/watch/ URLs are
        rewritten to the canonical /watch/ form and re-extracted once
        (new_video=False on the recursive call)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) here is the literal 'people'/'profile' path
        # segment; the display name looks like group(2) -- confirm before use.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description: video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # Decode HTML entities (&amp; etc.) left in the URL
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

            # Process video information
            # NOTE(review): 'thumbnail' and 'description' appear twice in
            # this dict literal; the later (un-decoded) entries win.
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'thumbnail': video_thumbnail,
                'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # NOTE(review): structural lines (try:/`if mobj is None:` guards and the
    # `def suitable` header) appear elided from this chunk; all statements
    # are reproduced verbatim.
    # _VALID_URL matches Vimeo URLs
    # NOTE(review): the '.' in '(?:(?:www|player).)?' is unescaped and
    # matches any character -- probably intended as '\.'.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of an elided `def suitable(url)` -- URL match test
        return (re.match(VimeoIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url, new_video=True):
        """Extract download info for one Vimeo clip via the moogaloop XML."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Retrieve video webpage (moogaloop clip XML) to extract further info
        request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        mobj = re.search(r'<caption>(.*?)</caption>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # # Extract video description
        # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
        #     self._downloader.trouble(u'ERROR: unable to extract video description')
        # video_description = mobj.group(1).decode('utf-8')
        # if not video_description: video_description = 'No description available.'
        video_description = 'Foo.'

        # Vimeo specific: extract request signature
        mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract request signature')
        sig = mobj.group(1).decode('utf-8')

        # Vimeo specific: Extract request signature expiration
        mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
        sig_exp = mobj.group(1).decode('utf-8')

        video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)

            # Process video information
            # NOTE(review): 'thumbnail' and 'description' appear twice in
            # this dict literal; the later (un-decoded) entries win.
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'thumbnail': video_thumbnail,
                'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): structural lines (try:/return/`if mobj is None:` guards)
    # appear elided from this chunk; all statements are reproduced verbatim.
    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (elided pass/return body)

    def _real_extract(self, url):
        """Best-effort extraction for arbitrary pages: look for a JW Player /
        SWFObject 'file=' flashvar and fall back to a broader search."""
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Provisional id: last path component of the URL
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            # NOTE(review): this error message ('unable to extract title')
            # looks copy-pasted; the branch actually extracts the uploader.
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries.

	Handles "ytsearchN:term" / "ytsearchall:term" pseudo-URLs by scraping
	the HTML result pages and delegating each found video to the
	wrapped YoutubeIE instance.
	"""
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	# Hard cap on results; "ytsearchall" and oversized N are clamped to this.
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		"""Parse the ytsearch prefix and dispatch to _download_n_results."""
		mobj = re.match(self._VALID_QUERY, query)
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
			# No numeric prefix: download first result only.
			self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
				elif n > self._max_youtube_results:
					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
					n = self._max_youtube_results
				self._download_n_results(query, n)
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		# De-duplicate ids across result pages.
		already_seen = set()

			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# Slice the matched href and pull the v= parameter value.
				video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No "Next" link: exhausted all results before reaching n.
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

			pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries.

	Handles "gvsearchN:term" / "gvsearchall:term" pseudo-URLs; mirrors
	YoutubeSearchIE but delegates hits to the wrapped GoogleIE.
	"""
	_VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
	# Hard cap on results; "gvsearchall" and oversized N are clamped to this.
	_max_google_results = 1000

	def __init__(self, google_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._google_ie = google_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._google_ie.initialize()

	def _real_extract(self, query):
		"""Parse the gvsearch prefix and dispatch to _download_n_results."""
		mobj = re.match(self._VALID_QUERY, query)
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
			# No numeric prefix: download first result only.
			self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_google_results)
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
				elif n > self._max_google_results:
					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
					n = self._max_google_results
				self._download_n_results(query, n)
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		# De-duplicate ids across result pages.
		already_seen = set()

			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No "Next" link: exhausted all results before reaching n.
				for id in video_ids:
					self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

			pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries.

	Handles "yvsearchN:term" / "yvsearchall:term" pseudo-URLs; mirrors
	YoutubeSearchIE but delegates hits to the wrapped YahooIE.
	"""
	_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	_MORE_PAGES_INDICATOR = r'\s*Next'
	# Hard cap on results; "yvsearchall" and oversized N are clamped to this.
	_max_yahoo_results = 1000

	def __init__(self, yahoo_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._yahoo_ie.initialize()

	def _real_extract(self, query):
		"""Parse the yvsearch prefix and dispatch to _download_n_results."""
		mobj = re.match(self._VALID_QUERY, query)
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
			# No numeric prefix: download first result only.
			self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
				self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
				elif n > self._max_yahoo_results:
					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
					n = self._max_yahoo_results
				self._download_n_results(query, n)
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		# De-duplicate ids across result pages.
		already_seen = set()

			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No "Next" link: exhausted all results before reaching n.
				for id in video_ids:
					self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

			pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists.

	Walks every page of a playlist / artist / user-channel listing,
	collects video ids, applies --playlist-start/--playlist-end, and
	hands each video to the wrapped YoutubeIE.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

	def report_download_page(self, playlist_id, pagenum):
		"""Report attempt to download playlist page with given number."""
		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		# Group 3 is a single video id within the playlist URL:
		# extract just that video instead of the whole list.
		if mobj.group(3) is not None:
			self._youtube_ie.extract(mobj.group(3))

		# Download playlist pages
		# prefix is 'p' as default for playlists but there are other types that need extra care
		playlist_prefix = mobj.group(1)
		if playlist_prefix == 'a':
			playlist_access = 'artist'
			playlist_prefix = 'p'
			playlist_access = 'view_play_list'
		playlist_id = mobj.group(2)

			self.report_download_page(playlist_id, pagenum)
			request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))
			video_ids.extend(ids_in_page)

			# Stop paging when no "Next" link is present.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:

			pagenum = pagenum + 1

		# Apply the user's --playlist-start/--playlist-end window
		# (playliststart is 1-based on the command line, 0-based here).
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Pages through the GData uploads feed of a user (50 ids per request),
	applies --playlist-start/--playlist-end, and delegates each video to
	the wrapped YoutubeIE.
	"""

	_VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# GData caps results per request; we page with start-index.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got

			# GData start-index is 1-based.
			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
			self.report_download_page(username, start_index)

			request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				if mobj.group(1) not in ids_in_page:
					ids_in_page.append(mobj.group(1))

			video_ids.extend(ids_in_page)

			# A little optimization - if current page is not
			# "full", ie. does not contain PAGE_SIZE video ids then
			# we can assume that this page is the last one - there
			# are no more ids on further pages - no need to query

			if len(ids_in_page) < self._GDATA_PAGE_SIZE:

		all_ids_count = len(video_ids)
		# playliststart is 1-based on the command line, 0-based here.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		if playlistend == -1:
			video_ids = video_ids[playliststart:]
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com

	Simulates pressing the 'Free download' button and scrapes the real
	fileshare URL (or the site's restriction message) from the response.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(DepositFilesIE._VALID_URL, url) is not None)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_initialize(self):

	def _real_extract(self, url):
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
			self.report_download_webpage(file_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				# Collapse whitespace in the site's restriction message.
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

		file_url = mobj.group(1)
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
			self._downloader.trouble(u'ERROR: unable to extract title')
		file_title = mobj.group(1).decode('utf-8')

		# Process file information
		self._downloader.process_info({
			'id':		file_id.decode('utf-8'),
			'url':		file_url.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	file_title,
			'stitle':	file_title,
			'ext':		file_extension.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook.

	Requires login (command-line credentials or .netrc under the
	'facebook' machine). Video metadata and format URLs are scraped from
	Javascript segments embedded in the video page.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Ordered best-first; format selection below relies on this order.
	_available_formats = ['highqual', 'lowqual']
	_video_extensions = {

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(FacebookIE._VALID_URL, url) is not None)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# Regexes for each metadata field we try to scrape.
		data = {'title': r'class="video_title datawrap">(.*?)</',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'upload_date': r'data-date="(.*?)"',
			'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

	def _real_initialize(self):
		"""Log in to Facebook using --username/--password or .netrc."""
		if self._downloader is None:

		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		if useremail is None:

		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
			login_results = urllib2.urlopen(request).read()
			# A login form in the response means the login failed.
			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group('ID')

		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
			page = urllib2.urlopen(request)
			video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# Collapse any run of non-"simple" chars into a single underscore.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image: missing thumbnail is only a warning, not fatal
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
			video_thumbnail = video_info['thumbnail']

		# upload date: RFC 2822 date string converted to YYYYMMDD
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
					upload_date = time.strftime('%Y%m%d', timetuple[0:9])

		# description
		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		for format_param, video_real_url in video_url_list:
				# At this point we have a new video
				self._downloader.increment_downloads()

				# Extension is inferred from the format name; default mp4.
				video_extension = self._video_extensions.get(format_param, 'mp4')

				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv

	Fetches the video's JSON description (skin=json API) instead of
	scraping HTML, then builds the info dict from the 'Post' record.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	_URL_EXT = r'^.*\.([a-z0-9]+)$'

		# NOTE(review): this return presumably belongs to suitable(url) — confirm in full file.
		return (re.match(BlipTVIE._VALID_URL, url) is not None)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

	def _simplify_title(self, title):
		"""Collapse non-"simple" characters into underscores."""
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# Ask blip.tv for a JSON description of the post.
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
			json_code = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

			json_data = json.loads(json_code)
			if 'Post' in json_data:
				data = json_data['Post']

			# Convert blip.tv's "MM-DD-YY HH:MM(am|pm)" stamp to YYYYMMDD.
			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			umobj = re.match(self._URL_EXT, video_url)
				raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)

			self._downloader.increment_downloads()

				'id': data['item_id'],
				'uploader': data['display_name'],
				'upload_date': upload_date,
				'title': data['title'],
				'stitle': self._simplify_title(data['title']),
				'format': data['media']['mimeType'],
				'thumbnail': data['thumbnailUrl'],
				'description': data['description'],
				'player_url': data['embedUrl']
		except (ValueError,KeyError), err:
			self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		"""
		return information # by default, do nothing
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded
	video into a standalone file using ffmpeg/ffprobe.

	preferredcodec is 'best' (keep aac/mp3 losslessly when possible,
	otherwise re-encode to mp3) or an explicit target codec name.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec

	def get_audio_codec(path):
		"""Return the audio codec name of the file at path via ffprobe,
		or None if ffprobe is unavailable or fails."""
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
		except (IOError, OSError):
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:

	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to transcode path -> out_path with the given audio
		codec and extra options; audio only (-vn)."""
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
		except (IOError, OSError):

	def run(self, information):
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')

		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable.
					more_opts = ['-f', 'adts']
				acodec = 'libmp3lame'
				more_opts = ['-ab', '128k']
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		# Write the audio next to the video, swapping the extension.
		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

			self._downloader.to_stderr(u'WARNING: error running ffmpeg')

		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

		# Point the chain at the extracted audio file.
		information['filepath'] = new_path
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest stable version...')

		# LATEST_VERSION names the tag whose script we should fetch.
		latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
		latest_version = urllib.urlopen(latest_url).read().strip()
		prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
		newcontent = urllib.urlopen(prog_url).read()
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to download latest version')

		# Overwrite this very script in place.
		stream = open(filename, 'wb')
		stream.write(newcontent)
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to overwrite current version')

	downloader.to_screen('Updated to version %s' % latest_version)
3098 def _format_option_string(option):
3099 ''' ('-o', '--option') -> -o, --format METAVAR'''
3103 if option._short_opts: opts.append(option._short_opts[0])
3104 if option._long_opts: opts.append(option._long_opts[0])
3105 if len(opts) > 1: opts.insert(1, ', ')
3107 if option.takes_value(): opts.append(' %s' % option.metavar)
3109 return "".join(opts)
3111 def _find_term_columns():
3112 columns = os.environ.get('COLUMNS', None)
3117 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3118 out,err = sp.communicate()
3119 return int(out.split()[1])
# Help-formatter geometry; max_width is assigned in lines elided from
# this excerpt (default 80).
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Replace optparse's default option rendering with the compact
# "-o, --option METAVAR" form.
fmt.format_option_strings = _format_option_string

# OptionParser keyword arguments; the surrounding "kw = {" / "}" lines
# of this dict literal are elided from this excerpt.
'version' : __version__,
'usage' : '%prog [options] url...',
'conflict_handler' : 'resolve',

parser = optparse.OptionParser(**kw)
# One OptionGroup per help section; they are attached to the parser
# further down, and that attachment order controls --help output order.
general = optparse.OptionGroup(parser, 'General Options')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# General options. Note: ratelimit, retries, playliststart and
# playlistend arrive as strings/raw values and are validated and
# converted after parse_args(), not here.
general.add_option('-h', '--help',
action='help', help='print this help text and exit')
general.add_option('-v', '--version',
action='version', help='print program version and exit')
general.add_option('-U', '--update',
action='store_true', dest='update_self', help='update this program to latest stable version')
general.add_option('-i', '--ignore-errors',
action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--playlist-start',
dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
general.add_option('--playlist-end',
dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
general.add_option('--dump-user-agent',
action='store_true', dest='dump_user_agent',
help='display the current browser identification', default=False)
# Authentication options; username/password/netrc conflicts are
# rejected after parsing (see the validation section of main).
authentication.add_option('-u', '--username',
dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# Video format options. --all-formats stores the sentinel '-1' into the
# same dest as -f; downstream code treats '-1' as "every format".
video_format.add_option('-f', '--format',
action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
action='store_const', dest='format', help='download all available video formats', const='-1')
video_format.add_option('--max-quality',
action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# Verbosity / simulation options. Every --get-* flag implies both
# simulate and quiet when the FileDownloader config is built later.
verbosity.add_option('-q', '--quiet',
action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
action='store_true', dest='simulate', help='do not download video', default=False)
verbosity.add_option('-g', '--get-url',
action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
action='store_true', dest='getthumbnail',
help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
action='store_true', dest='getdescription',
help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
action='store_true', dest='getfilename',
help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--no-progress',
action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
action='store_true', dest='consoletitle',
help='display progress in console titlebar', default=False)
# Filesystem options. -t/-l/-A and -o are mutually constrained; those
# conflicts are reported via parser.error() after parsing.
filesystem.add_option('-t', '--title',
action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
action='store_true', dest='autonumber',
help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
filesystem.add_option('--cookies',
dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
filesystem.add_option('--no-part',
action='store_true', dest='nopart', help='do not use .part files', default=False)
# Note: store_false with default=True -- updatetime stays True unless
# --no-mtime is given.
filesystem.add_option('--no-mtime',
action='store_false', dest='updatetime',
help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
action='store_true', dest='writedescription',
help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
action='store_true', dest='writeinfojson',
help='write video metadata to a .info.json file', default=False)
# Post-processing options; audioformat is validated against
# ['best', 'aac', 'mp3'] after parsing.
postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
help='"best", "aac" or "mp3"; best by default')
# Attach the groups; this order is the order of sections in --help.
parser.add_option_group(general)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

opts, args = parser.parse_args()

# The parser itself is returned too so callers can use parser.error().
return parser, opts, args
# Parse the command line first; everything below configures globals
# from the resulting opts.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
jar = cookielib.CookieJar()
# Persistent Mozilla-format jar when --cookies FILE was given; the
# try:/else: framing lines are elided from this excerpt.
jar = cookielib.MozillaCookieJar(opts.cookiefile)
# Only load when the file already exists and is readable; a missing
# file is fine -- it will be created when the jar is dumped at exit.
if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
except (IOError, OSError), err:
sys.exit(u'ERROR: unable to open cookie file')

# --dump-user-agent: print the UA string (presumably exits right
# after, in lines elided here -- verify against the full source).
if opts.dump_user_agent:
print std_headers['User-Agent']

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
# Install a global opener so every urllib2 request in the program gets
# proxy handling, the cookie jar, and the custom YoutubeDLHandler.
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# Batch file verification
# (the surrounding try:/else: lines are elided from this excerpt)
if opts.batchfile is not None:
if opts.batchfile == '-':
batchfd = open(opts.batchfile, 'r')
batchurls = batchfd.readlines()
batchurls = [x.strip() for x in batchurls]
# Drop empty lines and lines starting with '#', '/' or ';' (comments).
batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
sys.exit(u'ERROR: batch file could not be read')
# Batch URLs are processed before the ones given on the command line.
all_urls = batchurls + args

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
parser.error(u'using title conflicts with using literal title')
# Prompt interactively rather than requiring the password on the
# command line (where it would be visible in the process list).
if opts.username is not None and opts.password is None:
opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
# parse_bytes understands suffixed values like '50k' or '44.6m'.
numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
if numeric_limit is None:
parser.error(u'invalid rate limit specified')
opts.ratelimit = numeric_limit
if opts.retries is not None:
opts.retries = long(opts.retries)
except (TypeError, ValueError), err:
parser.error(u'invalid retry count specified')
opts.playliststart = int(opts.playliststart)
if opts.playliststart <= 0:
raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError), err:
parser.error(u'invalid playlist start number specified')
# -1 is the sentinel for "until the end of the playlist".
opts.playlistend = int(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError), err:
parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
if opts.audioformat not in ['best', 'aac', 'mp3']:
parser.error(u'invalid audio format specified')
# Information extractors
# youtube_ie is shared: the metacafe/playlist/user/search extractors
# delegate actual video extraction to it.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
bliptv_ie = BlipTVIE()
vimeo_ie = VimeoIE()
# Catch-all extractor, registered last (see below).
generic_ie = GenericIE()
# File downloader
# (the closing "})" of this call is elided from this excerpt)
fd = FileDownloader({
'usenetrc': opts.usenetrc,
'username': opts.username,
'password': opts.password,
# Any --get-* flag forces quiet mode so only the requested field is
# printed...
'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forcethumbnail': opts.getthumbnail,
'forcedescription': opts.getdescription,
'forcefilename': opts.getfilename,
# ...and also implies simulation (nothing is downloaded).
'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
'format': opts.format,
'format_limit': opts.format_limit,
# Output template: an explicit -o wins; otherwise the first truthy
# branch of this or-chain picks a default matching the combination of
# --all-formats / --title / --literal / --auto-number.
'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
or u'%(id)s.%(ext)s'),
'ignoreerrors': opts.ignoreerrors,
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
'retries': opts.retries,
'continuedl': opts.continue_dl,
'noprogress': opts.noprogress,
'playliststart': opts.playliststart,
'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') means progress/status must go
# to stderr instead.
'logtostderr': opts.outtmpl == '-',
'consoletitle': opts.consoletitle,
'nopart': opts.nopart,
'updatetime': opts.updatetime,
'writedescription': opts.writedescription,
'writeinfojson': opts.writeinfojson,
# Register extractors; registration order is matching priority, so the
# more specific youtube search/playlist/user extractors come before the
# plain YoutubeIE.
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)
fd.add_info_extractor(bliptv_ie)
fd.add_info_extractor(vimeo_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# PostProcessors
if opts.extractaudio:
fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

# -U: replace the running script file itself with the latest release.
if opts.update_self:
updateSelf(fd, sys.argv[0])

# No URLs is only an error when we were not asked just to self-update.
if len(all_urls) < 1:
if not opts.update_self:
parser.error(u'you must provide at least one URL')

retcode = fd.download(all_urls)

# Dump cookie jar if requested
# (the try:/jar.save() lines are elided from this excerpt)
if opts.cookiefile is not None:
except (IOError, OSError), err:
sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run main() (the try:/main() lines are elided from
# this excerpt) and translate known exceptions into exit messages.
if __name__ == '__main__':
except DownloadError:
except SameFileError:
sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
sys.exit(u'\nERROR: Interrupted by user')
3447 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: