2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
16 __license__ = 'Public Domain'
17 __version__ = '2011.09.06-phihag'
19 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
47 except ImportError: # Python 2.4
50 import cStringIO as StringIO
54 # parse_qs was moved from the cgi module to the urlparse module recently.
56 from urlparse import parse_qs
58 from cgi import parse_qs
66 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
67 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
68 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
69 'Accept-Encoding': 'gzip, deflate',
70 'Accept-Language': 'en-us,en;q=0.5',
73 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
77 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
83 def raiseError(msg, i):
84 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
85 def skipSpace(i, expectMore=True):
86 while i < len(s) and s[i] in ' \t\r\n':
90 raiseError('Premature end', i)
92 def decodeEscape(match):
108 return unichr(int(esc[1:5], 16))
109 if len(esc) == 5+6 and esc[5:7] == '\\u':
110 hi = int(esc[1:5], 16)
111 low = int(esc[7:11], 16)
112 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
113 raise ValueError('Unknown escape ' + str(esc))
120 while s[e-bslashes-1] == '\\':
122 if bslashes % 2 == 1:
126 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
127 stri = rexp.sub(decodeEscape, s[i:e])
133 if s[i] == '}': # Empty dictionary
137 raiseError('Expected a string object key', i)
138 i,key = parseString(i)
140 if i >= len(s) or s[i] != ':':
141 raiseError('Expected a colon', i)
148 raiseError('Expected comma or closing curly brace', i)
153 if s[i] == ']': # Empty array
158 i = skipSpace(i) # Raise exception if premature end
162 raiseError('Expected a comma or closing bracket', i)
164 def parseDiscrete(i):
165 for k,v in {'true': True, 'false': False, 'null': None}.items():
166 if s.startswith(k, i):
168 raiseError('Not a boolean (or null)', i)
170 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
172 raiseError('Not a number', i)
174 if '.' in nums or 'e' in nums or 'E' in nums:
175 return (i+len(nums), float(nums))
176 return (i+len(nums), int(nums))
177 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
180 i,res = CHARMAP.get(s[i], parseNumber)(i)
181 i = skipSpace(i, False)
185 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
188 def preferredencoding():
189 """Get preferred encoding.
191 Returns the best encoding scheme for the system, based on
192 locale.getpreferredencoding() and some further tweaks.
194 def yield_preferredencoding():
196 pref = locale.getpreferredencoding()
202 return yield_preferredencoding().next()
205 def htmlentity_transform(matchobj):
206 """Transforms an HTML entity to a Unicode character.
208 This function receives a match object and is intended to be used with
209 the re.sub() function.
211 entity = matchobj.group(1)
213 # Known non-numeric HTML entity
214 if entity in htmlentitydefs.name2codepoint:
215 return unichr(htmlentitydefs.name2codepoint[entity])
218 mobj = re.match(ur'(?u)#(x?\d+)', entity)
220 numstr = mobj.group(1)
221 if numstr.startswith(u'x'):
223 numstr = u'0%s' % numstr
226 return unichr(long(numstr, base))
228 # Unknown entity in name, return its literal representation
229 return (u'&%s;' % entity)
232 def sanitize_title(utitle):
233 """Sanitizes a video title so it could be used as part of a filename."""
234 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
235 return utitle.replace(unicode(os.sep), u'%')
238 def sanitize_open(filename, open_mode):
239 """Try to open the given filename, and slightly tweak it if this fails.
241 Attempts to open the given filename. If this fails, it tries to change
242 the filename slightly, step by step, until it's either able to open it
243 or it fails and raises a final exception, like the standard open()
246 It returns the tuple (stream, definitive_file_name).
250 if sys.platform == 'win32':
252 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
253 return (sys.stdout, filename)
254 stream = open(filename, open_mode)
255 return (stream, filename)
256 except (IOError, OSError), err:
257 # In case of error, try to remove win32 forbidden chars
258 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
260 # An exception here should be caught in the caller
261 stream = open(filename, open_mode)
262 return (stream, filename)
def timeconvert(timestr):
	"""Convert an RFC 2822 date string into a Unix timestamp.

	Returns the timestamp as a number, or None when *timestr* cannot be
	parsed as an RFC 2822 date.
	"""
	# parsedate_tz() returns None for unparseable input, so default the
	# result to None instead of leaving it unbound on that path.
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
274 class DownloadError(Exception):
275 """Download Error exception.
277 This exception may be thrown by FileDownloader objects if they are not
278 configured to continue on errors. They will contain the appropriate
284 class SameFileError(Exception):
285 """Same File exception.
287 This exception will be thrown by FileDownloader objects if they detect
288 multiple files would have to be downloaded to the same file on disk.
293 class PostProcessingError(Exception):
294 """Post Processing exception.
296 This exception may be raised by PostProcessor's .run() method to
297 indicate an error in the postprocessing task.
302 class UnavailableVideoError(Exception):
303 """Unavailable Format exception.
305 This exception will be thrown when a video is requested
306 in a format that is not available for that video.
311 class ContentTooShortError(Exception):
312 """Content Too Short exception.
314 This exception may be raised by FileDownloader objects when a file they
315 download is too small for what the server announced first, indicating
316 the connection was probably interrupted.
def __init__(self, downloaded, expected):
	"""Record the byte counts: *expected* announced vs *downloaded* received."""
	self.expected = expected
	self.downloaded = downloaded
327 class YoutubeDLHandler(urllib2.HTTPHandler):
328 """Handler for HTTP requests and responses.
330 This class, when installed with an OpenerDirector, automatically adds
331 the standard headers to every HTTP request and handles gzipped and
332 deflated responses from web servers. If compression is to be avoided in
333 a particular request, the original request in the program code only has
334 to include the HTTP header "Youtubedl-No-Compression", which will be
335 removed before making the real request.
337 Part of this code was copied from:
339 http://techknack.net/python-urllib2-handlers/
341 Andrew Rowls, the author of that code, agreed to release it to the
348 return zlib.decompress(data, -zlib.MAX_WBITS)
350 return zlib.decompress(data)
353 def addinfourl_wrapper(stream, headers, url, code):
354 if hasattr(urllib2.addinfourl, 'getcode'):
355 return urllib2.addinfourl(stream, headers, url, code)
356 ret = urllib2.addinfourl(stream, headers, url)
360 def http_request(self, req):
361 for h in std_headers:
364 req.add_header(h, std_headers[h])
365 if 'Youtubedl-no-compression' in req.headers:
366 if 'Accept-encoding' in req.headers:
367 del req.headers['Accept-encoding']
368 del req.headers['Youtubedl-no-compression']
371 def http_response(self, req, resp):
374 if resp.headers.get('Content-encoding', '') == 'gzip':
375 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
376 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
377 resp.msg = old_resp.msg
379 if resp.headers.get('Content-encoding', '') == 'deflate':
380 gz = StringIO.StringIO(self.deflate(resp.read()))
381 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
382 resp.msg = old_resp.msg
386 class FileDownloader(object):
387 """File Downloader class.
389 File downloader objects are the ones responsible of downloading the
390 actual video file and writing it to disk if the user has requested
391 it, among some other tasks. In most cases there should be one per
392 program. As, given a video URL, the downloader doesn't know how to
393 extract all the needed information, task that InfoExtractors do, it
394 has to pass the URL to one of them.
396 For this, file downloader objects have a method that allows
397 InfoExtractors to be registered in a given order. When it is passed
398 a URL, the file downloader handles it to the first InfoExtractor it
399 finds that reports being able to handle it. The InfoExtractor extracts
400 all the information about the video or videos the URL refers to, and
401 asks the FileDownloader to process the video information, possibly
402 downloading the video.
404 File downloaders accept a lot of parameters. In order not to saturate
405 the object constructor with arguments, it receives a dictionary of
406 options instead. These options are available through the params
407 attribute for the InfoExtractors to use. The FileDownloader also
408 registers itself as the downloader in charge for the InfoExtractors
409 that are added to it, so this is a "mutual registration".
413 username: Username for authentication purposes.
414 password: Password for authentication purposes.
415 usenetrc: Use netrc for authentication instead.
416 quiet: Do not print messages to stdout.
417 forceurl: Force printing final URL.
418 forcetitle: Force printing title.
419 forcethumbnail: Force printing thumbnail URL.
420 forcedescription: Force printing description.
421 forcefilename: Force printing final filename.
422 simulate: Do not download the video files.
423 format: Video format code.
424 format_limit: Highest quality format to try.
425 outtmpl: Template for output names.
426 ignoreerrors: Do not stop on download errors.
427 ratelimit: Download speed limit, in bytes/sec.
428 nooverwrites: Prevent overwriting files.
429 retries: Number of times to retry for HTTP error 5xx
430 continuedl: Try to continue downloads if possible.
431 noprogress: Do not print the progress bar.
432 playliststart: Playlist item to start at.
433 playlistend: Playlist item to end at.
434 logtostderr: Log messages to stderr instead of stdout.
435 consoletitle: Display progress in console window's titlebar.
436 nopart: Do not use temporary .part files.
437 updatetime: Use the Last-modified header to set output file timestamps.
438 writedescription: Write the video description to a .description file
439 writeinfojson: Write the video description to a .info.json file
445 _download_retcode = None
446 _num_downloads = None
449 def __init__(self, params):
450 """Create a FileDownloader object with the given options."""
453 self._download_retcode = 0
454 self._num_downloads = 0
455 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
459 def format_bytes(bytes):
462 if type(bytes) is str:
467 exponent = long(math.log(bytes, 1024.0))
468 suffix = 'bkMGTPEZY'[exponent]
469 converted = float(bytes) / float(1024 ** exponent)
470 return '%.2f%s' % (converted, suffix)
473 def calc_percent(byte_counter, data_len):
476 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
479 def calc_eta(start, now, total, current):
483 if current == 0 or dif < 0.001: # One millisecond
485 rate = float(current) / dif
486 eta = long((float(total) - float(current)) / rate)
487 (eta_mins, eta_secs) = divmod(eta, 60)
490 return '%02d:%02d' % (eta_mins, eta_secs)
493 def calc_speed(start, now, bytes):
495 if bytes == 0 or dif < 0.001: # One millisecond
496 return '%10s' % '---b/s'
497 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
500 def best_block_size(elapsed_time, bytes):
501 new_min = max(bytes / 2.0, 1.0)
502 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
503 if elapsed_time < 0.001:
505 rate = bytes / elapsed_time
513 def parse_bytes(bytestr):
514 """Parse a string indicating a byte quantity into a long integer."""
515 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
518 number = float(matchobj.group(1))
519 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
520 return long(round(number * multiplier))
522 def add_info_extractor(self, ie):
523 """Add an InfoExtractor object to the end of the list."""
525 ie.set_downloader(self)
527 def add_post_processor(self, pp):
528 """Add a PostProcessor object to the end of the chain."""
530 pp.set_downloader(self)
532 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
533 """Print message to stdout if not in quiet mode."""
535 if not self.params.get('quiet', False):
536 terminator = [u'\n', u''][skip_eol]
537 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
538 self._screen_file.flush()
539 except (UnicodeEncodeError), err:
540 if not ignore_encoding_errors:
def to_stderr(self, message):
	"""Write *message* (plus a trailing newline) to stderr, encoded in the
	preferred locale encoding."""
	encoded = message.encode(preferredencoding())
	print >>sys.stderr, encoded
547 def to_cons_title(self, message):
548 """Set console/terminal window title to message."""
549 if not self.params.get('consoletitle', False):
551 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
552 # c_wchar_p() might not be necessary if `message` is
553 # already of type unicode()
554 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
555 elif 'TERM' in os.environ:
556 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
558 def fixed_template(self):
559 """Checks if the output template is fixed."""
560 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
562 def trouble(self, message=None):
563 """Determine action to take when a download problem appears.
565 Depending on if the downloader has been configured to ignore
566 download errors or not, this method may throw an exception or
567 not when errors are found, after printing the message.
569 if message is not None:
570 self.to_stderr(message)
571 if not self.params.get('ignoreerrors', False):
572 raise DownloadError(message)
573 self._download_retcode = 1
575 def slow_down(self, start_time, byte_counter):
576 """Sleep if the download speed is over the rate limit."""
577 rate_limit = self.params.get('ratelimit', None)
578 if rate_limit is None or byte_counter == 0:
581 elapsed = now - start_time
584 speed = float(byte_counter) / elapsed
585 if speed > rate_limit:
586 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
588 def temp_name(self, filename):
589 """Returns a temporary filename for the given filename."""
590 if self.params.get('nopart', False) or filename == u'-' or \
591 (os.path.exists(filename) and not os.path.isfile(filename)):
593 return filename + u'.part'
595 def undo_temp_name(self, filename):
596 if filename.endswith(u'.part'):
597 return filename[:-len(u'.part')]
600 def try_rename(self, old_filename, new_filename):
602 if old_filename == new_filename:
604 os.rename(old_filename, new_filename)
605 except (IOError, OSError), err:
606 self.trouble(u'ERROR: unable to rename file')
608 def try_utime(self, filename, last_modified_hdr):
609 """Try to set the last-modified time of the given file."""
610 if last_modified_hdr is None:
612 if not os.path.isfile(filename):
614 timestr = last_modified_hdr
617 filetime = timeconvert(timestr)
621 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
	"""Announce that the video description is being saved to *descfn*."""
	message = u'[info] Writing video description to: %s' % descfn
	self.to_screen(message, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
	"""Announce that the JSON metadata file has been written to *infofn*."""
	message = u'[info] Video description metadata as JSON to: %s' % infofn
	self.to_screen(message, ignore_encoding_errors=True)
def report_destination(self, filename):
	"""Announce the destination filename of the download."""
	message = u'[download] Destination: %s' % filename
	self.to_screen(message, ignore_encoding_errors=True)
637 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
638 """Report download progress."""
639 if self.params.get('noprogress', False):
641 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
642 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
643 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
644 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
	"""Announce an attempt to resume the download at byte *resume_len*."""
	message = u'[download] Resuming download at byte %s' % resume_len
	self.to_screen(message)
def report_retry(self, count, retries):
	"""Announce a retry (attempt *count* of *retries*) after an HTTP 5xx error."""
	message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_screen(message)
654 def report_file_already_downloaded(self, file_name):
655 """Report file has already been fully downloaded."""
657 self.to_screen(u'[download] %s has already been downloaded' % file_name)
658 except (UnicodeEncodeError), err:
659 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Announce that resuming the partial download was not possible."""
	message = u'[download] Unable to resume'
	self.to_screen(message)
665 def report_finish(self):
666 """Report download finished."""
667 if self.params.get('noprogress', False):
668 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
	"""Advance the per-run ordinal used to number downloaded files
	(consumed by the %(autonumber)s output-template field)."""
	self._num_downloads = self._num_downloads + 1
676 def prepare_filename(self, info_dict):
677 """Generate the output filename."""
679 template_dict = dict(info_dict)
680 template_dict['epoch'] = unicode(long(time.time()))
681 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
682 filename = self.params['outtmpl'] % template_dict
684 except (ValueError, KeyError), err:
685 self.trouble(u'ERROR: invalid system charset or erroneous output template')
688 def process_info(self, info_dict):
689 """Process a single dictionary returned by an InfoExtractor."""
690 filename = self.prepare_filename(info_dict)
691 # Do nothing else if in simulate mode
692 if self.params.get('simulate', False):
694 if self.params.get('forcetitle', False):
695 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
696 if self.params.get('forceurl', False):
697 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
698 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
699 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
700 if self.params.get('forcedescription', False) and 'description' in info_dict:
701 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
702 if self.params.get('forcefilename', False) and filename is not None:
703 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('nooverwrites', False) and os.path.exists(filename):
710 self.to_stderr(u'WARNING: file exists and will be skipped')
714 dn = os.path.dirname(filename)
715 if dn != '' and not os.path.exists(dn):
717 except (OSError, IOError), err:
718 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
721 if self.params.get('writedescription', False):
723 descfn = filename + '.description'
724 self.report_writedescription(descfn)
725 descfile = open(descfn, 'wb')
727 descfile.write(info_dict['description'].encode('utf-8'))
730 except (OSError, IOError):
731 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
734 if self.params.get('writeinfojson', False):
735 infofn = filename + '.info.json'
736 self.report_writeinfojson(infofn)
739 except (NameError,AttributeError):
740 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
743 infof = open(infofn, 'wb')
745 json.dump(info_dict, infof)
748 except (OSError, IOError):
749 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
753 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
754 except (OSError, IOError), err:
755 raise UnavailableVideoError
756 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
757 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
759 except (ContentTooShortError, ), err:
760 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
765 self.post_process(filename, info_dict)
766 except (PostProcessingError), err:
767 self.trouble(u'ERROR: postprocessing: %s' % str(err))
770 def download(self, url_list):
771 """Download a given list of URLs."""
772 if len(url_list) > 1 and self.fixed_template():
773 raise SameFileError(self.params['outtmpl'])
776 suitable_found = False
778 # Go to next InfoExtractor if not suitable
779 if not ie.suitable(url):
782 # Suitable InfoExtractor found
783 suitable_found = True
785 # Extract information from URL and process it
788 # Suitable InfoExtractor had been found; go to next URL
791 if not suitable_found:
792 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
794 return self._download_retcode
796 def post_process(self, filename, ie_info):
797 """Run the postprocessing chain on the given file."""
799 info['filepath'] = filename
805 def _download_with_rtmpdump(self, filename, url, player_url):
806 self.report_destination(filename)
807 tmpfilename = self.temp_name(filename)
809 # Check for rtmpdump first
811 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
812 except (OSError, IOError):
813 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
816 # Download using rtmpdump. rtmpdump returns exit code 2 when
817 # the connection was interrumpted and resuming appears to be
818 # possible. This is part of rtmpdump's normal usage, AFAIK.
819 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
820 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
821 while retval == 2 or retval == 1:
822 prevsize = os.path.getsize(tmpfilename)
823 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
824 time.sleep(5.0) # This seems to be needed
825 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
826 cursize = os.path.getsize(tmpfilename)
827 if prevsize == cursize and retval == 1:
830 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
831 self.try_rename(tmpfilename, filename)
834 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
837 def _do_download(self, filename, url, player_url):
838 # Check file already present
839 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
840 self.report_file_already_downloaded(filename)
843 # Attempt to download using rtmpdump
844 if url.startswith('rtmp'):
845 return self._download_with_rtmpdump(filename, url, player_url)
847 tmpfilename = self.temp_name(filename)
851 # Do not include the Accept-Encoding header
852 headers = {'Youtubedl-no-compression': 'True'}
853 basic_request = urllib2.Request(url, None, headers)
854 request = urllib2.Request(url, None, headers)
856 # Establish possible resume length
857 if os.path.isfile(tmpfilename):
858 resume_len = os.path.getsize(tmpfilename)
862 # Request parameters in case of being able to resume
863 if self.params.get('continuedl', False) and resume_len != 0:
864 self.report_resuming_byte(resume_len)
865 request.add_header('Range', 'bytes=%d-' % resume_len)
869 retries = self.params.get('retries', 0)
870 while count <= retries:
871 # Establish connection
873 data = urllib2.urlopen(request)
875 except (urllib2.HTTPError, ), err:
876 if (err.code < 500 or err.code >= 600) and err.code != 416:
877 # Unexpected HTTP error
879 elif err.code == 416:
880 # Unable to resume (requested range not satisfiable)
882 # Open the connection again without the range header
883 data = urllib2.urlopen(basic_request)
884 content_length = data.info()['Content-Length']
885 except (urllib2.HTTPError, ), err:
886 if err.code < 500 or err.code >= 600:
889 # Examine the reported length
890 if (content_length is not None and
891 (resume_len - 100 < long(content_length) < resume_len + 100)):
892 # The file had already been fully downloaded.
893 # Explanation to the above condition: in issue #175 it was revealed that
894 # YouTube sometimes adds or removes a few bytes from the end of the file,
895 # changing the file size slightly and causing problems for some users. So
896 # I decided to implement a suggested change and consider the file
897 # completely downloaded if the file size differs less than 100 bytes from
898 # the one in the hard drive.
899 self.report_file_already_downloaded(filename)
900 self.try_rename(tmpfilename, filename)
903 # The length does not match, we start the download over
904 self.report_unable_to_resume()
910 self.report_retry(count, retries)
913 self.trouble(u'ERROR: giving up after %s retries' % retries)
916 data_len = data.info().get('Content-length', None)
917 if data_len is not None:
918 data_len = long(data_len) + resume_len
919 data_len_str = self.format_bytes(data_len)
920 byte_counter = 0 + resume_len
926 data_block = data.read(block_size)
928 if len(data_block) == 0:
930 byte_counter += len(data_block)
932 # Open file just in time
935 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
936 assert stream is not None
937 filename = self.undo_temp_name(tmpfilename)
938 self.report_destination(filename)
939 except (OSError, IOError), err:
940 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
943 stream.write(data_block)
944 except (IOError, OSError), err:
945 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
947 block_size = self.best_block_size(after - before, len(data_block))
950 percent_str = self.calc_percent(byte_counter, data_len)
951 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
952 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
953 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
956 self.slow_down(start, byte_counter - resume_len)
959 self.trouble(u'\nERROR: Did not get any data blocks')
963 if data_len is not None and byte_counter != data_len:
964 raise ContentTooShortError(byte_counter, long(data_len))
965 self.try_rename(tmpfilename, filename)
967 # Update file modification time
968 if self.params.get('updatetime', True):
969 self.try_utime(filename, data.info().get('last-modified', None))
974 class InfoExtractor(object):
975 """Information Extractor class.
977 Information extractors are the classes that, given a URL, extract
978 information from the video (or videos) the URL refers to. This
979 information includes the real video URL, the video title and simplified
980 title, author and others. The information is stored in a dictionary
981 which is then passed to the FileDownloader. The FileDownloader
982 processes this information possibly downloading the video to the file
983 system, among other possible outcomes. The dictionaries must include
984 the following fields:
986 id: Video identifier.
987 url: Final video URL.
988 uploader: Nickname of the video uploader.
989 title: Literal title.
990 stitle: Simplified title.
991 ext: Video filename extension.
992 format: Video format.
993 player_url: SWF Player URL (may be None).
995 The following fields are optional. Their primary purpose is to allow
996 youtube-dl to serve as the backend for a video search function, such
997 as the one in youtube2mp3. They are only used when their respective
998 forced printing functions are called:
1000 thumbnail: Full URL to a video thumbnail image.
1001 description: One-line video description.
1003 Subclasses of this one should re-define the _real_initialize() and
1004 _real_extract() methods, as well as the suitable() static method.
1005 Probably, they should also be instantiated and added to the main
1012 def __init__(self, downloader=None):
1013 """Constructor. Receives an optional downloader."""
1015 self.set_downloader(downloader)
1019 """Receives a URL and returns True if suitable for this IE."""
1022 def initialize(self):
1023 """Initializes an instance (authentication, etc)."""
1025 self._real_initialize()
1028 def extract(self, url):
1029 """Extracts URL information and returns it in list of dicts."""
1031 return self._real_extract(url)
def set_downloader(self, downloader):
	"""Attach *downloader* as the FileDownloader this extractor reports to.

	May receive None to detach the extractor from any downloader.
	"""
	self._downloader = downloader
1037 def _real_initialize(self):
1038 """Real initialization process. Redefine in subclasses."""
1041 def _real_extract(self, url):
1042 """Real extraction process. Redefine in subclasses."""
1046 class YoutubeIE(InfoExtractor):
1047 """Information extractor for youtube.com."""
1049 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1050 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1051 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1052 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1053 _NETRC_MACHINE = 'youtube'
1054 # Listed in order of quality
1055 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1056 _video_extensions = {
1062 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1069 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
	"""Announce the attempt to force the YouTube interface language."""
	message = u'[youtube] Setting language'
	self._downloader.to_screen(message)
def report_login(self):
	"""Announce the attempt to log in to YouTube."""
	message = u'[youtube] Logging in'
	self._downloader.to_screen(message)
def report_age_confirmation(self):
	"""Announce the attempt to confirm the user's age."""
	message = u'[youtube] Confirming age'
	self._downloader.to_screen(message)
def report_video_webpage_download(self, video_id):
	"""Announce that the watch page for *video_id* is being downloaded."""
	message = u'[youtube] %s: Downloading video webpage' % video_id
	self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
	"""Announce that the video-info page for *video_id* is being downloaded."""
	message = u'[youtube] %s: Downloading video info webpage' % video_id
	self._downloader.to_screen(message)
def report_information_extraction(self, video_id):
	"""Announce that metadata extraction for *video_id* has started."""
	message = u'[youtube] %s: Extracting video information' % video_id
	self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
	"""Announce that *format* is not offered for video *video_id*."""
	message = u'[youtube] %s: Format %s not available' % (video_id, format)
	self._downloader.to_screen(message)
def report_rtmp_download(self):
	"""Announce that the download will go through the RTMP protocol."""
	message = u'[youtube] RTMP download detected'
	self._downloader.to_screen(message)
# Session setup for YouTube: set interface language, optionally log in
# (explicit --username/--password or .netrc via self._NETRC_MACHINE), then
# POST the age-confirmation form so age-gated videos are reachable.
# NOTE(review): the embedded original line numbers jump (1104->1109,
# 1124->1128, ...) -- this view is missing lines (at least the early
# `return`s and the `try:` statements matched by the visible `except`
# clauses). Code is kept verbatim; do not infer control flow from the
# visible lines alone.
1103 def _real_initialize(self):
1104 if self._downloader is None:
1109 downloader_params = self._downloader.params
1111 # Attempt to use provided username and password or .netrc data
1112 if downloader_params.get('username', None) is not None:
1113 username = downloader_params['username']
1114 password = downloader_params['password']
1115 elif downloader_params.get('usenetrc', False):
1117 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1118 if info is not None:
# (missing lines: unpacking of the netrc tuple into username/password)
1122 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1123 except (IOError, netrc.NetrcParseError), err:
# .netrc problems are non-fatal: warn and continue without credentials.
1124 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the site language to English so later regex scraping is stable.
1128 request = urllib2.Request(self._LANG_URL)
1131 urllib2.urlopen(request).read()
1132 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1133 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1136 # No authentication to be performed
1137 if username is None:
# Login form fields expected by the (2011-era) YouTube login endpoint.
1142 'current_form': 'loginForm',
1144 'action_login': 'Log In',
1145 'username': username,
1146 'password': password,
1148 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1151 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
1152 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1153 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1155 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1156 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Age confirmation is an ERROR (not a warning) because age-gated videos
# cannot be extracted without it.
1162 'action_confirm': 'Confirm',
1164 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1166 self.report_age_confirmation()
1167 age_results = urllib2.urlopen(request).read()
1168 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1169 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main YouTube extraction: download the /watch page and get_video_info,
# pick one or more (format, url) pairs, and hand each to process_info().
# NOTE(review): embedded line numbers jump throughout -- `return`s after
# trouble() calls, `try:` lines, and some `else:` branches are missing
# from this view. Kept verbatim.
1172 def _real_extract(self, url):
1173 # Extract video id from URL
1174 mobj = re.match(self._VALID_URL, url)
1176 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# group(2) is the 11-char video id per _VALID_URL (declared outside this view).
1178 video_id = mobj.group(2)
1181 self.report_video_webpage_download(video_id)
# gl=US&hl=en pins locale; has_verified=1 helps with age-gated pages.
1182 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1184 video_webpage = urllib2.urlopen(request).read()
1185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1189 # Attempt to extract SWF player URL
1190 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1191 if mobj is not None:
# Un-escape JSON-style backslash escapes in the SWF URL.
1192 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# NOTE(review): the else-branch (presumably `player_url = None`) falls in
# the 1193-1196 gap; as shown, player_url would be unbound when the regex
# does not match -- confirm against the full file.
1197 self.report_video_info_webpage_download(video_id)
# Try several 'el' variants; some work for embedded/VEVO/age-gated videos.
1198 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1199 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1200 % (video_id, el_type))
1201 request = urllib2.Request(video_info_url)
1203 video_info_webpage = urllib2.urlopen(request).read()
# get_video_info returns application/x-www-form-urlencoded data.
1204 video_info = parse_qs(video_info_webpage)
# 'token' present means this variant succeeded; stop trying others.
1205 if 'token' in video_info:
1207 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1208 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1210 if 'token' not in video_info:
1211 if 'reason' in video_info:
1212 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1214 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1217 # Start extracting information
1218 self.report_information_extraction(video_id)
# uploader
1221 if 'author' not in video_info:
1222 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1224 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
1227 if 'title' not in video_info:
1228 self._downloader.trouble(u'ERROR: unable to extract video title')
1230 video_title = urllib.unquote_plus(video_info['title'][0])
1231 video_title = video_title.decode('utf-8')
1232 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse runs of non-simple chars to underscores.
1235 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1236 simple_title = simple_title.strip(ur'_')
# thumbnail (best effort)
1239 if 'thumbnail_url' not in video_info:
1240 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1241 video_thumbnail = ''
1242 else: # don't panic if we can't find it
1243 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page, tried against several formats.
1247 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1248 if mobj is not None:
1249 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1250 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1251 for expression in format_expressions:
1253 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: only parsed when the user asked for it (lxml is optional).
1261 video_description = u'No description available.'
1262 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1263 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1264 if mobj is not None:
1265 video_description = mobj.group(1).decode('utf-8')
1267 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1268 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1269 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1270 # TODO use another parser
# token
1273 video_token = urllib.unquote_plus(video_info['token'][0])
1275 # Decide which formats to download
1276 req_format = self._downloader.params.get('format', None)
1278 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1279 self.report_rtmp_download()
# RTMP streams have no itag; format is None.
1280 video_url_list = [(None, video_info['conn'][0])]
1281 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# stream map is a comma-separated list of urlencoded itag/url records.
1282 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1283 url_data = [parse_qs(uds) for uds in url_data_strs]
1284 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1285 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# -f/--format-limit caps quality: keep formats at or below the limit.
1287 format_limit = self._downloader.params.get('format_limit', None)
1288 if format_limit is not None and format_limit in self._available_formats:
1289 format_list = self._available_formats[self._available_formats.index(format_limit):]
1291 format_list = self._available_formats
1292 existing_formats = [x for x in format_list if x in url_map]
1293 if len(existing_formats) == 0:
1294 self._downloader.trouble(u'ERROR: no known formats available for video')
1296 if req_format is None:
1297 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1298 elif req_format == '-1':
1299 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1302 if req_format not in url_map:
1303 self._downloader.trouble(u'ERROR: requested format not available')
1305 video_url_list = [(req_format, url_map[req_format])] # Specific format
1307 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1310 for format_param, video_real_url in video_url_list:
1311 # At this point we have a new video
1312 self._downloader.increment_downloads()
# Extension
1315 video_extension = self._video_extensions.get(format_param, 'flv')
1318 # Process video information
1319 self._downloader.process_info({
1320 'id': video_id.decode('utf-8'),
1321 'url': video_real_url.decode('utf-8'),
1322 'uploader': video_uploader.decode('utf-8'),
1323 'upload_date': upload_date,
1324 'title': video_title,
1325 'stitle': simple_title,
1326 'ext': video_extension.decode('utf-8'),
# Python-2 `and/or` ternary: u'NA' for RTMP (format_param is None),
# otherwise the decoded itag string.
1327 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1328 'thumbnail': video_thumbnail.decode('utf-8'),
1329 'description': video_description,
1330 'player_url': player_url,
1332 except UnavailableVideoError, err:
1333 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for metacafe.com. Delegates yt-prefixed ids to a YoutubeIE
# instance (passed into __init__), otherwise scrapes the watch page.
# NOTE(review): embedded original line numbers jump throughout -- `try:`
# lines, `return`s, `suitable()`'s def line etc. fall in the gaps. Kept verbatim.
1336 class MetacafeIE(InfoExtractor):
1337 """Information Extractor for metacafe.com."""
1339 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1340 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1341 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1344 def __init__(self, youtube_ie, downloader=None):
1345 InfoExtractor.__init__(self, downloader)
# Kept so Metacafe pages that wrap YouTube videos can be delegated.
1346 self._youtube_ie = youtube_ie
1350 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1352 def report_disclaimer(self):
1353 """Report disclaimer retrieval."""
1354 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1356 def report_age_confirmation(self):
1357 """Report attempt to confirm age."""
1358 self._downloader.to_screen(u'[metacafe] Confirming age')
1360 def report_download_webpage(self, video_id):
1361 """Report webpage download."""
1362 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1364 def report_extraction(self, video_id):
1365 """Report information extraction."""
1366 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Session setup: fetch the disclaimer page, then POST the family-filter
# form so filtered videos become accessible.
1368 def _real_initialize(self):
1369 # Retrieve disclaimer
1370 request = urllib2.Request(self._DISCLAIMER)
1372 self.report_disclaimer()
1373 disclaimer = urllib2.urlopen(request).read()
1374 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1375 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1381 'submit': "Continue - I'm over 18",
1383 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1385 self.report_age_confirmation()
1386 disclaimer = urllib2.urlopen(request).read()
1387 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1388 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1391 def _real_extract(self, url):
1392 # Extract id and simplified title from URL
1393 mobj = re.match(self._VALID_URL, url)
1395 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1398 video_id = mobj.group(1)
1400 # Check if video comes from YouTube
1401 mobj2 = re.match(r'^yt-(.*)$', video_id)
1402 if mobj2 is not None:
# Delegate to YouTube extractor; a `return` presumably follows in the gap.
1403 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1406 # At this point we have a new video
1407 self._downloader.increment_downloads()
1409 simple_title = mobj.group(2).decode('utf-8')
1411 # Retrieve video webpage to extract further information
1412 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1414 self.report_download_webpage(video_id)
1415 webpage = urllib2.urlopen(request).read()
1416 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message typo "unable retrieve" (missing "to") -- runtime
# string, left untouched here.
1417 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1420 # Extract URL, uploader and title from webpage
1421 self.report_extraction(video_id)
# Old-style pages embed mediaURL directly; gdaKey is an access token.
1422 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1423 if mobj is not None:
1424 mediaURL = urllib.unquote(mobj.group(1))
1425 video_extension = mediaURL[-3:]
1427 # Extract gdaKey if available
1428 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1430 video_url = mediaURL
1432 gdaKey = mobj.group(1)
1433 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Newer pages: media info lives in the flashvars' mediaData JSON blob.
1435 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1437 self._downloader.trouble(u'ERROR: unable to extract media URL')
1439 vardict = parse_qs(mobj.group(1))
1440 if 'mediaData' not in vardict:
1441 self._downloader.trouble(u'ERROR: unable to extract media URL')
1443 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1445 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape JSON forward slashes.
1447 mediaURL = mobj.group(1).replace('\\/', '/')
1448 video_extension = mediaURL[-3:]
1449 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1451 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1453 self._downloader.trouble(u'ERROR: unable to extract title')
1455 video_title = mobj.group(1).decode('utf-8')
1456 video_title = sanitize_title(video_title)
1458 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1460 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1462 video_uploader = mobj.group(1)
1465 # Process video information
1466 self._downloader.process_info({
1467 'id': video_id.decode('utf-8'),
1468 'url': video_url.decode('utf-8'),
1469 'uploader': video_uploader.decode('utf-8'),
1470 'upload_date': u'NA',
1471 'title': video_title,
1472 'stitle': simple_title,
1473 'ext': video_extension.decode('utf-8'),
1477 except UnavailableVideoError:
1478 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for dailymotion.com: scrapes the watch page's addVariable
# flashvars for the media URL, plus <title> and owner attribute.
# NOTE(review): embedded line numbers jump -- `try:`, `return`, `if mobj
# is None:` lines are in the gaps. Kept verbatim.
1481 class DailymotionIE(InfoExtractor):
1482 """Information Extractor for Dailymotion"""
1484 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1486 def __init__(self, downloader=None):
1487 InfoExtractor.__init__(self, downloader)
1491 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1493 def report_download_webpage(self, video_id):
1494 """Report webpage download."""
1495 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1497 def report_extraction(self, video_id):
1498 """Report information extraction."""
1499 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No session setup needed for Dailymotion.
1501 def _real_initialize(self):
1504 def _real_extract(self, url):
1505 # Extract id and simplified title from URL
1506 mobj = re.match(self._VALID_URL, url)
1508 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1511 # At this point we have a new video
1512 self._downloader.increment_downloads()
1513 video_id = mobj.group(1)
# group(2) is the URL slug after the underscore; used as the simple title.
1515 simple_title = mobj.group(2).decode('utf-8')
1516 video_extension = 'flv'
1518 # Retrieve video webpage to extract further information
1519 request = urllib2.Request(url)
1521 self.report_download_webpage(video_id)
1522 webpage = urllib2.urlopen(request).read()
1523 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message typo "unable retrieve" -- runtime string, untouched.
1524 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1527 # Extract URL, uploader and title from webpage
1528 self.report_extraction(video_id)
1529 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1531 self._downloader.trouble(u'ERROR: unable to extract media URL')
1533 mediaURL = urllib.unquote(mobj.group(1))
1535 # if needed add http://www.dailymotion.com/ if relative URL
1537 video_url = mediaURL
1539 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1540 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1542 self._downloader.trouble(u'ERROR: unable to extract title')
1544 video_title = mobj.group(1).decode('utf-8')
1545 video_title = sanitize_title(video_title)
1547 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1549 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1551 video_uploader = mobj.group(1)
1554 # Process video information
1555 self._downloader.process_info({
1556 'id': video_id.decode('utf-8'),
1557 'url': video_url.decode('utf-8'),
1558 'uploader': video_uploader.decode('utf-8'),
1559 'upload_date': u'NA',
1560 'title': video_title,
1561 'stitle': simple_title,
1562 'ext': video_extension.decode('utf-8'),
1566 except UnavailableVideoError:
1567 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for video.google.com (Google Video, defunct service): prefers
# the mp4 download_url, falls back to the escaped flv videoUrl.
# NOTE(review): embedded line numbers jump -- guard/`return`/`try:` lines
# are in the gaps. Kept verbatim.
1570 class GoogleIE(InfoExtractor):
1571 """Information extractor for video.google.com."""
1573 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1575 def __init__(self, downloader=None):
1576 InfoExtractor.__init__(self, downloader)
1580 return (re.match(GoogleIE._VALID_URL, url) is not None)
1582 def report_download_webpage(self, video_id):
1583 """Report webpage download."""
1584 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1586 def report_extraction(self, video_id):
1587 """Report information extraction."""
1588 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No session setup needed.
1590 def _real_initialize(self):
1593 def _real_extract(self, url):
1594 # Extract id from URL
1595 mobj = re.match(self._VALID_URL, url)
1597 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1600 # At this point we have a new video
1601 self._downloader.increment_downloads()
1602 video_id = mobj.group(1)
1604 video_extension = 'mp4'
1606 # Retrieve video webpage to extract further information
1607 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1609 self.report_download_webpage(video_id)
1610 webpage = urllib2.urlopen(request).read()
1611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1615 # Extract URL, uploader, and title from webpage
1616 self.report_extraction(video_id)
# Preferred: direct mp4 download_url embedded in the page's JS.
1617 mobj = re.search(r"download_url:'([^']+)'", webpage)
# Fallback: flv videoUrl with \x-escaped characters.
1619 video_extension = 'flv'
1620 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1622 self._downloader.trouble(u'ERROR: unable to extract media URL')
1624 mediaURL = urllib.unquote(mobj.group(1))
# Un-escape the JS hex escapes: \x3d is '=', \x26 is '&'.
1625 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1626 mediaURL = mediaURL.replace('\\x26', '\x26')
1628 video_url = mediaURL
1630 mobj = re.search(r'<title>(.*)</title>', webpage)
1632 self._downloader.trouble(u'ERROR: unable to extract title')
1634 video_title = mobj.group(1).decode('utf-8')
1635 video_title = sanitize_title(video_title)
1636 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1638 # Extract video description
1639 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1641 self._downloader.trouble(u'ERROR: unable to extract video description')
1643 video_description = mobj.group(1).decode('utf-8')
1644 if not video_description:
1645 video_description = 'No description available.'
# Thumbnail requires a second page fetch, so only done when requested.
1647 # Extract video thumbnail
1648 if self._downloader.params.get('forcethumbnail', False):
1649 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1651 webpage = urllib2.urlopen(request).read()
1652 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1653 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1655 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1657 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1659 video_thumbnail = mobj.group(1)
1660 else: # we need something to pass to process_info
1661 video_thumbnail = ''
1664 # Process video information
1665 self._downloader.process_info({
1666 'id': video_id.decode('utf-8'),
1667 'url': video_url.decode('utf-8'),
1669 'upload_date': u'NA',
1670 'title': video_title,
1671 'stitle': simple_title,
1672 'ext': video_extension.decode('utf-8'),
1676 except UnavailableVideoError:
1677 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for photobucket.com flv links: media URL from the
# rel="video_src" link tag, title and uploader from <title>.
# NOTE(review): embedded line numbers jump -- guard/`try:`/`return` lines
# are in the gaps. Kept verbatim.
1680 class PhotobucketIE(InfoExtractor):
1681 """Information extractor for photobucket.com."""
1683 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1685 def __init__(self, downloader=None):
1686 InfoExtractor.__init__(self, downloader)
1690 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1692 def report_download_webpage(self, video_id):
1693 """Report webpage download."""
1694 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1696 def report_extraction(self, video_id):
1697 """Report information extraction."""
1698 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No session setup needed.
1700 def _real_initialize(self):
1703 def _real_extract(self, url):
1704 # Extract id from URL
1705 mobj = re.match(self._VALID_URL, url)
1707 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1710 # At this point we have a new video
1711 self._downloader.increment_downloads()
# video_id is the 'current=...flv' filename captured by _VALID_URL.
1712 video_id = mobj.group(1)
1714 video_extension = 'flv'
1716 # Retrieve video webpage to extract further information
1717 request = urllib2.Request(url)
1719 self.report_download_webpage(video_id)
1720 webpage = urllib2.urlopen(request).read()
1721 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1722 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1725 # Extract URL, uploader, and title from webpage
1726 self.report_extraction(video_id)
1727 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1729 self._downloader.trouble(u'ERROR: unable to extract media URL')
1731 mediaURL = urllib.unquote(mobj.group(1))
1733 video_url = mediaURL
# Title and uploader both come from the page <title>.
1735 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1737 self._downloader.trouble(u'ERROR: unable to extract title')
1739 video_title = mobj.group(1).decode('utf-8')
1740 video_title = sanitize_title(video_title)
1741 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1743 video_uploader = mobj.group(2).decode('utf-8')
1746 # Process video information
1747 self._downloader.process_info({
1748 'id': video_id.decode('utf-8'),
1749 'url': video_url.decode('utf-8'),
1750 'uploader': video_uploader,
1751 'upload_date': u'NA',
1752 'title': video_title,
1753 'stitle': simple_title,
1754 'ext': video_extension.decode('utf-8'),
1758 except UnavailableVideoError:
1759 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for video.yahoo.com: rewrites non-/watch/ URLs to the
# canonical form, scrapes metadata, then fetches the playlist XML for
# the real media URL.
# NOTE(review): embedded line numbers jump -- guard/`try:`/`return` lines
# are in the gaps. Kept verbatim.
1762 class YahooIE(InfoExtractor):
1763 """Information extractor for video.yahoo.com."""
1765 # _VALID_URL matches all Yahoo! Video URLs
1766 # _VPAGE_URL matches only the extractable '/watch/' URLs
1767 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1768 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1770 def __init__(self, downloader=None):
1771 InfoExtractor.__init__(self, downloader)
1775 return (re.match(YahooIE._VALID_URL, url) is not None)
1777 def report_download_webpage(self, video_id):
1778 """Report webpage download."""
1779 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1781 def report_extraction(self, video_id):
1782 """Report information extraction."""
1783 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No session setup needed.
1785 def _real_initialize(self):
# new_video=False on the recursive call avoids double-counting the video.
1788 def _real_extract(self, url, new_video=True):
1789 # Extract ID from URL
1790 mobj = re.match(self._VALID_URL, url)
1792 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1795 # At this point we have a new video
1796 self._downloader.increment_downloads()
1797 video_id = mobj.group(2)
1798 video_extension = 'flv'
1800 # Rewrite valid but non-extractable URLs as
1801 # extractable English language /watch/ URLs
1802 if re.match(self._VPAGE_URL, url) is None:
1803 request = urllib2.Request(url)
1805 webpage = urllib2.urlopen(request).read()
1806 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1807 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1810 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1812 self._downloader.trouble(u'ERROR: Unable to extract id field')
1814 yahoo_id = mobj.group(1)
1816 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1818 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1820 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL.
1822 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1823 return self._real_extract(url, new_video=False)
1825 # Retrieve video webpage to extract further information
1826 request = urllib2.Request(url)
1828 self.report_download_webpage(video_id)
1829 webpage = urllib2.urlopen(request).read()
1830 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1831 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1834 # Extract uploader and title from webpage
1835 self.report_extraction(video_id)
1836 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1838 self._downloader.trouble(u'ERROR: unable to extract video title')
1840 video_title = mobj.group(1).decode('utf-8')
1841 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1843 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1845 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): BUG -- group(1) of the regex above captures the literal
# 'people' or 'profile' path segment; the uploader name is in group(2).
# As written, video_uploader is always u'people'/u'profile'.
1847 video_uploader = mobj.group(1).decode('utf-8')
1849 # Extract video thumbnail
1850 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1852 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1854 video_thumbnail = mobj.group(1).decode('utf-8')
1856 # Extract video description
1857 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1859 self._downloader.trouble(u'ERROR: unable to extract video description')
1861 video_description = mobj.group(1).decode('utf-8')
1862 if not video_description:
1863 video_description = 'No description available.'
1865 # Extract video height and width
1866 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1868 self._downloader.trouble(u'ERROR: unable to extract video height')
1870 yv_video_height = mobj.group(1)
1872 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1874 self._downloader.trouble(u'ERROR: unable to extract video width')
1876 yv_video_width = mobj.group(1)
1878 # Retrieve video playlist to extract media URL
1879 # I'm not completely sure what all these options are, but we
1880 # seem to need most of them, otherwise the server sends a 401.
1881 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1882 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1883 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1884 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1885 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1887 self.report_download_webpage(video_id)
1888 webpage = urllib2.urlopen(request).read()
1889 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1890 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1893 # Extract media URL from playlist XML
1894 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1896 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1898 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Resolve HTML entities (&amp; etc.) left in the XML attribute values.
1899 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1902 # Process video information
1903 self._downloader.process_info({
1904 'id': video_id.decode('utf-8'),
1906 'uploader': video_uploader,
1907 'upload_date': u'NA',
1908 'title': video_title,
1909 'stitle': simple_title,
1910 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later (undecoded) entries win in Python. Redundant keys
# should be removed in a behavioral fix.
1911 'thumbnail': video_thumbnail.decode('utf-8'),
1912 'description': video_description,
1913 'thumbnail': video_thumbnail,
1914 'description': video_description,
1917 except UnavailableVideoError:
1918 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for vimeo.com: reads the moogaloop XML config for a clip and
# builds the signed play URL from the request signature + expiry.
# NOTE(review): embedded line numbers jump -- guard/`try:`/`return` lines
# are in the gaps. Kept verbatim.
1921 class VimeoIE(InfoExtractor):
1922 """Information extractor for vimeo.com."""
1924 # _VALID_URL matches Vimeo URLs
1925 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1927 def __init__(self, downloader=None):
1928 InfoExtractor.__init__(self, downloader)
1932 return (re.match(VimeoIE._VALID_URL, url) is not None)
1934 def report_download_webpage(self, video_id):
1935 """Report webpage download."""
1936 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1938 def report_extraction(self, video_id):
1939 """Report information extraction."""
1940 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# No session setup needed.
1942 def _real_initialize(self):
1945 def _real_extract(self, url, new_video=True):
1946 # Extract ID from URL
1947 mobj = re.match(self._VALID_URL, url)
1949 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1952 # At this point we have a new video
1953 self._downloader.increment_downloads()
1954 video_id = mobj.group(1)
1956 # Retrieve video webpage to extract further information
# std_headers supplies the browser-like User-Agent defined at file top.
1957 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1959 self.report_download_webpage(video_id)
1960 webpage = urllib2.urlopen(request).read()
1961 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1962 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1965 # Now we begin extracting as much information as we can from what we
1966 # retrieved. First we extract the information common to all extractors,
1967 # and latter we extract those that are Vimeo specific.
1968 self.report_extraction(video_id)
# Extract title
1971 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1973 self._downloader.trouble(u'ERROR: unable to extract video title')
1975 video_title = mobj.group(1).decode('utf-8')
1976 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# Extract uploader
1979 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1981 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1983 video_uploader = mobj.group(1).decode('utf-8')
1985 # Extract video thumbnail
1986 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1988 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1990 video_thumbnail = mobj.group(1).decode('utf-8')
1992 # # Extract video description
1993 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1995 # self._downloader.trouble(u'ERROR: unable to extract video description')
1997 # video_description = mobj.group(1).decode('utf-8')
1998 # if not video_description: video_description = 'No description available.'
# NOTE(review): description extraction is commented out above and replaced
# with a placeholder literal -- every Vimeo video gets description 'Foo.'.
1999 video_description = 'Foo.'
2001 # Vimeo specific: extract request signature
2002 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2004 self._downloader.trouble(u'ERROR: unable to extract request signature')
2006 sig = mobj.group(1).decode('utf-8')
2008 # Vimeo specific: Extract request signature expiration
2009 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2011 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2013 sig_exp = mobj.group(1).decode('utf-8')
2015 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2018 # Process video information
2019 self._downloader.process_info({
2020 'id': video_id.decode('utf-8'),
2022 'uploader': video_uploader,
2023 'upload_date': u'NA',
2024 'title': video_title,
2025 'stitle': simple_title,
# NOTE(review): duplicate 'thumbnail'/'description' keys below; the later
# entries win, so the undecoded thumbnail string is what gets used.
2027 'thumbnail': video_thumbnail.decode('utf-8'),
2028 'description': video_description,
2029 'thumbnail': video_thumbnail,
2030 'description': video_description,
2033 except UnavailableVideoError:
2034 self._downloader.trouble(u'ERROR: unable to download video')
# Last-resort extractor: downloads an arbitrary page and looks for a JW
# Player / SWFObject 'file=' flashvar pointing at a direct media URL.
# NOTE(review): embedded line numbers jump -- guard/`try:`/`return` lines
# are in the gaps. Kept verbatim.
2037 class GenericIE(InfoExtractor):
2038 """Generic last-resort information extractor."""
2040 def __init__(self, downloader=None):
2041 InfoExtractor.__init__(self, downloader)
2047 def report_download_webpage(self, video_id):
2048 """Report webpage download."""
# Warn loudly: generic extraction is unreliable by nature.
2049 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2050 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2052 def report_extraction(self, video_id):
2053 """Report information extraction."""
2054 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# No session setup needed.
2056 def _real_initialize(self):
2059 def _real_extract(self, url):
2060 # At this point we have a new video
2061 self._downloader.increment_downloads()
# Provisional id: last path component; replaced after the media URL is found.
2063 video_id = url.split('/')[-1]
2064 request = urllib2.Request(url)
2066 self.report_download_webpage(video_id)
2067 webpage = urllib2.urlopen(request).read()
2068 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2069 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2071 except ValueError, err:
2072 # since this is the last-resort InfoExtractor, if
2073 # this error is thrown, it'll be thrown here
2074 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2077 self.report_extraction(video_id)
2078 # Start with something easy: JW Player in SWFObject
2079 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2081 # Broaden the search a little bit
2082 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2084 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2087 # It's possible that one of the regexes
2088 # matched, but returned an empty group:
2089 if mobj.group(1) is None:
2090 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2093 video_url = urllib.unquote(mobj.group(1))
2094 video_id = os.path.basename(video_url)
2096 # here's a fun little line of code for you:
2097 video_extension = os.path.splitext(video_id)[1][1:]
2098 video_id = os.path.splitext(video_id)[0]
2100 # it's tempting to parse this further, but you would
2101 # have to take into account all the variations like
2102 # Video Title - Site Name
2103 # Site Name | Video Title
2104 # Video Title - Tagline | Site Name
2105 # and so on and so forth; it's just not practical
2106 mobj = re.search(r'<title>(.*)</title>', webpage)
2108 self._downloader.trouble(u'ERROR: unable to extract title')
2110 video_title = mobj.group(1).decode('utf-8')
2111 video_title = sanitize_title(video_title)
2112 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2114 # video uploader is domain name
2115 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): copy-pasted error text -- this failure is about the
# uploader (domain name), not the title. Runtime string, left untouched.
2117 self._downloader.trouble(u'ERROR: unable to extract title')
2119 video_uploader = mobj.group(1).decode('utf-8')
2122 # Process video information
2123 self._downloader.process_info({
2124 'id': video_id.decode('utf-8'),
2125 'url': video_url.decode('utf-8'),
2126 'uploader': video_uploader,
2127 'upload_date': u'NA',
2128 'title': video_title,
2129 'stitle': simple_title,
2130 'ext': video_extension.decode('utf-8'),
2134 except UnavailableVideoError, err:
2135 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles "ytsearch:Q" (first result), "ytsearchN:Q" (first N results,
# capped at _max_youtube_results) and "ytsearchall:Q" queries by scraping
# YouTube result pages and delegating each hit to the wrapped YoutubeIE.
2138 class YoutubeSearchIE(InfoExtractor):
2139 """Information Extractor for YouTube search queries."""
2140 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2141 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2142 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
# Presence of a ">Next</a>" link marks that more result pages exist.
2143 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2145 _max_youtube_results = 1000
2147 def __init__(self, youtube_ie, downloader=None):
2148 InfoExtractor.__init__(self, downloader)
# Actual extraction of each found video is delegated to this YoutubeIE.
2149 self._youtube_ie = youtube_ie
2153 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2155 def report_download_page(self, query, pagenum):
2156 """Report attempt to download playlist page with given number."""
2157 query = query.decode(preferredencoding())
2158 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2160 def _real_initialize(self):
2161 self._youtube_ie.initialize()
2163 def _real_extract(self, query):
2164 mobj = re.match(self._VALID_QUERY, query)
2166 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text proper.
2169 prefix, query = query.split(':')
2171 query = query.encode('utf-8')
# Bare "ytsearch:" downloads a single (best-matching) result.
2173 self._download_n_results(query, 1)
2175 elif prefix == 'all':
2176 self._download_n_results(query, self._max_youtube_results)
2182 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2184 elif n > self._max_youtube_results:
2185 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2186 n = self._max_youtube_results
2187 self._download_n_results(query, n)
2189 except ValueError: # parsing prefix as integer fails
2190 self._download_n_results(query, 1)
2193 def _download_n_results(self, query, n):
2194 """Downloads a specified number of results for a query"""
# Deduplicate ids: the same video can appear on several result pages.
2197 already_seen = set()
2201 self.report_download_page(query, pagenum)
2202 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2203 request = urllib2.Request(result_url)
2205 page = urllib2.urlopen(request).read()
2206 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2207 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2210 # Extract video identifiers
2211 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href, split on '=', take the id and drop the closing quote.
2212 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2213 if video_id not in already_seen:
2214 video_ids.append(video_id)
2215 already_seen.add(video_id)
2216 if len(video_ids) == n:
2217 # Specified n videos reached
2218 for id in video_ids:
2219 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: last page reached; extract whatever was collected.
2222 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2223 for id in video_ids:
2224 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2227 pagenum = pagenum + 1
# "gvsearch[N|all]:Q" handler for Google Video; structurally a near-copy of
# YoutubeSearchIE, delegating each found docid to the wrapped GoogleIE.
2230 class GoogleSearchIE(InfoExtractor):
2231 """Information Extractor for Google Video search queries."""
2232 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
# %s slots: quoted query, then page/start value — presumably a result
# offset; exact paging semantics not visible here.
2233 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2234 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2235 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2237 _max_google_results = 1000
2239 def __init__(self, google_ie, downloader=None):
2240 InfoExtractor.__init__(self, downloader)
2241 self._google_ie = google_ie
2245 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2247 def report_download_page(self, query, pagenum):
2248 """Report attempt to download playlist page with given number."""
2249 query = query.decode(preferredencoding())
2250 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2252 def _real_initialize(self):
2253 self._google_ie.initialize()
2255 def _real_extract(self, query):
2256 mobj = re.match(self._VALID_QUERY, query)
2258 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2261 prefix, query = query.split(':')
2263 query = query.encode('utf-8')
# Bare "gvsearch:" downloads a single result.
2265 self._download_n_results(query, 1)
2267 elif prefix == 'all':
2268 self._download_n_results(query, self._max_google_results)
2274 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2276 elif n > self._max_google_results:
2277 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2278 n = self._max_google_results
2279 self._download_n_results(query, n)
2281 except ValueError: # parsing prefix as integer fails
2282 self._download_n_results(query, 1)
2285 def _download_n_results(self, query, n):
2286 """Downloads a specified number of results for a query"""
# Track seen docids so duplicates across pages are not downloaded twice.
2289 already_seen = set()
2293 self.report_download_page(query, pagenum)
2294 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2295 request = urllib2.Request(result_url)
2297 page = urllib2.urlopen(request).read()
2298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2299 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2302 # Extract video identifiers
2303 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2304 video_id = mobj.group(1)
2305 if video_id not in already_seen:
2306 video_ids.append(video_id)
2307 already_seen.add(video_id)
2308 if len(video_ids) == n:
2309 # Specified n videos reached
2310 for id in video_ids:
2311 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" span: last result page; flush what was collected.
2314 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2315 for id in video_ids:
2316 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2319 pagenum = pagenum + 1
# "yvsearch[N|all]:Q" handler for Yahoo! Video; same shape as the YouTube
# and Google search IEs, delegating each hit to the wrapped YahooIE.
2322 class YahooSearchIE(InfoExtractor):
2323 """Information Extractor for Yahoo! Video search queries."""
2324 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
# %s slots: quoted query, then the page/offset value.
2325 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2326 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2327 _MORE_PAGES_INDICATOR = r'\s*Next'
2329 _max_yahoo_results = 1000
2331 def __init__(self, yahoo_ie, downloader=None):
2332 InfoExtractor.__init__(self, downloader)
2333 self._yahoo_ie = yahoo_ie
2337 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2339 def report_download_page(self, query, pagenum):
2340 """Report attempt to download playlist page with given number."""
2341 query = query.decode(preferredencoding())
2342 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2344 def _real_initialize(self):
2345 self._yahoo_ie.initialize()
2347 def _real_extract(self, query):
2348 mobj = re.match(self._VALID_QUERY, query)
2350 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2353 prefix, query = query.split(':')
2355 query = query.encode('utf-8')
# Bare "yvsearch:" downloads a single result.
2357 self._download_n_results(query, 1)
2359 elif prefix == 'all':
2360 self._download_n_results(query, self._max_yahoo_results)
2366 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2368 elif n > self._max_yahoo_results:
2369 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2370 n = self._max_yahoo_results
2371 self._download_n_results(query, n)
2373 except ValueError: # parsing prefix as integer fails
2374 self._download_n_results(query, 1)
2377 def _download_n_results(self, query, n):
2378 """Downloads a specified number of results for a query"""
# Dedup set: Yahoo ids are "NNN/NNN" path pairs from the watch URLs.
2381 already_seen = set()
2385 self.report_download_page(query, pagenum)
2386 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2387 request = urllib2.Request(result_url)
2389 page = urllib2.urlopen(request).read()
2390 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2391 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2394 # Extract video identifiers
2395 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2396 video_id = mobj.group(1)
2397 if video_id not in already_seen:
2398 video_ids.append(video_id)
2399 already_seen.add(video_id)
2400 if len(video_ids) == n:
2401 # Specified n videos reached
2402 for id in video_ids:
2403 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker: last page; flush the collected ids.
2406 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2407 for id in video_ids:
2408 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2411 pagenum = pagenum + 1
# Walks a YouTube playlist/artist/user-page listing page by page, collects
# the video ids, then feeds each watch URL to the wrapped YoutubeIE.
2414 class YoutubePlaylistIE(InfoExtractor):
2415 """Information Extractor for YouTube playlists."""
# group(1): list-type prefix (p/a/list), group(2): playlist id,
# group(3): optional single-video id embedded in the URL.
2417 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2418 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2419 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2420 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2423 def __init__(self, youtube_ie, downloader=None):
2424 InfoExtractor.__init__(self, downloader)
2425 self._youtube_ie = youtube_ie
2429 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2431 def report_download_page(self, playlist_id, pagenum):
2432 """Report attempt to download playlist page with given number."""
2433 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2435 def _real_initialize(self):
2436 self._youtube_ie.initialize()
2438 def _real_extract(self, url):
2439 # Extract playlist id
2440 mobj = re.match(self._VALID_URL, url)
2442 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL points at a single video inside the playlist: extract just that one.
2446 if mobj.group(3) is not None:
2447 self._youtube_ie.extract(mobj.group(3))
2450 # Download playlist pages
2451 # prefix is 'p' as default for playlists but there are other types that need extra care
2452 playlist_prefix = mobj.group(1)
2453 if playlist_prefix == 'a':
2454 playlist_access = 'artist'
2456 playlist_prefix = 'p'
2457 playlist_access = 'view_play_list'
2458 playlist_id = mobj.group(2)
2463 self.report_download_page(playlist_id, pagenum)
2464 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2466 page = urllib2.urlopen(request).read()
2467 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2471 # Extract video identifiers
2473 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2474 if mobj.group(1) not in ids_in_page:
2475 ids_in_page.append(mobj.group(1))
2476 video_ids.extend(ids_in_page)
# No "Next" link means the final playlist page has been reached.
2478 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2480 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end (1-based options -> 0-based slice).
2482 playliststart = self._downloader.params.get('playliststart', 1) - 1
2483 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend of -1 this slice drops the
# LAST video (ids[start:-1]); YoutubeUserIE below special-cases -1 but
# this class does not — looks like a bug, confirm intended behavior.
2484 video_ids = video_ids[playliststart:playlistend]
2486 for id in video_ids:
2487 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Downloads all uploads of a YouTube user via the GData API (paged in
# _GDATA_PAGE_SIZE chunks) and delegates each video to the wrapped YoutubeIE.
2491 class YoutubeUserIE(InfoExtractor):
2492 """Information Extractor for YouTube users."""
# Accepts both a profile URL and the "ytuser:NAME" shorthand.
2494 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2495 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2496 _GDATA_PAGE_SIZE = 50
# GData start-index is 1-based.
2497 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2498 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2501 def __init__(self, youtube_ie, downloader=None):
2502 InfoExtractor.__init__(self, downloader)
2503 self._youtube_ie = youtube_ie
2507 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2509 def report_download_page(self, username, start_index):
2510 """Report attempt to download user page."""
2511 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2512 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2514 def _real_initialize(self):
2515 self._youtube_ie.initialize()
2517 def _real_extract(self, url):
2519 mobj = re.match(self._VALID_URL, url)
2521 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2524 username = mobj.group(1)
2526 # Download video ids using YouTube Data API. Result size per
2527 # query is limited (currently to 50 videos) so we need to query
2528 # page by page until there are no video ids - it means we got
2535 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2536 self.report_download_page(username, start_index)
2538 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2541 page = urllib2.urlopen(request).read()
2542 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2543 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2546 # Extract video identifiers
2549 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2550 if mobj.group(1) not in ids_in_page:
2551 ids_in_page.append(mobj.group(1))
2553 video_ids.extend(ids_in_page)
2555 # A little optimization - if current page is not
2556 # "full", ie. does not contain PAGE_SIZE video ids then
2557 # we can assume that this page is the last one - there
2558 # are no more ids on further pages - no need to query
2561 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2566 all_ids_count = len(video_ids)
# --playlist-start/--playlist-end handling; unlike YoutubePlaylistIE,
# the default -1 end is special-cased so the last video is kept.
2567 playliststart = self._downloader.params.get('playliststart', 1) - 1
2568 playlistend = self._downloader.params.get('playlistend', -1)
2570 if playlistend == -1:
2571 video_ids = video_ids[playliststart:]
2573 video_ids = video_ids[playliststart:playlistend]
2575 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2576 (username, all_ids_count, len(video_ids)))
2578 for video_id in video_ids:
2579 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extractor for depositfiles.com file pages: simulates pressing the
# "Free download" button and scrapes the resulting fileshare URL.
2582 class DepositFilesIE(InfoExtractor):
2583 """Information extractor for depositfiles.com"""
# The "(?#locale)" inline comment documents the optional 2-char locale path.
2585 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2587 def __init__(self, downloader=None):
2588 InfoExtractor.__init__(self, downloader)
2592 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2594 def report_download_webpage(self, file_id):
2595 """Report webpage download."""
2596 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2598 def report_extraction(self, file_id):
2599 """Report information extraction."""
2600 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2602 def _real_initialize(self):
2605 def _real_extract(self, url):
2606 # At this point we have a new file
2607 self._downloader.increment_downloads()
2609 file_id = url.split('/')[-1]
2610 # Rebuild url in english locale
2611 url = 'http://depositfiles.com/en/files/' + file_id
2613 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the site's free-download form submit.
2614 free_download_indication = { 'gateway_result' : '1' }
2615 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2617 self.report_download_webpage(file_id)
2618 webpage = urllib2.urlopen(request).read()
2619 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2620 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2623 # Search for the real file URL
2624 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2625 if (mobj is None) or (mobj.group(1) is None):
2626 # Try to figure out reason of the error.
# The site embeds a human-readable restriction notice in a <strong> tag.
2627 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2628 if (mobj is not None) and (mobj.group(1) is not None):
2629 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2630 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2632 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2635 file_url = mobj.group(1)
2636 file_extension = os.path.splitext(file_url)[1][1:]
2638 # Search for file title
2639 mobj = re.search(r'<b title="(.*?)">', webpage)
2641 self._downloader.trouble(u'ERROR: unable to extract title')
2643 file_title = mobj.group(1).decode('utf-8')
2646 # Process file information
2647 self._downloader.process_info({
2648 'id': file_id.decode('utf-8'),
2649 'url': file_url.decode('utf-8'),
2651 'upload_date': u'NA',
2652 'title': file_title,
2653 'stitle': file_title,
2654 'ext': file_extension.decode('utf-8'),
2658 except UnavailableVideoError, err:
2659 self._downloader.trouble(u'ERROR: unable to download file')
# Facebook video extractor: optionally logs in (CLI credentials or .netrc),
# downloads the video page and scrapes metadata plus per-quality URLs out
# of the page's inline JavaScript.
2662 class FacebookIE(InfoExtractor):
2663 """Information Extractor for Facebook"""
2665 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2666 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2667 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this ordering.
2668 _available_formats = ['highqual', 'lowqual']
2669 _video_extensions = {
2674 def __init__(self, downloader=None):
2675 InfoExtractor.__init__(self, downloader)
2679 return (re.match(FacebookIE._VALID_URL, url) is not None)
2681 def _reporter(self, message):
2682 """Add header and report message."""
2683 self._downloader.to_screen(u'[facebook] %s' % message)
2685 def report_login(self):
2686 """Report attempt to log in."""
2687 self._reporter(u'Logging in')
2689 def report_video_webpage_download(self, video_id):
2690 """Report attempt to download video webpage."""
2691 self._reporter(u'%s: Downloading video webpage' % video_id)
2693 def report_information_extraction(self, video_id):
2694 """Report attempt to extract video information."""
2695 self._reporter(u'%s: Extracting video information' % video_id)
2697 def _parse_page(self, video_webpage):
2698 """Extract video information from page"""
# Map of info-dict key -> scraping regex; only keys whose regex matches
# end up in the returned dict, so callers must treat every key as optional.
2700 data = {'title': r'class="video_title datawrap">(.*?)</',
2701 'description': r'<div class="datawrap">(.*?)</div>',
2702 'owner': r'\("video_owner_name", "(.*?)"\)',
2703 'upload_date': r'data-date="(.*?)"',
2704 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2707 for piece in data.keys():
2708 mobj = re.search(data[piece], video_webpage)
2709 if mobj is not None:
# Values arrive JS-escaped (\uXXXX) and percent-encoded; undo both.
2710 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2714 for fmt in self._available_formats:
2715 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2716 if mobj is not None:
2717 # URL is in a Javascript segment inside an escaped Unicode format within
2718 # the generally utf-8 page
2719 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2720 video_info['video_urls'] = video_urls
2724 def _real_initialize(self):
2725 if self._downloader is None:
2730 downloader_params = self._downloader.params
2732 # Attempt to use provided username and password or .netrc data
2733 if downloader_params.get('username', None) is not None:
2734 useremail = downloader_params['username']
2735 password = downloader_params['password']
2736 elif downloader_params.get('usenetrc', False):
2738 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2739 if info is not None:
2743 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2744 except (IOError, netrc.NetrcParseError), err:
2745 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: proceed anonymously (login is optional).
2748 if useremail is None:
2757 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2760 login_results = urllib2.urlopen(request).read()
# If the response still contains the login form, authentication failed.
2761 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2762 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2764 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2765 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2768 def _real_extract(self, url):
2769 mobj = re.match(self._VALID_URL, url)
2771 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2773 video_id = mobj.group('ID')
2776 self.report_video_webpage_download(video_id)
2777 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2779 page = urllib2.urlopen(request)
2780 video_webpage = page.read()
2781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2782 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2785 # Start extracting information
2786 self.report_information_extraction(video_id)
2788 # Extract information
2789 video_info = self._parse_page(video_webpage)
# 'owner' and 'title' are mandatory; thumbnail/upload_date/description
# degrade gracefully to defaults below.
2792 if 'owner' not in video_info:
2793 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2795 video_uploader = video_info['owner']
2798 if 'title' not in video_info:
2799 self._downloader.trouble(u'ERROR: unable to extract video title')
2801 video_title = video_info['title']
2802 video_title = video_title.decode('utf-8')
2803 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse disallowed chars to '_' and trim.
2806 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2807 simple_title = simple_title.strip(ur'_')
2810 if 'thumbnail' not in video_info:
2811 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2812 video_thumbnail = ''
2814 video_thumbnail = video_info['thumbnail']
2818 if 'upload_date' in video_info:
2819 upload_time = video_info['upload_date']
# Page date is an RFC-2822-style string; normalize to YYYYMMDD.
2820 timetuple = email.utils.parsedate_tz(upload_time)
2821 if timetuple is not None:
2823 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2828 video_description = video_info.get('description', 'No description available.')
2830 url_map = video_info['video_urls']
2831 if len(url_map.keys()) > 0:
2832 # Decide which formats to download
2833 req_format = self._downloader.params.get('format', None)
2834 format_limit = self._downloader.params.get('format_limit', None)
# --max-quality caps the candidate list at the requested quality.
2836 if format_limit is not None and format_limit in self._available_formats:
2837 format_list = self._available_formats[self._available_formats.index(format_limit):]
2839 format_list = self._available_formats
2840 existing_formats = [x for x in format_list if x in url_map]
2841 if len(existing_formats) == 0:
2842 self._downloader.trouble(u'ERROR: no known formats available for video')
2844 if req_format is None:
2845 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2846 elif req_format == '-1':
2847 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2850 if req_format not in url_map:
2851 self._downloader.trouble(u'ERROR: requested format not available')
2853 video_url_list = [(req_format, url_map[req_format])] # Specific format
2855 for format_param, video_real_url in video_url_list:
2857 # At this point we have a new video
2858 self._downloader.increment_downloads()
2861 video_extension = self._video_extensions.get(format_param, 'mp4')
2864 # Process video information
2865 self._downloader.process_info({
2866 'id': video_id.decode('utf-8'),
2867 'url': video_real_url.decode('utf-8'),
2868 'uploader': video_uploader.decode('utf-8'),
2869 'upload_date': upload_date,
2870 'title': video_title,
2871 'stitle': simple_title,
2872 'ext': video_extension.decode('utf-8'),
# Old-style "cond and a or b" conditional; format_param is never None
# here in practice since it comes from video_url_list tuples.
2873 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2874 'thumbnail': video_thumbnail.decode('utf-8'),
2875 'description': video_description.decode('utf-8'),
2878 except UnavailableVideoError, err:
2879 self._downloader.trouble(u'\nERROR: unable to download video')
# blip.tv extractor: appends skin=json to the original URL and reads the
# whole metadata set from the site's JSON API instead of scraping HTML.
2881 class BlipTVIE(InfoExtractor):
2882 """Information extractor for blip.tv"""
2884 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2885 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2889 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2891 def report_extraction(self, file_id):
2892 """Report information extraction."""
2893 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2895 def _simplify_title(self, title):
# Same filesystem-safe-title transform used by the other extractors.
2896 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2897 res = res.strip(ur'_')
2900 def _real_extract(self, url):
2901 mobj = re.match(self._VALID_URL, url)
2903 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') depends on whether the URL already has a query string.
2910 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2911 request = urllib2.Request(json_url)
2912 self.report_extraction(mobj.group(1))
2914 json_code = urllib2.urlopen(request).read()
2915 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2916 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# json may be stdlib json or the bundled trivialjson fallback (see header).
2919 json_data = json.loads(json_code)
2920 if 'Post' in json_data:
2921 data = json_data['Post']
# blip.tv datestamps look like "MM-DD-YY HH:MM(am|pm)"; normalize to YYYYMMDD.
2925 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2926 video_url = data['media']['url']
2927 umobj = re.match(self._URL_EXT, video_url)
2929 raise ValueError('Can not determine filename extension')
2930 ext = umobj.group(1)
2932 self._downloader.increment_downloads()
2935 'id': data['item_id'],
2937 'uploader': data['display_name'],
2938 'upload_date': upload_date,
2939 'title': data['title'],
2940 'stitle': self._simplify_title(data['title']),
2942 'format': data['media']['mimeType'],
2943 'thumbnail': data['thumbnailUrl'],
2944 'description': data['description'],
2945 'player_url': data['embedUrl']
# KeyError from missing JSON fields and ValueError from strptime/ext
# detection are both reported as a parse failure.
2947 except (ValueError,KeyError), err:
2948 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2952 self._downloader.process_info(info)
2953 except UnavailableVideoError, err:
2954 self._downloader.trouble(u'\nERROR: unable to download video')
# Base class for post-download processing steps (see FFmpegExtractAudioPP
# below for a concrete subclass).
2957 class PostProcessor(object):
2958 """Post Processor class.
2960 PostProcessor objects can be added to downloaders with their
2961 add_post_processor() method. When the downloader has finished a
2962 successful download, it will take its internal chain of PostProcessors
2963 and start calling the run() method on each one of them, first with
2964 an initial argument and then with the returned value of the previous
2967 The chain will be stopped if one of them ever returns None or the end
2968 of the chain is reached.
2970 PostProcessor objects follow a "mutual registration" process similar
2971 to InfoExtractor objects.
2976 def __init__(self, downloader=None):
2977 self._downloader = downloader
2979 def set_downloader(self, downloader):
2980 """Sets the downloader for this PP."""
2981 self._downloader = downloader
2983 def run(self, information):
2984 """Run the PostProcessor.
2986 The "information" argument is a dictionary like the ones
2987 composed by InfoExtractors. The only difference is that this
2988 one has an extra field called "filepath" that points to the
2991 When this method returns None, the postprocessing chain is
2992 stopped. However, this method may return an information
2993 dictionary that will be passed to the next postprocessing
2994 object in the chain. It can be the one it received after
2995 changing some fields.
2997 In addition, this method may raise a PostProcessingError
2998 exception that will be taken into account by the downloader
# Base implementation is the identity: pass the info dict through unchanged.
3001 return information # by default, do nothing
# Post-processor that extracts the audio track of a downloaded video with
# ffmpeg/ffprobe, keeping the stream lossless when the source codec already
# matches the preference ('best' keeps aac/mp3 as-is).
3004 class FFmpegExtractAudioPP(PostProcessor):
3006 def __init__(self, downloader=None, preferredcodec=None):
3007 PostProcessor.__init__(self, downloader)
3008 if preferredcodec is None:
3009 preferredcodec = 'best'
3010 self._preferredcodec = preferredcodec
3013 def get_audio_codec(path):
# Probe the file with ffprobe; returns the audio codec name, or
# (presumably) None when ffprobe is missing or fails — the error-return
# lines are not visible here.
3015 cmd = ['ffprobe', '-show_streams', '--', path]
3016 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3017 output = handle.communicate()[0]
3018 if handle.wait() != 0:
3020 except (IOError, OSError):
# Scan ffprobe's stream dump: remember each codec_name= and accept it
# once the matching codec_type=audio line confirms it is the audio stream.
3023 for line in output.split('\n'):
3024 if line.startswith('codec_name='):
3025 audio_codec = line.split('=')[1].strip()
3026 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3031 def run_ffmpeg(path, out_path, codec, more_opts):
# Run the actual transcode; '--' guards against paths starting with '-'.
3033 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3034 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3036 except (IOError, OSError):
3039 def run(self, information):
3040 path = information['filepath']
3042 filecodec = self.get_audio_codec(path)
3043 if filecodec is None:
3044 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3048 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3049 if filecodec == 'aac' or filecodec == 'mp3':
3050 # Lossless if possible
3052 extension = filecodec
# aac streams need the ADTS container when written standalone.
3053 if filecodec == 'aac':
3054 more_opts = ['-f', 'adts']
3057 acodec = 'libmp3lame'
3059 more_opts = ['-ab', '128k']
3061 # We convert the audio (lossy)
3062 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3063 extension = self._preferredcodec
3064 more_opts = ['-ab', '128k']
3065 if self._preferredcodec == 'aac':
3066 more_opts += ['-f', 'adts']
# Output file: same path with the audio extension substituted.
3068 (prefix, ext) = os.path.splitext(path)
3069 new_path = prefix + '.' + extension
3070 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3071 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3074 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Removal of the original video file is best-effort only.
3079 except (IOError, OSError):
3080 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the new audio file for the rest of the PP chain.
3083 information['filepath'] = new_path
3087 def updateSelf(downloader, filename):
3088 ''' Update the program file with the latest version from the repository '''
3089 # Note: downloader only used for options
# Fail early if the script file itself cannot be rewritten.
3090 if not os.access(filename, os.W_OK):
3091 sys.exit('ERROR: no write permissions on %s' % filename)
3093 downloader.to_screen('Updating to latest version...')
# Fetch the replacement script from the fixed UPDATE_URL (see file header).
3097 urlh = urllib.urlopen(UPDATE_URL)
3098 newcontent = urlh.read()
3101 except (IOError, OSError), err:
3102 sys.exit('ERROR: unable to download latest version')
# Overwrite this very script in place ('wb' to avoid newline translation).
3105 outf = open(filename, 'wb')
3107 outf.write(newcontent)
3110 except (IOError, OSError), err:
3111 sys.exit('ERROR: unable to overwrite current version')
3113 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Nested optparse help formatter: renders an option as "-o, --option METAVAR".
3120 def _format_option_string(option):
3121 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Prefer the first short opt, then the first long opt, comma-separated.
3125 if option._short_opts: opts.append(option._short_opts[0])
3126 if option._long_opts: opts.append(option._long_opts[0])
3127 if len(opts) > 1: opts.insert(1, ', ')
3129 if option.takes_value(): opts.append(' %s' % option.metavar)
3131 return "".join(opts)
3133 def _find_term_columns():
3134 columns = os.environ.get('COLUMNS', None)
3139 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3140 out,err = sp.communicate()
3141 return int(out.split()[1])
3147 max_help_position = 80
3149 # No need to wrap help messages if we're on a wide console
3150 columns = _find_term_columns()
3151 if columns: max_width = columns
3153 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3154 fmt.format_option_strings = _format_option_string
3157 'version' : __version__,
3159 'usage' : '%prog [options] url...',
3160 'conflict_handler' : 'resolve',
3163 parser = optparse.OptionParser(**kw)
3166 general = optparse.OptionGroup(parser, 'General Options')
3167 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3168 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3169 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3170 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3171 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3173 general.add_option('-h', '--help',
3174 action='help', help='print this help text and exit')
3175 general.add_option('-v', '--version',
3176 action='version', help='print program version and exit')
3177 general.add_option('-U', '--update',
3178 action='store_true', dest='update_self', help='update this program to latest version')
3179 general.add_option('-i', '--ignore-errors',
3180 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3181 general.add_option('-r', '--rate-limit',
3182 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3183 general.add_option('-R', '--retries',
3184 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3185 general.add_option('--playlist-start',
3186 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3187 general.add_option('--playlist-end',
3188 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3189 general.add_option('--dump-user-agent',
3190 action='store_true', dest='dump_user_agent',
3191 help='display the current browser identification', default=False)
3193 authentication.add_option('-u', '--username',
3194 dest='username', metavar='USERNAME', help='account username')
3195 authentication.add_option('-p', '--password',
3196 dest='password', metavar='PASSWORD', help='account password')
3197 authentication.add_option('-n', '--netrc',
3198 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3201 video_format.add_option('-f', '--format',
3202 action='store', dest='format', metavar='FORMAT', help='video format code')
3203 video_format.add_option('--all-formats',
3204 action='store_const', dest='format', help='download all available video formats', const='-1')
3205 video_format.add_option('--max-quality',
3206 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3209 verbosity.add_option('-q', '--quiet',
3210 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3211 verbosity.add_option('-s', '--simulate',
3212 action='store_true', dest='simulate', help='do not download video', default=False)
3213 verbosity.add_option('-g', '--get-url',
3214 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3215 verbosity.add_option('-e', '--get-title',
3216 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3217 verbosity.add_option('--get-thumbnail',
3218 action='store_true', dest='getthumbnail',
3219 help='simulate, quiet but print thumbnail URL', default=False)
3220 verbosity.add_option('--get-description',
3221 action='store_true', dest='getdescription',
3222 help='simulate, quiet but print video description', default=False)
3223 verbosity.add_option('--get-filename',
3224 action='store_true', dest='getfilename',
3225 help='simulate, quiet but print output filename', default=False)
3226 verbosity.add_option('--no-progress',
3227 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3228 verbosity.add_option('--console-title',
3229 action='store_true', dest='consoletitle',
3230 help='display progress in console titlebar', default=False)
3233 filesystem.add_option('-t', '--title',
3234 action='store_true', dest='usetitle', help='use title in file name', default=False)
3235 filesystem.add_option('-l', '--literal',
3236 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3237 filesystem.add_option('-A', '--auto-number',
3238 action='store_true', dest='autonumber',
3239 help='number downloaded files starting from 00000', default=False)
3240 filesystem.add_option('-o', '--output',
3241 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3242 filesystem.add_option('-a', '--batch-file',
3243 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3244 filesystem.add_option('-w', '--no-overwrites',
3245 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3246 filesystem.add_option('-c', '--continue',
3247 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3248 filesystem.add_option('--cookies',
3249 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3250 filesystem.add_option('--no-part',
3251 action='store_true', dest='nopart', help='do not use .part files', default=False)
3252 filesystem.add_option('--no-mtime',
3253 action='store_false', dest='updatetime',
3254 help='do not use the Last-modified header to set the file modification time', default=True)
3255 filesystem.add_option('--write-description',
3256 action='store_true', dest='writedescription',
3257 help='write video description to a .description file', default=False)
3258 filesystem.add_option('--write-info-json',
3259 action='store_true', dest='writeinfojson',
3260 help='write video metadata to a .info.json file', default=False)
3263 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3264 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3265 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3266 help='"best", "aac" or "mp3"; best by default')
3269 parser.add_option_group(general)
3270 parser.add_option_group(filesystem)
3271 parser.add_option_group(verbosity)
3272 parser.add_option_group(video_format)
3273 parser.add_option_group(authentication)
3274 parser.add_option_group(postproc)
3276 opts, args = parser.parse_args()
3278 return parser, opts, args
# Body of main() up to URL collection (the def line lies outside this
# excerpt). Restored lines lost in the mangled listing: the else branch
# around MozillaCookieJar, jar.load(), sys.exit(0) after the user-agent
# dump, batchurls = [], and the batch-file try/except scaffolding.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
else:
	try:
		jar = cookielib.MozillaCookieJar(opts.cookiefile)
		# Only load when the file already exists and is readable;
		# otherwise start empty and create it on save.
		if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
			jar.load()
	except (IOError, OSError):
		sys.exit(u'ERROR: unable to open cookie file')

# Dump user agent
if opts.dump_user_agent:
	print(std_headers['User-Agent'])
	sys.exit(0)

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# Batch file verification
batchurls = []
if opts.batchfile is not None:
	try:
		if opts.batchfile == '-':
			batchfd = sys.stdin
		else:
			batchfd = open(opts.batchfile, 'r')
		batchurls = batchfd.readlines()
		batchurls = [x.strip() for x in batchurls]
		# Skip blank lines and comment lines (#, /, ; prefixes).
		batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
	except IOError:
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
# Conflicting, missing and erroneous options (inside main()).
# Restored the try: lines dropped by the mangled listing around the
# retries / playliststart / playlistend conversions.
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	# Prompt interactively rather than requiring the password on the
	# command line (where it would show up in the process list).
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
	try:
		opts.retries = long(opts.retries)
	except (TypeError, ValueError):
		parser.error(u'invalid retry count specified')
try:
	opts.playliststart = int(opts.playliststart)
	if opts.playliststart <= 0:
		raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError):
	parser.error(u'invalid playlist start number specified')
try:
	opts.playlistend = int(opts.playlistend)
	# -1 is the sentinel for "until the end of the playlist".
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
		raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError):
	parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
	if opts.audioformat not in ['best', 'aac', 'mp3']:
		parser.error(u'invalid audio format specified')
# Information extractors (inside main()). Order here is construction
# order only; lookup precedence is set by add_info_extractor below.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
bliptv_ie = BlipTVIE()
vimeo_ie = VimeoIE()
generic_ie = GenericIE()

# File downloader. Restored the closing }) dropped by the mangled listing.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	# Any of the "get-*" simulation flags implies quiet output.
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'forcefilename': opts.getfilename,
	# Any of the "get-*" flags also implies simulation (no download).
	'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
	'format': opts.format,
	'format_limit': opts.format_limit,
	# First matching template wins: explicit -o, then the flag-derived
	# defaults, finally plain "%(id)s.%(ext)s".
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	'logtostderr': opts.outtmpl == '-',
	'consoletitle': opts.consoletitle,
	'nopart': opts.nopart,
	'updatetime': opts.updatetime,
	'writedescription': opts.writedescription,
	'writeinfojson': opts.writeinfojson,
	})
# Register extractors in lookup order: most specific first (inside main()).
# Restored lines lost in the mangled listing: the else: sys.exit() branch,
# the try: around jar.save(), and the final sys.exit(retcode).
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)
fd.add_info_extractor(bliptv_ie)
fd.add_info_extractor(vimeo_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# PostProcessors
if opts.extractaudio:
	fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))

# Update version
if opts.update_self:
	updateSelf(fd, sys.argv[0])

# Maybe do nothing
if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')
	else:
		# A bare "-U" run is legitimate: update and quit.
		sys.exit()

retcode = fd.download(all_urls)

# Dump cookie jar if requested
if opts.cookiefile is not None:
	try:
		jar.save()
	except (IOError, OSError):
		sys.exit(u'ERROR: unable to save cookie jar')

sys.exit(retcode)
if __name__ == '__main__':
	# Top-level boundary: translate the known failure modes into exit
	# codes/messages. Restored the try: main() and sys.exit(1) lines
	# dropped by the mangled listing.
	try:
		main()
	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3469 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: