2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # Author: Philipp Hagemeister <phihag@phihag.de>
11 # License: Public domain code
38 except ImportError: # Python 2.4
41 import cStringIO as StringIO
45 # parse_qs was moved from the cgi module to the urlparse module recently.
47 from urlparse import parse_qs
49 from cgi import parse_qs
53 except ImportError: # Python < 2.6
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
64 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
68 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
74 def raiseError(msg, i):
75 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
76 def skipSpace(i, expectMore=True):
77 while i < len(s) and s[i] in ' \t\r\n':
81 raiseError('Premature end', i)
83 def decodeEscape(match):
99 return unichr(int(esc[1:5], 16))
100 if len(esc) == 5+6 and esc[5:7] == '\\u':
101 hi = int(esc[1:5], 16)
102 low = int(esc[7:11], 16)
103 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
104 raise ValueError('Unknown escape ' + str(esc))
111 while s[e-bslashes-1] == '\\':
113 if bslashes % 2 == 1:
117 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
118 stri = rexp.sub(decodeEscape, s[i:e])
124 if s[i] == '}': # Empty dictionary
128 raiseError('Expected a string object key', i)
129 i,key = parseString(i)
131 if i >= len(s) or s[i] != ':':
132 raiseError('Expected a colon', i)
139 raiseError('Expected comma or closing curly brace', i)
144 if s[i] == ']': # Empty array
149 i = skipSpace(i) # Raise exception if premature end
153 raiseError('Expected a comma or closing bracket', i)
155 def parseDiscrete(i):
156 for k,v in {'true': True, 'false': False, 'null': None}.items():
157 if s.startswith(k, i):
159 raiseError('Not a boolean (or null)', i)
161 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
163 raiseError('Not a number', i)
165 if '.' in nums or 'e' in nums or 'E' in nums:
166 return (i+len(nums), float(nums))
167 return (i+len(nums), int(nums))
168 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
171 i,res = CHARMAP.get(s[i], parseNumber)(i)
172 i = skipSpace(i, False)
176 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original spun up a one-shot generator and called .next() just to
	# return a single value, with a bare except: that also swallowed
	# KeyboardInterrupt. A plain try/except with the same fallback is clearer.
	try:
		pref = locale.getpreferredencoding()
		# Probe the codec name; an unusable name falls back to UTF-8 below.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference: decimal (&#233;) or hexadecimal (&#xE9;).
	# The previous pattern #(x?\d+) could not match the hex digits a-f, so
	# entities such as &#xE9; were returned literally instead of decoded.
	mobj = re.match(u'(?u)#(?:x([0-9a-fA-F]+)|([0-9]+))', entity)
	if mobj is not None:
		if mobj.group(1) is not None:
			return unichr(long(mobj.group(1), 16))
		return unichr(long(mobj.group(2), 10))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
221 def sanitize_title(utitle):
222 """Sanitizes a video title so it could be used as part of a filename."""
223 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
224 return utitle.replace(unicode(os.sep), u'%')
226 def sanitize_open(filename, open_mode):
227 """Try to open the given filename, and slightly tweak it if this fails.
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
234 It returns the tuple (stream, definitive_file_name).
238 if sys.platform == 'win32':
240 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241 return (sys.stdout, filename)
242 stream = open(filename, open_mode)
243 return (stream, filename)
244 except (IOError, OSError), err:
245 # In case of error, try to remove win32 forbidden chars
246 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
248 # An exception here should be caught in the caller
249 stream = open(filename, open_mode)
250 return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is None:
		# The string could not be parsed as an RFC 2822 date.
		return None
	return email.utils.mktime_tz(timetuple)
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both attributes are byte counts.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Try raw deflate first, then fall back to a zlib-wrapped stream.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer urllib2 versions accept the status code in the constructor.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Install the standard headers, replacing any already present.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Honor the "no compression" marker and strip it from the request.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip-compressed body
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate-compressed body
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
366 class FileDownloader(object):
367 """File Downloader class.
369 File downloader objects are the ones responsible of downloading the
370 actual video file and writing it to disk if the user has requested
371 it, among some other tasks. In most cases there should be one per
372 program. As, given a video URL, the downloader doesn't know how to
373 extract all the needed information, task that InfoExtractors do, it
374 has to pass the URL to one of them.
376 For this, file downloader objects have a method that allows
377 InfoExtractors to be registered in a given order. When it is passed
378 a URL, the file downloader handles it to the first InfoExtractor it
379 finds that reports being able to handle it. The InfoExtractor extracts
380 all the information about the video or videos the URL refers to, and
381 asks the FileDownloader to process the video information, possibly
382 downloading the video.
384 File downloaders accept a lot of parameters. In order not to saturate
385 the object constructor with arguments, it receives a dictionary of
386 options instead. These options are available through the params
387 attribute for the InfoExtractors to use. The FileDownloader also
388 registers itself as the downloader in charge for the InfoExtractors
389 that are added to it, so this is a "mutual registration".
393 username: Username for authentication purposes.
394 password: Password for authentication purposes.
395 usenetrc: Use netrc for authentication instead.
396 quiet: Do not print messages to stdout.
397 forceurl: Force printing final URL.
398 forcetitle: Force printing title.
399 forcethumbnail: Force printing thumbnail URL.
400 forcedescription: Force printing description.
401 forcefilename: Force printing final filename.
402 simulate: Do not download the video files.
403 format: Video format code.
404 format_limit: Highest quality format to try.
405 outtmpl: Template for output names.
406 ignoreerrors: Do not stop on download errors.
407 ratelimit: Download speed limit, in bytes/sec.
408 nooverwrites: Prevent overwriting files.
409 retries: Number of times to retry for HTTP error 5xx
410 continuedl: Try to continue downloads if possible.
411 noprogress: Do not print the progress bar.
412 playliststart: Playlist item to start at.
413 playlistend: Playlist item to end at.
414 logtostderr: Log messages to stderr instead of stdout.
415 consoletitle: Display progress in console window's titlebar.
416 nopart: Do not use temporary .part files.
417 updatetime: Use the Last-modified header to set output file timestamps.
418 writedescription: Write the video description to a .description file
419 writeinfojson: Write the video description to a .info.json file
425 _download_retcode = None
426 _num_downloads = None
429 def __init__(self, params):
430 """Create a FileDownloader object with the given options."""
433 self._download_retcode = 0
434 self._num_downloads = 0
435 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
439 def pmkdir(filename):
440 """Create directory components in filename. Similar to Unix "mkdir -p"."""
441 components = filename.split(os.sep)
442 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
443 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
444 for dir in aggregate:
445 if not os.path.exists(dir):
449 def format_bytes(bytes):
452 if type(bytes) is str:
457 exponent = long(math.log(bytes, 1024.0))
458 suffix = 'bkMGTPEZY'[exponent]
459 converted = float(bytes) / float(1024**exponent)
460 return '%.2f%s' % (converted, suffix)
463 def calc_percent(byte_counter, data_len):
466 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
469 def calc_eta(start, now, total, current):
473 if current == 0 or dif < 0.001: # One millisecond
475 rate = float(current) / dif
476 eta = long((float(total) - float(current)) / rate)
477 (eta_mins, eta_secs) = divmod(eta, 60)
480 return '%02d:%02d' % (eta_mins, eta_secs)
483 def calc_speed(start, now, bytes):
485 if bytes == 0 or dif < 0.001: # One millisecond
486 return '%10s' % '---b/s'
487 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
490 def best_block_size(elapsed_time, bytes):
491 new_min = max(bytes / 2.0, 1.0)
492 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
493 if elapsed_time < 0.001:
495 rate = bytes / elapsed_time
503 def parse_bytes(bytestr):
504 """Parse a string indicating a byte quantity into a long integer."""
505 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
508 number = float(matchobj.group(1))
509 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
510 return long(round(number * multiplier))
512 def add_info_extractor(self, ie):
513 """Add an InfoExtractor object to the end of the list."""
515 ie.set_downloader(self)
517 def add_post_processor(self, pp):
518 """Add a PostProcessor object to the end of the chain."""
520 pp.set_downloader(self)
522 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
523 """Print message to stdout if not in quiet mode."""
525 if not self.params.get('quiet', False):
526 terminator = [u'\n', u''][skip_eol]
527 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
528 self._screen_file.flush()
529 except (UnicodeEncodeError), err:
530 if not ignore_encoding_errors:
533 def to_stderr(self, message):
534 """Print message to stderr."""
535 print >>sys.stderr, message.encode(preferredencoding())
537 def to_cons_title(self, message):
538 """Set console/terminal window title to message."""
539 if not self.params.get('consoletitle', False):
541 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
542 # c_wchar_p() might not be necessary if `message` is
543 # already of type unicode()
544 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
545 elif 'TERM' in os.environ:
546 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
548 def fixed_template(self):
549 """Checks if the output template is fixed."""
550 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
552 def trouble(self, message=None):
553 """Determine action to take when a download problem appears.
555 Depending on if the downloader has been configured to ignore
556 download errors or not, this method may throw an exception or
557 not when errors are found, after printing the message.
559 if message is not None:
560 self.to_stderr(message)
561 if not self.params.get('ignoreerrors', False):
562 raise DownloadError(message)
563 self._download_retcode = 1
565 def slow_down(self, start_time, byte_counter):
566 """Sleep if the download speed is over the rate limit."""
567 rate_limit = self.params.get('ratelimit', None)
568 if rate_limit is None or byte_counter == 0:
571 elapsed = now - start_time
574 speed = float(byte_counter) / elapsed
575 if speed > rate_limit:
576 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
578 def temp_name(self, filename):
579 """Returns a temporary filename for the given filename."""
580 if self.params.get('nopart', False) or filename == u'-' or \
581 (os.path.exists(filename) and not os.path.isfile(filename)):
583 return filename + u'.part'
585 def undo_temp_name(self, filename):
586 if filename.endswith(u'.part'):
587 return filename[:-len(u'.part')]
590 def try_rename(self, old_filename, new_filename):
592 if old_filename == new_filename:
594 os.rename(old_filename, new_filename)
595 except (IOError, OSError), err:
596 self.trouble(u'ERROR: unable to rename file')
598 def try_utime(self, filename, last_modified_hdr):
599 """Try to set the last-modified time of the given file."""
600 if last_modified_hdr is None:
602 if not os.path.isfile(filename):
604 timestr = last_modified_hdr
607 filetime = timeconvert(timestr)
611 os.utime(filename,(time.time(), filetime))
615 def report_writedescription(self, descfn):
616 """ Report that the description file is being written """
617 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
619 def report_writeinfojson(self, infofn):
620 """ Report that the metadata file has been written """
621 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
623 def report_destination(self, filename):
624 """Report destination filename."""
625 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
627 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
628 """Report download progress."""
629 if self.params.get('noprogress', False):
631 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
632 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
633 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
634 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
636 def report_resuming_byte(self, resume_len):
637 """Report attempt to resume at given byte."""
638 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
640 def report_retry(self, count, retries):
641 """Report retry in case of HTTP error 5xx"""
642 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
644 def report_file_already_downloaded(self, file_name):
645 """Report file has already been fully downloaded."""
647 self.to_screen(u'[download] %s has already been downloaded' % file_name)
648 except (UnicodeEncodeError), err:
649 self.to_screen(u'[download] The file has already been downloaded')
651 def report_unable_to_resume(self):
652 """Report it was impossible to resume download."""
653 self.to_screen(u'[download] Unable to resume')
655 def report_finish(self):
656 """Report download finished."""
657 if self.params.get('noprogress', False):
658 self.to_screen(u'[download] Download completed')
662 def increment_downloads(self):
663 """Increment the ordinal that assigns a number to each file."""
664 self._num_downloads += 1
666 def prepare_filename(self, info_dict):
667 """Generate the output filename."""
669 template_dict = dict(info_dict)
670 template_dict['epoch'] = unicode(long(time.time()))
671 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
672 filename = self.params['outtmpl'] % template_dict
674 except (ValueError, KeyError), err:
675 self.trouble(u'ERROR: invalid system charset or erroneous output template')
678 def process_info(self, info_dict):
679 """Process a single dictionary returned by an InfoExtractor."""
680 filename = self.prepare_filename(info_dict)
681 # Do nothing else if in simulate mode
682 if self.params.get('simulate', False):
684 if self.params.get('forcetitle', False):
685 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
686 if self.params.get('forceurl', False):
687 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
688 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
689 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
690 if self.params.get('forcedescription', False) and 'description' in info_dict:
691 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
692 if self.params.get('forcefilename', False) and filename is not None:
693 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
699 if self.params.get('nooverwrites', False) and os.path.exists(filename):
700 self.to_stderr(u'WARNING: file exists and will be skipped')
704 self.pmkdir(filename)
705 except (OSError, IOError), err:
706 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
709 if self.params.get('writedescription', False):
711 descfn = filename + '.description'
712 self.report_writedescription(descfn)
713 descfile = open(descfn, 'wb')
715 descfile.write(info_dict['description'].encode('utf-8'))
718 except (OSError, IOError):
719 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
722 if self.params.get('writeinfojson', False):
723 infofn = filename + '.info.json'
724 self.report_writeinfojson(infofn)
727 except (NameError,AttributeError):
728 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
731 infof = open(infofn, 'wb')
733 json.dump(info_dict, infof)
736 except (OSError, IOError):
737 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
741 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
742 except (OSError, IOError), err:
743 raise UnavailableVideoError
744 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
745 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
747 except (ContentTooShortError, ), err:
748 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
753 self.post_process(filename, info_dict)
754 except (PostProcessingError), err:
755 self.trouble(u'ERROR: postprocessing: %s' % str(err))
758 def download(self, url_list):
759 """Download a given list of URLs."""
760 if len(url_list) > 1 and self.fixed_template():
761 raise SameFileError(self.params['outtmpl'])
764 suitable_found = False
766 # Go to next InfoExtractor if not suitable
767 if not ie.suitable(url):
770 # Suitable InfoExtractor found
771 suitable_found = True
773 # Extract information from URL and process it
776 # Suitable InfoExtractor had been found; go to next URL
779 if not suitable_found:
780 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
782 return self._download_retcode
784 def post_process(self, filename, ie_info):
785 """Run the postprocessing chain on the given file."""
787 info['filepath'] = filename
793 def _download_with_rtmpdump(self, filename, url, player_url):
794 self.report_destination(filename)
795 tmpfilename = self.temp_name(filename)
797 # Check for rtmpdump first
799 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
800 except (OSError, IOError):
801 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
804 # Download using rtmpdump. rtmpdump returns exit code 2 when
805 # the connection was interrumpted and resuming appears to be
806 # possible. This is part of rtmpdump's normal usage, AFAIK.
807 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
808 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
809 while retval == 2 or retval == 1:
810 prevsize = os.path.getsize(tmpfilename)
811 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
812 time.sleep(5.0) # This seems to be needed
813 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
814 cursize = os.path.getsize(tmpfilename)
815 if prevsize == cursize and retval == 1:
818 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
819 self.try_rename(tmpfilename, filename)
822 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
825 def _do_download(self, filename, url, player_url):
826 # Check file already present
827 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
828 self.report_file_already_downloaded(filename)
831 # Attempt to download using rtmpdump
832 if url.startswith('rtmp'):
833 return self._download_with_rtmpdump(filename, url, player_url)
835 tmpfilename = self.temp_name(filename)
839 # Do not include the Accept-Encoding header
840 headers = {'Youtubedl-no-compression': 'True'}
841 basic_request = urllib2.Request(url, None, headers)
842 request = urllib2.Request(url, None, headers)
844 # Establish possible resume length
845 if os.path.isfile(tmpfilename):
846 resume_len = os.path.getsize(tmpfilename)
850 # Request parameters in case of being able to resume
851 if self.params.get('continuedl', False) and resume_len != 0:
852 self.report_resuming_byte(resume_len)
853 request.add_header('Range','bytes=%d-' % resume_len)
857 retries = self.params.get('retries', 0)
858 while count <= retries:
859 # Establish connection
861 data = urllib2.urlopen(request)
863 except (urllib2.HTTPError, ), err:
864 if (err.code < 500 or err.code >= 600) and err.code != 416:
865 # Unexpected HTTP error
867 elif err.code == 416:
868 # Unable to resume (requested range not satisfiable)
870 # Open the connection again without the range header
871 data = urllib2.urlopen(basic_request)
872 content_length = data.info()['Content-Length']
873 except (urllib2.HTTPError, ), err:
874 if err.code < 500 or err.code >= 600:
877 # Examine the reported length
878 if (content_length is not None and
879 (resume_len - 100 < long(content_length) < resume_len + 100)):
880 # The file had already been fully downloaded.
881 # Explanation to the above condition: in issue #175 it was revealed that
882 # YouTube sometimes adds or removes a few bytes from the end of the file,
883 # changing the file size slightly and causing problems for some users. So
884 # I decided to implement a suggested change and consider the file
885 # completely downloaded if the file size differs less than 100 bytes from
886 # the one in the hard drive.
887 self.report_file_already_downloaded(filename)
888 self.try_rename(tmpfilename, filename)
891 # The length does not match, we start the download over
892 self.report_unable_to_resume()
898 self.report_retry(count, retries)
901 self.trouble(u'ERROR: giving up after %s retries' % retries)
904 data_len = data.info().get('Content-length', None)
905 if data_len is not None:
906 data_len = long(data_len) + resume_len
907 data_len_str = self.format_bytes(data_len)
908 byte_counter = 0 + resume_len
914 data_block = data.read(block_size)
916 if len(data_block) == 0:
918 byte_counter += len(data_block)
920 # Open file just in time
923 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
924 filename = self.undo_temp_name(tmpfilename)
925 self.report_destination(filename)
926 except (OSError, IOError), err:
927 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
930 stream.write(data_block)
931 except (IOError, OSError), err:
932 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
934 block_size = self.best_block_size(after - before, len(data_block))
937 percent_str = self.calc_percent(byte_counter, data_len)
938 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
939 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
940 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
943 self.slow_down(start, byte_counter - resume_len)
947 if data_len is not None and byte_counter != data_len:
948 raise ContentTooShortError(byte_counter, long(data_len))
949 self.try_rename(tmpfilename, filename)
951 # Update file modification time
952 if self.params.get('updatetime', True):
953 self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if not self._ready:
			self._real_initialize()
			self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1028 class YoutubeIE(InfoExtractor):
1029 """Information extractor for youtube.com."""
1031 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1032 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1033 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1034 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1035 _NETRC_MACHINE = 'youtube'
1036 # Listed in order of quality
1037 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
1038 _video_extensions = {
1044 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1051 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1053 def report_lang(self):
1054 """Report attempt to set language."""
1055 self._downloader.to_screen(u'[youtube] Setting language')
1057 def report_login(self):
1058 """Report attempt to log in."""
1059 self._downloader.to_screen(u'[youtube] Logging in')
1061 def report_age_confirmation(self):
1062 """Report attempt to confirm age."""
1063 self._downloader.to_screen(u'[youtube] Confirming age')
1065 def report_video_webpage_download(self, video_id):
1066 """Report attempt to download video webpage."""
1067 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1069 def report_video_info_webpage_download(self, video_id):
1070 """Report attempt to download video info webpage."""
1071 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1073 def report_information_extraction(self, video_id):
1074 """Report attempt to extract video information."""
1075 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1077 def report_unavailable_format(self, video_id, format):
1078 """Report extracted video URL."""
1079 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1081 def report_rtmp_download(self):
1082 """Indicate the download will use the RTMP protocol."""
1083 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _real_initialize(self):
        """Set the YouTube interface language, then log in and confirm age.

        Credentials come from downloader params ('username'/'password') or,
        with 'usenetrc', from the user's .netrc file.  All failures here are
        soft (warnings), except age confirmation which is a hard error.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # Look up credentials for this extractor's machine entry in ~/.netrc.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            # Missing/invalid .netrc is non-fatal: warn and continue unauthenticated.
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Force English so later regex-based scraping sees predictable markup.
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        if username is None:

        # Form fields mimicking YouTube's login form submission.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # If the login form is still present in the response, login failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
        # Confirm the age gate; unlike login, failure here is a hard error.
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Download the watch page and video info for *url* and hand each
        selected format to the downloader via process_info()."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' variants of get_video_info until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            # Prefer YouTube's own failure reason when it provides one.
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Filesystem-safe title: collapse anything outside simple_title_chars to '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape it from the watch page and normalize to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description: only fetched/parsed when the user actually asked for it.
        video_description = u'No description available.'
        if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1).decode('utf-8')
                html_parser = lxml.etree.HTMLParser(encoding='utf-8')
                vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
                video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
                # TODO use another parser

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: no itag, single URL.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Parse the comma-separated, urlencoded stream map into itag -> URL.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            # 'format_limit' caps quality by truncating the best-first list.
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension: look up by itag, defaulting to flv.
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                # Py2 and-or idiom: u'NA' for RTMP (format_param is None).
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the numeric/“yt-…” video id, group 2 the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        # Delegates yt-prefixed videos to the provided YoutubeIE instance.
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # NOTE(review): line below appears to be the tail of suitable(url);
    # its def line is not visible in this view.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm the family filter / age disclaimer by POSTing the filter form.
        'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract the media URL and metadata for a metacafe watch URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Mirrored YouTube video: hand off to the YouTube extractor.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # Old-style pages expose a plain mediaURL query parameter.
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            video_url = mediaURL
            gdaKey = mobj.group(1)
            # Access key must be appended for the CDN to serve the file.
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Newer pages embed the media info inside the flashvars attribute.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        # JSON-escaped URL: un-escape the forward slashes.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 is the video id, group 2 the slug used as the simplified title.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): line below appears to be the tail of suitable(url);
    # its def line is not visible in this view.
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        """Extract the media URL and metadata for a Dailymotion video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # Media URL is passed to the flash player via addVariable("video", ...).
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Matches the many country TLDs of Google Video; group 1 is the docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): line below appears to be the tail of suitable(url);
    # its def line is not visible in this view.
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        """Extract the media URL and metadata for a Google Video URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # Prefer the mp4 download_url; fall back to the flv stream URL.
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Un-escape the JS hex escapes (\x3d is '=', \x26 is '&').
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # Thumbnail only appears on the search results page, so search for the id.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 is the .flv filename taken from the 'current' query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): line below appears to be the tail of suitable(url);
    # its def line is not visible in this view.
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a Photobucket URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title page pattern also yields the uploader in group 2.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): line below appears to be the tail of suitable(url);
    # its def line is not visible in this view.
        return (re.match(YahooIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url, new_video=True):
        """Extract a Yahoo! Video URL; non-/watch/ URLs are first rewritten
        to the canonical /watch/ form and re-extracted (new_video=False)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical URL; new_video=False marks the retry.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) here is the literal 'people'/'profile' path
        # segment; the uploader name is group(2) — looks like a bug, confirm.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description: video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
            '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
            '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # Replace HTML entities (&amp; etc.) in the assembled URL.
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            # NOTE(review): 'thumbnail' and 'description' are each given twice
            # below; the later (un-decoded) values silently win — confirm intent.
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn loudly: this extractor is only used when nothing else matched.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        """Best-effort extraction: scrape a media URL out of any webpage."""
        # At this point we have a new video
        self._downloader.increment_downloads()

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            # NOTE(review): this failure path is about the uploader (domain),
            # but the message says "unable to extract title" — likely copy-paste.
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Queries look like "ytsearch:...", "ytsearch5:..." or "ytsearchall:...".
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard cap enforced on 'ytsearchN' requests.
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        # Actual per-video extraction is delegated to the given YoutubeIE.
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # NOTE(review): line below appears to be the tail of suitable(url);
    # its def line is not visible in this view.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested result count."""
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Bare 'ytsearch:' -> first result only.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                # Clamp oversized requests to the service maximum.
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Slice the id out of href="/watch?v=ID" (drops the closing quote).
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        # No "Next" link means this was the last results page: flush and stop.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        pagenum = pagenum + 1
# Handles "gvsearchN:query" pseudo-URLs for Google Video search.
# Structurally parallel to YoutubeSearchIE: scrape result pages, collect IDs,
# delegate each to a GoogleIE instance.
2094 class GoogleSearchIE(InfoExtractor):
2095 """Information Extractor for Google Video search queries."""
2096 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2097 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Unlike YoutubeSearchIE, the video id is captured directly by group(1).
2098 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2099 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2101 _max_google_results = 1000
2103 def __init__(self, google_ie, downloader=None):
2104 InfoExtractor.__init__(self, downloader)
2105 self._google_ie = google_ie
2109 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2111 def report_download_page(self, query, pagenum):
2112 """Report attempt to download playlist page with given number."""
2113 query = query.decode(preferredencoding())
2114 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2116 def _real_initialize(self):
2117 self._google_ie.initialize()
# Dispatch on the prefix: "" -> 1 result, "all" -> cap, number -> clamped n.
2119 def _real_extract(self, query):
2120 mobj = re.match(self._VALID_QUERY, query)
2122 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2125 prefix, query = query.split(':')
2127 query = query.encode('utf-8')
2129 self._download_n_results(query, 1)
2131 elif prefix == 'all':
2132 self._download_n_results(query, self._max_google_results)
2138 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2140 elif n > self._max_google_results:
2141 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2142 n = self._max_google_results
2143 self._download_n_results(query, n)
2145 except ValueError: # parsing prefix as integer fails
2146 self._download_n_results(query, 1)
2149 def _download_n_results(self, query, n):
2150 """Downloads a specified number of results for a query"""
2153 already_seen = set()
2157 self.report_download_page(query, pagenum)
2158 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2159 request = urllib2.Request(result_url)
2161 page = urllib2.urlopen(request).read()
2162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2163 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2166 # Extract video identifiers
2167 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2168 video_id = mobj.group(1)
2169 if video_id not in already_seen:
2170 video_ids.append(video_id)
2171 already_seen.add(video_id)
2172 if len(video_ids) == n:
2173 # Specified n videos reached
2174 for id in video_ids:
2175 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# End of results: no "Next" marker on this page.
2178 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2179 for id in video_ids:
2180 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2183 pagenum = pagenum + 1
# Handles "yvsearchN:query" pseudo-URLs for Yahoo! Video search.
# Same page-scrape-and-delegate pattern as YoutubeSearchIE / GoogleSearchIE.
2185 class YahooSearchIE(InfoExtractor):
2186 """Information Extractor for Yahoo! Video search queries."""
2187 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2188 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2189 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2190 _MORE_PAGES_INDICATOR = r'\s*Next'
2192 _max_yahoo_results = 1000
2194 def __init__(self, yahoo_ie, downloader=None):
2195 InfoExtractor.__init__(self, downloader)
2196 self._yahoo_ie = yahoo_ie
2200 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2202 def report_download_page(self, query, pagenum):
2203 """Report attempt to download playlist page with given number."""
2204 query = query.decode(preferredencoding())
2205 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2207 def _real_initialize(self):
2208 self._yahoo_ie.initialize()
2210 def _real_extract(self, query):
2211 mobj = re.match(self._VALID_QUERY, query)
2213 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2216 prefix, query = query.split(':')
2218 query = query.encode('utf-8')
2220 self._download_n_results(query, 1)
2222 elif prefix == 'all':
2223 self._download_n_results(query, self._max_yahoo_results)
2229 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2231 elif n > self._max_yahoo_results:
2232 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2233 n = self._max_yahoo_results
2234 self._download_n_results(query, n)
2236 except ValueError: # parsing prefix as integer fails
2237 self._download_n_results(query, 1)
2240 def _download_n_results(self, query, n):
2241 """Downloads a specified number of results for a query"""
2244 already_seen = set()
2248 self.report_download_page(query, pagenum)
2249 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2250 request = urllib2.Request(result_url)
2252 page = urllib2.urlopen(request).read()
2253 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2254 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2257 # Extract video identifiers
2258 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Yahoo ids are "NNN/MMM" pairs captured from the watch URL.
2259 video_id = mobj.group(1)
2260 if video_id not in already_seen:
2261 video_ids.append(video_id)
2262 already_seen.add(video_id)
2263 if len(video_ids) == n:
2264 # Specified n videos reached
2265 for id in video_ids:
2266 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2269 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2270 for id in video_ids:
2271 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2274 pagenum = pagenum + 1
# Extracts all video ids from a YouTube playlist / artist / user-channel page
# and delegates each one to a YoutubeIE instance.
2276 class YoutubePlaylistIE(InfoExtractor):
2277 """Information Extractor for YouTube playlists."""
# group(1) = playlist type prefix ('p' or 'a'), group(2) = playlist id,
# group(3) = an optional direct video id embedded in the URL.
2279 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2280 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2281 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2282 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2285 def __init__(self, youtube_ie, downloader=None):
2286 InfoExtractor.__init__(self, downloader)
2287 self._youtube_ie = youtube_ie
2291 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2293 def report_download_page(self, playlist_id, pagenum):
2294 """Report attempt to download playlist page with given number."""
2295 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2297 def _real_initialize(self):
2298 self._youtube_ie.initialize()
2300 def _real_extract(self, url):
2301 # Extract playlist id
2302 mobj = re.match(self._VALID_URL, url)
2304 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL names a single video inside the playlist: extract just that one.
2308 if mobj.group(3) is not None:
2309 self._youtube_ie.extract(mobj.group(3))
2312 # Download playlist pages
2313 # prefix is 'p' as default for playlists but there are other types that need extra care
2314 playlist_prefix = mobj.group(1)
2315 if playlist_prefix == 'a':
2316 playlist_access = 'artist'
2318 playlist_prefix = 'p'
2319 playlist_access = 'view_play_list'
2320 playlist_id = mobj.group(2)
2325 self.report_download_page(playlist_id, pagenum)
2326 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2328 page = urllib2.urlopen(request).read()
2329 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2330 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2333 # Extract video identifiers
2335 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# De-duplicate within the page while preserving order.
2336 if mobj.group(1) not in ids_in_page:
2337 ids_in_page.append(mobj.group(1))
2338 video_ids.extend(ids_in_page)
2340 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2342 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end as a slice over the collected ids.
# playliststart is 1-based on the command line, hence the -1.
2344 playliststart = self._downloader.params.get('playliststart', 1) - 1
2345 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): playlistend == -1 here drops the final id (slice end -1);
# YoutubeUserIE below special-cases -1 instead — confirm intended semantics.
2346 video_ids = video_ids[playliststart:playlistend]
2348 for id in video_ids:
2349 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Extracts all uploads of a YouTube user via the GData API (paged, 50 per
# request) and delegates each video id to a YoutubeIE instance.
2352 class YoutubeUserIE(InfoExtractor):
2353 """Information Extractor for YouTube users."""
2355 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2356 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; we page with max-results/start-index.
2357 _GDATA_PAGE_SIZE = 50
2358 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2359 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2362 def __init__(self, youtube_ie, downloader=None):
2363 InfoExtractor.__init__(self, downloader)
2364 self._youtube_ie = youtube_ie
2368 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2370 def report_download_page(self, username, start_index):
2371 """Report attempt to download user page."""
2372 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2373 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2375 def _real_initialize(self):
2376 self._youtube_ie.initialize()
2378 def _real_extract(self, url):
2380 mobj = re.match(self._VALID_URL, url)
2382 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2385 username = mobj.group(1)
2387 # Download video ids using YouTube Data API. Result size per
2388 # query is limited (currently to 50 videos) so we need to query
2389 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2396 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2397 self.report_download_page(username, start_index)
2399 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2402 page = urllib2.urlopen(request).read()
2403 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2404 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2407 # Extract video identifiers
2410 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2411 if mobj.group(1) not in ids_in_page:
2412 ids_in_page.append(mobj.group(1))
2414 video_ids.extend(ids_in_page)
2416 # A little optimization - if current page is not
2417 # "full", ie. does not contain PAGE_SIZE video ids then
2418 # we can assume that this page is the last one - there
2419 # are no more ids on further pages - no need to query
2422 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2427 all_ids_count = len(video_ids)
# Apply --playlist-start/--playlist-end (1-based start, hence the -1).
2428 playliststart = self._downloader.params.get('playliststart', 1) - 1
2429 playlistend = self._downloader.params.get('playlistend', -1)
# -1 means "until the end": slice without an end bound to keep the last id.
2431 if playlistend == -1:
2432 video_ids = video_ids[playliststart:]
2434 video_ids = video_ids[playliststart:playlistend]
2436 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2437 (username, all_ids_count, len(video_ids)))
2439 for video_id in video_ids:
2440 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extractor for depositfiles.com file pages: simulates pressing the
# "Free download" button and scrapes the real file URL from the response.
2443 class DepositFilesIE(InfoExtractor):
2444 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment; "../" skips the two-letter locale segment.
2446 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2448 def __init__(self, downloader=None):
2449 InfoExtractor.__init__(self, downloader)
2453 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2455 def report_download_webpage(self, file_id):
2456 """Report webpage download."""
2457 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2459 def report_extraction(self, file_id):
2460 """Report information extraction."""
2461 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2463 def _real_initialize(self):
2466 def _real_extract(self, url):
2467 # At this point we have a new file
2468 self._downloader.increment_downloads()
2470 file_id = url.split('/')[-1]
2471 # Rebuild url in english locale
2472 url = 'http://depositfiles.com/en/files/' + file_id
2474 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the free-download form submission.
2475 free_download_indication = { 'gateway_result' : '1' }
2476 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2478 self.report_download_webpage(file_id)
2479 webpage = urllib2.urlopen(request).read()
2480 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2481 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2484 # Search for the real file URL
2485 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2486 if (mobj is None) or (mobj.group(1) is None):
2487 # Try to figure out reason of the error.
# The site reports restrictions inside a <strong>Attention...</strong> block.
2488 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2489 if (mobj is not None) and (mobj.group(1) is not None):
2490 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2491 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2493 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2496 file_url = mobj.group(1)
# Extension without the leading dot.
2497 file_extension = os.path.splitext(file_url)[1][1:]
2499 # Search for file title
2500 mobj = re.search(r'<b title="(.*?)">', webpage)
2502 self._downloader.trouble(u'ERROR: unable to extract title')
2504 file_title = mobj.group(1).decode('utf-8')
2507 # Process file information
2508 self._downloader.process_info({
2509 'id': file_id.decode('utf-8'),
2510 'url': file_url.decode('utf-8'),
2512 'upload_date': u'NA',
2513 'title': file_title,
2514 'stitle': file_title,
2515 'ext': file_extension.decode('utf-8'),
2519 except UnavailableVideoError, err:
2520 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for Facebook videos. Supports optional login (command-line
# credentials or .netrc) and scrapes video metadata/URLs out of the page's
# embedded JavaScript.
2522 class FacebookIE(InfoExtractor):
2523 """Information Extractor for Facebook"""
2525 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2526 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2527 _NETRC_MACHINE = 'facebook'
# Ordered best-first; used for format selection in _real_extract.
2528 _available_formats = ['highqual', 'lowqual']
2529 _video_extensions = {
2534 def __init__(self, downloader=None):
2535 InfoExtractor.__init__(self, downloader)
2539 return (re.match(FacebookIE._VALID_URL, url) is not None)
2541 def _reporter(self, message):
2542 """Add header and report message."""
2543 self._downloader.to_screen(u'[facebook] %s' % message)
2545 def report_login(self):
2546 """Report attempt to log in."""
2547 self._reporter(u'Logging in')
2549 def report_video_webpage_download(self, video_id):
2550 """Report attempt to download video webpage."""
2551 self._reporter(u'%s: Downloading video webpage' % video_id)
2553 def report_information_extraction(self, video_id):
2554 """Report attempt to extract video information."""
2555 self._reporter(u'%s: Extracting video information' % video_id)
2557 def _parse_page(self, video_webpage):
2558 """Extract video information from page"""
# Map of metadata field -> regex used to scrape it from the HTML/JS.
2560 data = {'title': r'class="video_title datawrap">(.*?)</',
2561 'description': r'<div class="datawrap">(.*?)</div>',
2562 'owner': r'\("video_owner_name", "(.*?)"\)',
2563 'upload_date': r'data-date="(.*?)"',
2564 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2567 for piece in data.keys():
2568 mobj = re.search(data[piece], video_webpage)
2569 if mobj is not None:
# Values are \uXXXX-escaped inside the JS; unescape then URL-unquote.
2570 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one URL per available quality, e.g. ("highqual_src", "...").
2574 for fmt in self._available_formats:
2575 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2576 if mobj is not None:
2577 # URL is in a Javascript segment inside an escaped Unicode format within
2578 # the generally utf-8 page
2579 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2580 video_info['video_urls'] = video_urls
2584 def _real_initialize(self):
2585 if self._downloader is None:
2590 downloader_params = self._downloader.params
2592 # Attempt to use provided username and password or .netrc data
2593 if downloader_params.get('username', None) is not None:
2594 useremail = downloader_params['username']
2595 password = downloader_params['password']
2596 elif downloader_params.get('usenetrc', False):
2598 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2599 if info is not None:
2603 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2604 except (IOError, netrc.NetrcParseError), err:
2605 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: proceed without logging in.
2608 if useremail is None:
2617 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2620 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2621 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2622 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2624 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2625 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2628 def _real_extract(self, url):
2629 mobj = re.match(self._VALID_URL, url)
2631 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2633 video_id = mobj.group('ID')
2636 self.report_video_webpage_download(video_id)
2637 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2639 page = urllib2.urlopen(request)
2640 video_webpage = page.read()
2641 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2642 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2645 # Start extracting information
2646 self.report_information_extraction(video_id)
2648 # Extract information
2649 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory; thumbnail/date degrade gracefully below.
2652 if 'owner' not in video_info:
2653 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2655 video_uploader = video_info['owner']
2658 if 'title' not in video_info:
2659 self._downloader.trouble(u'ERROR: unable to extract video title')
2661 video_title = video_info['title']
2662 video_title = video_title.decode('utf-8')
2663 video_title = sanitize_title(video_title)
# Build a filesystem-safe "simple title" by collapsing disallowed chars to _.
2666 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2667 simple_title = simple_title.strip(ur'_')
2670 if 'thumbnail' not in video_info:
2671 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2672 video_thumbnail = ''
2674 video_thumbnail = video_info['thumbnail']
# Convert the page's date string to YYYYMMDD via RFC 2822 parsing.
2678 if 'upload_date' in video_info:
2679 upload_time = video_info['upload_date']
2680 timetuple = email.utils.parsedate_tz(upload_time)
2681 if timetuple is not None:
2683 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2688 video_description = video_info.get('description', 'No description available.')
2690 url_map = video_info['video_urls']
2691 if len(url_map.keys()) > 0:
2692 # Decide which formats to download
# Mirrors the format-selection logic used by the YouTube extractor:
# --max-quality limits the candidate list, -f picks one, -1 means all.
2693 req_format = self._downloader.params.get('format', None)
2694 format_limit = self._downloader.params.get('format_limit', None)
2696 if format_limit is not None and format_limit in self._available_formats:
2697 format_list = self._available_formats[self._available_formats.index(format_limit):]
2699 format_list = self._available_formats
2700 existing_formats = [x for x in format_list if x in url_map]
2701 if len(existing_formats) == 0:
2702 self._downloader.trouble(u'ERROR: no known formats available for video')
2704 if req_format is None:
2705 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2706 elif req_format == '-1':
2707 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2710 if req_format not in url_map:
2711 self._downloader.trouble(u'ERROR: requested format not available')
2713 video_url_list = [(req_format, url_map[req_format])] # Specific format
2715 for format_param, video_real_url in video_url_list:
2717 # At this point we have a new video
2718 self._downloader.increment_downloads()
2721 video_extension = self._video_extensions.get(format_param, 'mp4')
2724 # Process video information
2725 self._downloader.process_info({
2726 'id': video_id.decode('utf-8'),
2727 'url': video_real_url.decode('utf-8'),
2728 'uploader': video_uploader.decode('utf-8'),
2729 'upload_date': upload_date,
2730 'title': video_title,
2731 'stitle': simple_title,
2732 'ext': video_extension.decode('utf-8'),
2733 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2734 'thumbnail': video_thumbnail.decode('utf-8'),
2735 'description': video_description.decode('utf-8'),
2738 except UnavailableVideoError, err:
2739 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for blip.tv: fetches the page's JSON representation
# (skin=json) and reads all metadata from the 'Post' object.
2741 class BlipTVIE(InfoExtractor):
2742 """Information extractor for blip.tv"""
2744 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
# Captures the file extension from the media URL.
2745 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2749 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2751 def report_extraction(self, file_id):
2752 """Report information extraction."""
2753 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2755 def _simplify_title(self, title):
# Same "simple title" sanitization used elsewhere in this file.
2756 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2757 res = res.strip(ur'_')
2760 def _real_extract(self, url):
2761 mobj = re.match(self._VALID_URL, url)
2763 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for JSON instead of HTML (cchar is '?' or '&' per the URL).
2770 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2771 request = urllib2.Request(json_url)
2772 self.report_extraction(mobj.group(1))
2774 json_code = urllib2.urlopen(request).read()
2775 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2776 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2779 json_data = json.loads(json_code)
2780 if 'Post' in json_data:
2781 data = json_data['Post']
# blip.tv dates look like "11-07-25 04:30PM"; normalize to YYYYMMDD.
2785 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2786 video_url = data['media']['url']
2787 umobj = re.match(self._URL_EXT, video_url)
2789 raise ValueError('Can not determine filename extension')
2790 ext = umobj.group(1)
2792 self._downloader.increment_downloads()
2795 'id': data['item_id'],
2797 'uploader': data['display_name'],
2798 'upload_date': upload_date,
2799 'title': data['title'],
2800 'stitle': self._simplify_title(data['title']),
2802 'format': data['media']['mimeType'],
2803 'thumbnail': data['thumbnailUrl'],
2804 'description': data['description'],
2805 'player_url': data['embedUrl']
# Any missing JSON key or bad date surfaces here as a parse error.
2807 except (ValueError,KeyError), err:
2808 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2812 self._downloader.process_info(info)
2813 except UnavailableVideoError, err:
2814 self._downloader.trouble(u'\nERROR: unable to download video')
# Base class for post-download processing steps (see FFmpegExtractAudioPP).
2817 class PostProcessor(object):
2818 """Post Processor class.
2820 PostProcessor objects can be added to downloaders with their
2821 add_post_processor() method. When the downloader has finished a
2822 successful download, it will take its internal chain of PostProcessors
2823 and start calling the run() method on each one of them, first with
2824 an initial argument and then with the returned value of the previous
2827 The chain will be stopped if one of them ever returns None or the end
2828 of the chain is reached.
2830 PostProcessor objects follow a "mutual registration" process similar
2831 to InfoExtractor objects.
2836 def __init__(self, downloader=None):
2837 self._downloader = downloader
2839 def set_downloader(self, downloader):
2840 """Sets the downloader for this PP."""
2841 self._downloader = downloader
2843 def run(self, information):
2844 """Run the PostProcessor.
2846 The "information" argument is a dictionary like the ones
2847 composed by InfoExtractors. The only difference is that this
2848 one has an extra field called "filepath" that points to the
2851 When this method returns None, the postprocessing chain is
2852 stopped. However, this method may return an information
2853 dictionary that will be passed to the next postprocessing
2854 object in the chain. It can be the one it received after
2855 changing some fields.
2857 In addition, this method may raise a PostProcessingError
2858 exception that will be taken into account by the downloader
2861 return information # by default, do nothing
# Post-processor that extracts the audio track of a downloaded video
# using the external ffprobe/ffmpeg binaries.
2863 class FFmpegExtractAudioPP(PostProcessor):
2865 def __init__(self, downloader=None, preferredcodec=None):
2866 PostProcessor.__init__(self, downloader)
# 'best' keeps the source codec losslessly when possible.
2867 if preferredcodec is None:
2868 preferredcodec = 'best'
2869 self._preferredcodec = preferredcodec
# Probe the file's audio codec name via ffprobe; used by run() below.
2872 def get_audio_codec(path):
2874 cmd = ['ffprobe', '-show_streams', '--', path]
# stderr discarded; ffprobe's stream dump is parsed from stdout.
2875 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
2876 output = handle.communicate()[0]
2877 if handle.wait() != 0:
2879 except (IOError, OSError):
# Remember the most recent codec_name=...; report it once an audio
# stream section (codec_type=audio) is seen.
2882 for line in output.split('\n'):
2883 if line.startswith('codec_name='):
2884 audio_codec = line.split('=')[1].strip()
2885 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to transcode/copy audio from path into out_path.
2890 def run_ffmpeg(path, out_path, codec, more_opts):
2892 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2893 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
2895 except (IOError, OSError):
2898 def run(self, information):
2899 path = information['filepath']
2901 filecodec = self.get_audio_codec(path)
2902 if filecodec is None:
2903 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
2907 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2908 if filecodec == 'aac' or filecodec == 'mp3':
2909 # Lossless if possible
2911 extension = filecodec
# Raw AAC needs an ADTS container to be playable standalone.
2912 if filecodec == 'aac':
2913 more_opts = ['-f', 'adts']
2916 acodec = 'libmp3lame'
2918 more_opts = ['-ab', '128k']
2920 # We convert the audio (lossy)
2921 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2922 extension = self._preferredcodec
2923 more_opts = ['-ab', '128k']
2924 if self._preferredcodec == 'aac':
2925 more_opts += ['-f', 'adts']
# Output keeps the original basename with the new audio extension.
2927 (prefix, ext) = os.path.splitext(path)
2928 new_path = prefix + '.' + extension
2929 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2930 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
2933 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
2938 except (IOError, OSError):
2939 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the extracted audio for any later processors.
2942 information['filepath'] = new_path
2945 ### MAIN PROGRAM ###
2946 if __name__ == '__main__':
2948 # Modules needed only when running the main program
2952 # Function to update the program file with the latest version from the repository.
2953 def update_self(downloader, filename):
2954 # Note: downloader only used for options
2955 if not os.access(filename, os.W_OK):
2956 sys.exit('ERROR: no write permissions on %s' % filename)
2958 downloader.to_screen('Updating to latest stable version...')
2960 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2961 latest_version = urllib.urlopen(latest_url).read().strip()
2962 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2963 newcontent = urllib.urlopen(prog_url).read()
2964 except (IOError, OSError), err:
2965 sys.exit('ERROR: unable to download latest version')
2967 stream = open(filename, 'w')
2968 stream.write(newcontent)
2970 except (IOError, OSError), err:
2971 sys.exit('ERROR: unable to overwrite current version')
2972 downloader.to_screen('Updated to version %s' % latest_version)
2974 # Parse command line
2975 parser = optparse.OptionParser(
2976 usage='Usage: %prog [options] url...',
2977 version='2011.07.09-phihag',
2978 conflict_handler='resolve',
2981 parser.add_option('-h', '--help',
2982 action='help', help='print this help text and exit')
2983 parser.add_option('-v', '--version',
2984 action='version', help='print program version and exit')
2985 parser.add_option('-U', '--update',
2986 action='store_true', dest='update_self', help='update this program to latest stable version')
2987 parser.add_option('-i', '--ignore-errors',
2988 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2989 parser.add_option('-r', '--rate-limit',
2990 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2991 parser.add_option('-R', '--retries',
2992 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2993 parser.add_option('--playlist-start',
2994 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2995 parser.add_option('--playlist-end',
2996 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2997 parser.add_option('--dump-user-agent',
2998 action='store_true', dest='dump_user_agent',
2999 help='display the current browser identification', default=False)
3001 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3002 authentication.add_option('-u', '--username',
3003 dest='username', metavar='USERNAME', help='account username')
3004 authentication.add_option('-p', '--password',
3005 dest='password', metavar='PASSWORD', help='account password')
3006 authentication.add_option('-n', '--netrc',
3007 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3008 parser.add_option_group(authentication)
3010 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3011 video_format.add_option('-f', '--format',
3012 action='store', dest='format', metavar='FORMAT', help='video format code')
3013 video_format.add_option('--all-formats',
3014 action='store_const', dest='format', help='download all available video formats', const='-1')
3015 video_format.add_option('--max-quality',
3016 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3017 parser.add_option_group(video_format)
# Verbosity / simulation options. Each --get-* flag implies simulate+quiet
# (that implication is applied when the FileDownloader params are built,
# not here — these options only set independent booleans).
3019 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3020 verbosity.add_option('-q', '--quiet',
3021 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3022 verbosity.add_option('-s', '--simulate',
3023 action='store_true', dest='simulate', help='do not download video', default=False)
3024 verbosity.add_option('-g', '--get-url',
3025 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3026 verbosity.add_option('-e', '--get-title',
3027 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3028 verbosity.add_option('--get-thumbnail',
3029 action='store_true', dest='getthumbnail',
3030 help='simulate, quiet but print thumbnail URL', default=False)
3031 verbosity.add_option('--get-description',
3032 action='store_true', dest='getdescription',
3033 help='simulate, quiet but print video description', default=False)
3034 verbosity.add_option('--get-filename',
3035 action='store_true', dest='getfilename',
3036 help='simulate, quiet but print output filename', default=False)
3037 verbosity.add_option('--no-progress',
3038 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3039 verbosity.add_option('--console-title',
3040 action='store_true', dest='consoletitle',
3041 help='display progress in console titlebar', default=False)
3042 parser.add_option_group(verbosity)
# Filesystem options: filename construction (-t/-l/-A/-o are mutually
# constrained — validated after parse_args), batch input, overwrite/resume
# behaviour, cookies, .part files, mtime, and sidecar metadata files.
3044 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3045 filesystem.add_option('-t', '--title',
3046 action='store_true', dest='usetitle', help='use title in file name', default=False)
3047 filesystem.add_option('-l', '--literal',
3048 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3049 filesystem.add_option('-A', '--auto-number',
3050 action='store_true', dest='autonumber',
3051 help='number downloaded files starting from 00000', default=False)
3052 filesystem.add_option('-o', '--output',
3053 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3054 filesystem.add_option('-a', '--batch-file',
3055 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3056 filesystem.add_option('-w', '--no-overwrites',
3057 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3058 filesystem.add_option('-c', '--continue',
3059 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3060 filesystem.add_option('--cookies',
3061 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3062 filesystem.add_option('--no-part',
3063 action='store_true', dest='nopart', help='do not use .part files', default=False)
# --no-mtime is store_false: updatetime defaults True, flag turns it off.
3064 filesystem.add_option('--no-mtime',
3065 action='store_false', dest='updatetime',
3066 help='do not use the Last-modified header to set the file modification time', default=True)
3067 filesystem.add_option('--write-description',
3068 action='store_true', dest='writedescription',
3069 help='write video description to a .description file', default=False)
3070 filesystem.add_option('--write-info-json',
3071 action='store_true', dest='writeinfojson',
3072 help='write video metadata to a .info.json file', default=False)
3073 parser.add_option_group(filesystem)
# Post-processing options: hand the finished download to ffmpeg/ffprobe
# for audio extraction. audioformat is validated against
# ['best', 'aac', 'mp3'] after parsing.
3075 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3076 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3077 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3078 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3079 help='"best", "aac" or "mp3"; best by default')
3080 parser.add_option_group(postproc)
# Parse the command line: opts carries all option values, args the URLs.
3082 (opts, args) = parser.parse_args()
# Cookie jar: in-memory when --cookies was not given, otherwise a
# Mozilla-format jar backed by the given file (loaded if it already
# exists and is readable).
3084 # Open appropriate CookieJar
3085 if opts.cookiefile is None:
3086 jar = cookielib.CookieJar()
# NOTE(review): the else:/try:/jar.load() lines (orig. 3087-3088, 3091)
# are elided from this excerpt; the except below belongs to that try.
3089 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3090 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3092 except (IOError, OSError), err:
3093 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string and (presumably — the sys.exit
# line is elided here) exit without downloading. TODO confirm.
3096 if opts.dump_user_agent:
3097 print std_headers['User-Agent']
# Install a global urllib2 opener: proxy support, the cookie jar above,
# and the project's YoutubeDLHandler (defined elsewhere in this file).
3100 # General configuration
3101 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3102 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3103 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# Batch file: read one URL per line from --batch-file (or stdin for '-'),
# strip whitespace, and drop blank lines and comment lines starting with
# '#', '/' or ';'. Batch URLs are processed before positional args.
3105 # Batch file verification
3107 if opts.batchfile is not None:
# NOTE(review): the try: and stdin branch lines (orig. 3108, 3110-3111)
# are elided from this excerpt; the except below belongs to that try.
3109 if opts.batchfile == '-':
3112 batchfd = open(opts.batchfile, 'r')
3113 batchurls = batchfd.readlines()
3114 batchurls = [x.strip() for x in batchurls]
3115 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3117 sys.exit(u'ERROR: batch file could not be read')
3118 all_urls = batchurls + args
# Option validation: reject conflicting combinations, prompt for a missing
# password, and normalize numeric options (parser.error exits with usage).
3120 # Conflicting, missing and erroneous options
3121 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3122 parser.error(u'using .netrc conflicts with giving username/password')
3123 if opts.password is not None and opts.username is None:
3124 parser.error(u'account username missing')
3125 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3126 parser.error(u'using output template conflicts with using title, literal title or auto number')
3127 if opts.usetitle and opts.useliteral:
3128 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively (never echoes).
3129 if opts.username is not None and opts.password is None:
3130 opts.password = getpass.getpass(u'Type account password and press return:')
# Rate limit is given as a human-readable string (e.g. '50k'); parse_bytes
# returns None on bad input.
3131 if opts.ratelimit is not None:
3132 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3133 if numeric_limit is None:
3134 parser.error(u'invalid rate limit specified')
3135 opts.ratelimit = numeric_limit
# NOTE(review): the try: lines wrapping the long() conversions below
# (orig. 3137, 3141, 3147) are elided from this excerpt, as are the
# raise statements for the out-of-range checks (orig. 3144, 3150).
3136 if opts.retries is not None:
3138 opts.retries = long(opts.retries)
3139 except (TypeError, ValueError), err:
3140 parser.error(u'invalid retry count specified')
3142 opts.playliststart = long(opts.playliststart)
3143 if opts.playliststart <= 0:
3145 except (TypeError, ValueError), err:
3146 parser.error(u'invalid playlist start number specified')
# playlistend == -1 means "until the end of the playlist".
3148 opts.playlistend = long(opts.playlistend)
3149 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3151 except (TypeError, ValueError), err:
3152 parser.error(u'invalid playlist end number specified')
3153 if opts.extractaudio:
3154 if opts.audioformat not in ['best', 'aac', 'mp3']:
3155 parser.error(u'invalid audio format specified')
# Information extractors: one instance per supported site. Several
# extractors wrap another (playlist/user/search delegate to the base
# YouTube extractor; search extractors wrap their site extractor).
3157 # Information extractors
3158 youtube_ie = YoutubeIE()
3159 metacafe_ie = MetacafeIE(youtube_ie)
3160 dailymotion_ie = DailymotionIE()
3161 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3162 youtube_user_ie = YoutubeUserIE(youtube_ie)
3163 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3164 google_ie = GoogleIE()
3165 google_search_ie = GoogleSearchIE(google_ie)
3166 photobucket_ie = PhotobucketIE()
3167 yahoo_ie = YahooIE()
3168 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3169 deposit_files_ie = DepositFilesIE()
3170 facebook_ie = FacebookIE()
3171 bliptv_ie = BlipTVIE()
# GenericIE is the catch-all fallback; it is registered last (below).
3172 generic_ie = GenericIE()
# Build the FileDownloader with its parameter dict. Any --get-* flag
# forces both 'quiet' and 'simulate' so only the requested field is
# printed.
3175 fd = FileDownloader({
3176 'usenetrc': opts.usenetrc,
3177 'username': opts.username,
3178 'password': opts.password,
3179 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3180 'forceurl': opts.geturl,
3181 'forcetitle': opts.gettitle,
3182 'forcethumbnail': opts.getthumbnail,
3183 'forcedescription': opts.getdescription,
3184 'forcefilename': opts.getfilename,
3185 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3186 'format': opts.format,
3187 'format_limit': opts.format_limit,
# Output template selection via an and/or chain (pre-ternary Python 2
# idiom): an explicit -o template wins; otherwise --all-formats ('-1')
# picks a %(format)s-bearing template; then title/literal/autonumber
# combinations; falling back to '%(id)s.%(ext)s'.
3188 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3189 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3190 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3191 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3192 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3193 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3194 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3195 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3196 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3197 or u'%(id)s.%(ext)s'),
3198 'ignoreerrors': opts.ignoreerrors,
3199 'ratelimit': opts.ratelimit,
3200 'nooverwrites': opts.nooverwrites,
3201 'retries': opts.retries,
3202 'continuedl': opts.continue_dl,
3203 'noprogress': opts.noprogress,
3204 'playliststart': opts.playliststart,
3205 'playlistend': opts.playlistend,
# '-o -' writes the video to stdout, so status output goes to stderr.
3206 'logtostderr': opts.outtmpl == '-',
3207 'consoletitle': opts.consoletitle,
3208 'nopart': opts.nopart,
3209 'updatetime': opts.updatetime,
3210 'writedescription': opts.writedescription,
3211 'writeinfojson': opts.writeinfojson,
# NOTE(review): the closing '})' of this dict (orig. 3212) is elided
# from this excerpt.
# Register extractors in priority order: more specific URL patterns
# (search/playlist/user) before their base extractors, and GenericIE
# strictly last as the catch-all.
3213 fd.add_info_extractor(youtube_search_ie)
3214 fd.add_info_extractor(youtube_pl_ie)
3215 fd.add_info_extractor(youtube_user_ie)
3216 fd.add_info_extractor(metacafe_ie)
3217 fd.add_info_extractor(dailymotion_ie)
3218 fd.add_info_extractor(youtube_ie)
3219 fd.add_info_extractor(google_ie)
3220 fd.add_info_extractor(google_search_ie)
3221 fd.add_info_extractor(photobucket_ie)
3222 fd.add_info_extractor(yahoo_ie)
3223 fd.add_info_extractor(yahoo_search_ie)
3224 fd.add_info_extractor(deposit_files_ie)
3225 fd.add_info_extractor(facebook_ie)
3226 fd.add_info_extractor(bliptv_ie)
3228 # This must come last since it's the
3229 # fallback if none of the others work
3230 fd.add_info_extractor(generic_ie)
# Optional ffmpeg-based audio extraction, validated earlier against
# ['best', 'aac', 'mp3'].
3233 if opts.extractaudio:
3234 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# Driver tail: optional self-update, require at least one URL (unless
# updating), run the downloads, persist cookies, and map the top-level
# exceptions to exit messages.
3237 if opts.update_self:
3238 update_self(fd, sys.argv[0])
3241 if len(all_urls) < 1:
3242 if not opts.update_self:
3243 parser.error(u'you must provide at least one URL')
# retcode is presumably passed to sys.exit() after the cookie dump —
# that line (orig. ~3254) is elided from this excerpt. TODO confirm.
3246 retcode = fd.download(all_urls)
3248 # Dump cookie jar if requested
3249 if opts.cookiefile is not None:
# NOTE(review): the try:/jar.save() lines (orig. 3250-3251) are elided
# from this excerpt; the except below belongs to that try.
3252 except (IOError, OSError), err:
3253 sys.exit(u'ERROR: unable to save cookie jar')
# NOTE(review): these except clauses belong to an enclosing try: that
# starts before this excerpt; DownloadError's handler body (orig. 3258)
# is elided.
3257 except DownloadError:
3259 except SameFileError:
3260 sys.exit(u'ERROR: fixed output name but more than one file to download')
3261 except KeyboardInterrupt:
3262 sys.exit(u'\nERROR: Interrupted by user')