2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
16 __license__ = 'Public Domain'
17 __version__ = '2011.08.28-phihag'
19 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
47 except ImportError: # Python 2.4
50 import cStringIO as StringIO
54 # parse_qs was moved from the cgi module to the urlparse module recently.
56 from urlparse import parse_qs
58 from cgi import parse_qs
# Default HTTP request headers sent with every request (impersonates
# Firefox 5 on Linux so sites serve the normal desktop pages).
# NOTE(review): this listing is elided — the std_headers dict braces
# around these entries are missing from this copy.
66 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
67 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
68 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
69 'Accept-Encoding': 'gzip, deflate',
70 'Accept-Language': 'en-us,en;q=0.5',
# Character set allowed in "simplified" titles (ASCII letters + digits).
73 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# Fallback pure-Python JSON parser ("trivialjson") used when the stdlib
# json module is unavailable (Python < 2.6). It is a recursive-descent
# parser closing over the input string `s`.
# NOTE(review): this listing is heavily elided — the enclosing `try:
# import json` block, the `def loads(s):` line and many body lines are
# missing from this copy; recover them from upstream before editing.
77 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# Raise a ValueError carrying position and the remaining input tail.
83 def raiseError(msg, i):
84 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
# Advance past JSON whitespace; errors out on premature end when expectMore.
85 def skipSpace(i, expectMore=True):
86 while i < len(s) and s[i] in ' \t\r\n':
90 raiseError('Premature end', i)
# Decode one backslash escape; \uXXXX pairs are combined as UTF-16
# surrogates into a single code point.
92 def decodeEscape(match):
108 return unichr(int(esc[1:5], 16))
109 if len(esc) == 5+6 and esc[5:7] == '\\u':
110 hi = int(esc[1:5], 16)
111 low = int(esc[7:11], 16)
112 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
113 raise ValueError('Unknown escape ' + str(esc))
# Count trailing backslashes to find the true (unescaped) closing quote.
120 while s[e-bslashes-1] == '\\':
122 if bslashes % 2 == 1:
126 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
127 stri = rexp.sub(decodeEscape, s[i:e])
133 if s[i] == '}': # Empty dictionary
137 raiseError('Expected a string object key', i)
138 i,key = parseString(i)
140 if i >= len(s) or s[i] != ':':
141 raiseError('Expected a colon', i)
148 raiseError('Expected comma or closing curly brace', i)
153 if s[i] == ']': # Empty array
158 i = skipSpace(i) # Raise exception if premature end
162 raiseError('Expected a comma or closing bracket', i)
# true/false/null literals.
164 def parseDiscrete(i):
165 for k,v in {'true': True, 'false': False, 'null': None}.items():
166 if s.startswith(k, i):
168 raiseError('Not a boolean (or null)', i)
# Numbers: int unless a fraction or exponent marker is present.
170 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
172 raiseError('Not a number', i)
174 if '.' in nums or 'e' in nums or 'E' in nums:
175 return (i+len(nums), float(nums))
176 return (i+len(nums), int(nums))
# Dispatch on the first significant character; default is number parsing.
177 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
180 i,res = CHARMAP.get(s[i], parseNumber)(i)
181 i = skipSpace(i, False)
185 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
# Determine the encoding used when printing to the console.
# NOTE(review): this listing is elided — the try/except guarding
# locale.getpreferredencoding() and the generator's yield lines are
# missing from this copy.
188 def preferredencoding():
189 """Get preferred encoding.
191 Returns the best encoding scheme for the system, based on
192 locale.getpreferredencoding() and some further tweaks.
194 def yield_preferredencoding():
196 pref = locale.getpreferredencoding()
# .next() is the Python 2 generator protocol (next() builtin in py3).
202 return yield_preferredencoding().next()
# re.sub callback: decode one HTML entity match to a unicode character.
# NOTE(review): elided listing — the docstring closer and the branch
# selecting base 16 vs 10 for numeric entities are missing from this copy.
204 def htmlentity_transform(matchobj):
205 """Transforms an HTML entity to a Unicode character.
207 This function receives a match object and is intended to be used with
208 the re.sub() function.
210 entity = matchobj.group(1)
212 # Known non-numeric HTML entity
213 if entity in htmlentitydefs.name2codepoint:
214 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: #nnn (decimal) or #xnnn (hex).
217 mobj = re.match(ur'(?u)#(x?\d+)', entity)
219 numstr = mobj.group(1)
220 if numstr.startswith(u'x'):
# '0x...' so the int/long conversion below parses it as hexadecimal.
222 numstr = u'0%s' % numstr
225 return unichr(long(numstr, base))
227 # Unknown entity in name, return its literal representation
228 return (u'&%s;' % entity)
230 def sanitize_title(utitle):
231 """Sanitizes a video title so it could be used as part of a filename."""
232 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
233 return utitle.replace(unicode(os.sep), u'%')
# Open a file for writing, retrying with a cleaned-up name on failure.
# NOTE(review): elided listing — the initial try:, the '-' (stdout)
# check and the import msvcrt line are missing from this copy.
235 def sanitize_open(filename, open_mode):
236 """Try to open the given filename, and slightly tweak it if this fails.
238 Attempts to open the given filename. If this fails, it tries to change
239 the filename slightly, step by step, until it's either able to open it
240 or it fails and raises a final exception, like the standard open()
243 It returns the tuple (stream, definitive_file_name).
# On Windows, put stdout into binary mode so video data is not mangled.
247 if sys.platform == 'win32':
249 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
250 return (sys.stdout, filename)
251 stream = open(filename, open_mode)
252 return (stream, filename)
253 except (IOError, OSError), err:
254 # In case of error, try to remove win32 forbidden chars
255 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
257 # An exception here should be caught in the caller
258 stream = open(filename, open_mode)
259 return (stream, filename)
# NOTE(review): elided listing — the `timestamp = None` initialisation
# and the final `return timestamp` are missing from this copy.
261 def timeconvert(timestr):
262 """Convert RFC 2822 defined time string into system timestamp"""
264 timetuple = email.utils.parsedate_tz(timestr)
# parsedate_tz returns None on unparseable input; only convert on success.
265 if timetuple is not None:
266 timestamp = email.utils.mktime_tz(timetuple)
# Exception hierarchy used throughout the downloader.
# NOTE(review): elided listing — the closing docstring quotes and
# `pass` bodies of these classes are missing from this copy.
269 class DownloadError(Exception):
270 """Download Error exception.
272 This exception may be thrown by FileDownloader objects if they are not
273 configured to continue on errors. They will contain the appropriate
278 class SameFileError(Exception):
279 """Same File exception.
281 This exception will be thrown by FileDownloader objects if they detect
282 multiple files would have to be downloaded to the same file on disk.
286 class PostProcessingError(Exception):
287 """Post Processing exception.
289 This exception may be raised by PostProcessor's .run() method to
290 indicate an error in the postprocessing task.
294 class UnavailableVideoError(Exception):
295 """Unavailable Format exception.
297 This exception will be thrown when a video is requested
298 in a format that is not available for that video.
302 class ContentTooShortError(Exception):
303 """Content Too Short exception.
305 This exception may be raised by FileDownloader objects when a file they
306 download is too small for what the server announced first, indicating
307 the connection was probably interrupted.
# downloaded: bytes actually received; expected: bytes announced by server.
313 def __init__(self, downloaded, expected):
314 self.downloaded = downloaded
315 self.expected = expected
# urllib2 handler adding std_headers and transparently decompressing
# gzip/deflate responses.
# NOTE(review): elided listing — the @staticmethod decorators, the
# deflate() fallback try/except and several return statements are
# missing from this copy.
317 class YoutubeDLHandler(urllib2.HTTPHandler):
318 """Handler for HTTP requests and responses.
320 This class, when installed with an OpenerDirector, automatically adds
321 the standard headers to every HTTP request and handles gzipped and
322 deflated responses from web servers. If compression is to be avoided in
323 a particular request, the original request in the program code only has
324 to include the HTTP header "Youtubedl-No-Compression", which will be
325 removed before making the real request.
327 Part of this code was copied from:
329 http://techknack.net/python-urllib2-handlers/
331 Andrew Rowls, the author of that code, agreed to release it to the
# Raw deflate first (-MAX_WBITS = no zlib header), then zlib-wrapped.
338 return zlib.decompress(data, -zlib.MAX_WBITS)
340 return zlib.decompress(data)
# Older Pythons' addinfourl has no code argument/getcode; emulate it.
343 def addinfourl_wrapper(stream, headers, url, code):
344 if hasattr(urllib2.addinfourl, 'getcode'):
345 return urllib2.addinfourl(stream, headers, url, code)
346 ret = urllib2.addinfourl(stream, headers, url)
# Add each default header unless the caller already set it; the
# Youtubedl-no-compression marker strips Accept-encoding and is removed.
350 def http_request(self, req):
351 for h in std_headers:
354 req.add_header(h, std_headers[h])
355 if 'Youtubedl-no-compression' in req.headers:
356 if 'Accept-encoding' in req.headers:
357 del req.headers['Accept-encoding']
358 del req.headers['Youtubedl-no-compression']
# Wrap compressed bodies in a decompressing file object, preserving
# the original response metadata.
361 def http_response(self, req, resp):
364 if resp.headers.get('Content-encoding', '') == 'gzip':
365 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
366 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
367 resp.msg = old_resp.msg
369 if resp.headers.get('Content-encoding', '') == 'deflate':
370 gz = StringIO.StringIO(self.deflate(resp.read()))
371 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
372 resp.msg = old_resp.msg
375 class FileDownloader(object):
376 """File Downloader class.
378 File downloader objects are the ones responsible of downloading the
379 actual video file and writing it to disk if the user has requested
380 it, among some other tasks. In most cases there should be one per
381 program. As, given a video URL, the downloader doesn't know how to
382 extract all the needed information, task that InfoExtractors do, it
383 has to pass the URL to one of them.
385 For this, file downloader objects have a method that allows
386 InfoExtractors to be registered in a given order. When it is passed
387 a URL, the file downloader handles it to the first InfoExtractor it
388 finds that reports being able to handle it. The InfoExtractor extracts
389 all the information about the video or videos the URL refers to, and
390 asks the FileDownloader to process the video information, possibly
391 downloading the video.
393 File downloaders accept a lot of parameters. In order not to saturate
394 the object constructor with arguments, it receives a dictionary of
395 options instead. These options are available through the params
396 attribute for the InfoExtractors to use. The FileDownloader also
397 registers itself as the downloader in charge for the InfoExtractors
398 that are added to it, so this is a "mutual registration".
402 username: Username for authentication purposes.
403 password: Password for authentication purposes.
404 usenetrc: Use netrc for authentication instead.
405 quiet: Do not print messages to stdout.
406 forceurl: Force printing final URL.
407 forcetitle: Force printing title.
408 forcethumbnail: Force printing thumbnail URL.
409 forcedescription: Force printing description.
410 forcefilename: Force printing final filename.
411 simulate: Do not download the video files.
412 format: Video format code.
413 format_limit: Highest quality format to try.
414 outtmpl: Template for output names.
415 ignoreerrors: Do not stop on download errors.
416 ratelimit: Download speed limit, in bytes/sec.
417 nooverwrites: Prevent overwriting files.
418 retries: Number of times to retry for HTTP error 5xx
419 continuedl: Try to continue downloads if possible.
420 noprogress: Do not print the progress bar.
421 playliststart: Playlist item to start at.
422 playlistend: Playlist item to end at.
423 logtostderr: Log messages to stderr instead of stdout.
424 consoletitle: Display progress in console window's titlebar.
425 nopart: Do not use temporary .part files.
426 updatetime: Use the Last-modified header to set output file timestamps.
427 writedescription: Write the video description to a .description file
428 writeinfojson: Write the video description to a .info.json file
# Class-level defaults; real values are set per-instance in __init__.
434 _download_retcode = None
435 _num_downloads = None
# NOTE(review): elided listing — the lines storing params and the
# _ies/_pps list initialisations are missing from this copy.
438 def __init__(self, params):
439 """Create a FileDownloader object with the given options."""
442 self._download_retcode = 0
443 self._num_downloads = 0
# logtostderr selects index 1 (stderr) of the two-element list.
444 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# NOTE(review): elided listing — the os.mkdir(dir) call inside the
# loop is missing from this copy.
448 def pmkdir(filename):
449 """Create directory components in filename. Similar to Unix "mkdir -p"."""
450 components = filename.split(os.sep)
# Build every ancestor path: components[0:1], [0:2], ... (excludes the
# final component, which is the file itself).
451 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
452 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
453 for dir in aggregate:
454 if not os.path.exists(dir):
# Render a byte count as a human-readable string, e.g. 1536 -> '1.50k'.
# NOTE(review): elided listing — the None/0 guards between these lines
# are missing from this copy.
458 def format_bytes(bytes):
461 if type(bytes) is str:
# Exponent of 1024 picks the suffix: b, k, M, G, ...
466 exponent = long(math.log(bytes, 1024.0))
467 suffix = 'bkMGTPEZY'[exponent]
468 converted = float(bytes) / float(1024**exponent)
469 return '%.2f%s' % (converted, suffix)
# Progress percentage, right-aligned in 6 chars (e.g. ' 42.0%').
# NOTE(review): elided listing — the data_len is None guard is missing
# from this copy.
472 def calc_percent(byte_counter, data_len):
475 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# Estimated time remaining as 'MM:SS', based on average rate so far.
# NOTE(review): elided listing — the dif computation, the '--:--'
# fallbacks and the eta cap are missing from this copy.
478 def calc_eta(start, now, total, current):
482 if current == 0 or dif < 0.001: # One millisecond
484 rate = float(current) / dif
485 eta = long((float(total) - float(current)) / rate)
486 (eta_mins, eta_secs) = divmod(eta, 60)
489 return '%02d:%02d' % (eta_mins, eta_secs)
# Average download speed as a 10-char right-aligned string.
# NOTE(review): elided listing — the `dif = now - start` line is
# missing from this copy.
492 def calc_speed(start, now, bytes):
494 if bytes == 0 or dif < 0.001: # One millisecond
495 return '%10s' % '---b/s'
496 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Adapt the next read size to recent throughput, clamped to at most 2x
# growth / 0.5x shrink per step and an absolute 4 MB ceiling.
# NOTE(review): elided listing — the return statements choosing between
# new_min/new_max/rate are missing from this copy.
499 def best_block_size(elapsed_time, bytes):
500 new_min = max(bytes / 2.0, 1.0)
501 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
502 if elapsed_time < 0.001:
504 rate = bytes / elapsed_time
# NOTE(review): elided listing — the `if matchobj is None: return None`
# guard is missing from this copy.
512 def parse_bytes(bytestr):
513 """Parse a string indicating a byte quantity into a long integer."""
# Optional decimal number followed by an optional binary-suffix letter.
514 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
517 number = float(matchobj.group(1))
# index() of the (lowercased) suffix in 'bkmgtpezy' gives the power of
# 1024; an empty suffix would raise here — assumed handled by caller.
518 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
519 return long(round(number * multiplier))
# Mutual registration: the downloader keeps the IE/PP, and the IE/PP
# gets a back-reference to the downloader.
# NOTE(review): elided listing — the list .append() calls are missing
# from this copy.
521 def add_info_extractor(self, ie):
522 """Add an InfoExtractor object to the end of the list."""
524 ie.set_downloader(self)
526 def add_post_processor(self, pp):
527 """Add a PostProcessor object to the end of the chain."""
529 pp.set_downloader(self)
# NOTE(review): elided listing — the try: line and the re-raise in the
# except branch are missing from this copy.
531 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
532 """Print message to stdout if not in quiet mode."""
534 if not self.params.get('quiet', False):
# skip_eol selects u'' (index 1) instead of the newline terminator.
535 terminator = [u'\n', u''][skip_eol]
# Trailing comma suppresses print's own newline (Python 2 syntax).
536 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
537 self._screen_file.flush()
538 except (UnicodeEncodeError), err:
539 if not ignore_encoding_errors:
def to_stderr(self, message):
	"""Write *message* to stderr, encoded for the current locale."""
	encoded = message.encode(preferredencoding())
	print >>sys.stderr, encoded
# NOTE(review): elided listing — the early `return` after the
# consoletitle check is missing from this copy.
546 def to_cons_title(self, message):
547 """Set console/terminal window title to message."""
548 if not self.params.get('consoletitle', False):
# Windows console: use the kernel32 API directly.
550 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
551 # c_wchar_p() might not be necessary if `message` is
552 # already of type unicode()
553 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
# Other terminals: xterm OSC escape sequence (ESC ] 0 ; title BEL).
554 elif 'TERM' in os.environ:
555 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
557 def fixed_template(self):
558 """Checks if the output template is fixed."""
559 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
# NOTE(review): elided listing — the docstring closer is missing from
# this copy.
561 def trouble(self, message=None):
562 """Determine action to take when a download problem appears.
564 Depending on if the downloader has been configured to ignore
565 download errors or not, this method may throw an exception or
566 not when errors are found, after printing the message.
568 if message is not None:
569 self.to_stderr(message)
# Without ignoreerrors the run aborts; otherwise we only record a
# non-zero exit code and continue.
570 if not self.params.get('ignoreerrors', False):
571 raise DownloadError(message)
572 self._download_retcode = 1
# NOTE(review): elided listing — the early return, `now = time.time()`
# and the elapsed guard are missing from this copy.
574 def slow_down(self, start_time, byte_counter):
575 """Sleep if the download speed is over the rate limit."""
576 rate_limit = self.params.get('ratelimit', None)
577 if rate_limit is None or byte_counter == 0:
580 elapsed = now - start_time
583 speed = float(byte_counter) / elapsed
584 if speed > rate_limit:
# Sleep exactly long enough that the average speed drops back to the limit.
585 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
# NOTE(review): elided listing — the `return filename` for the
# no-part case is missing from this copy.
587 def temp_name(self, filename):
588 """Returns a temporary filename for the given filename."""
# Use the real name when .part files are disabled, when writing to
# stdout ('-'), or when the path exists but is not a regular file.
589 if self.params.get('nopart', False) or filename == u'-' or \
590 (os.path.exists(filename) and not os.path.isfile(filename)):
592 return filename + u'.part'
# Strip a trailing '.part' suffix, if present.
# NOTE(review): elided listing — the fallback `return filename`, the
# try:, and try_rename's early return are missing from this copy.
594 def undo_temp_name(self, filename):
595 if filename.endswith(u'.part'):
596 return filename[:-len(u'.part')]
# Rename the finished .part file onto its final name; failures are
# reported through self.trouble rather than raised.
599 def try_rename(self, old_filename, new_filename):
601 if old_filename == new_filename:
603 os.rename(old_filename, new_filename)
604 except (IOError, OSError), err:
605 self.trouble(u'ERROR: unable to rename file')
# NOTE(review): elided listing — the early returns and the try/except
# around os.utime are missing from this copy.
607 def try_utime(self, filename, last_modified_hdr):
608 """Try to set the last-modified time of the given file."""
609 if last_modified_hdr is None:
611 if not os.path.isfile(filename):
613 timestr = last_modified_hdr
# timeconvert parses the RFC 2822 Last-modified header value.
616 filetime = timeconvert(timestr)
# Keep atime current; set mtime to the server-reported time.
620 os.utime(filename,(time.time(), filetime))
def report_writedescription(self, descfn):
	"""Announce that the video description is being written to *descfn*."""
	msg = u'[info] Writing video description to: %s' % descfn
	self.to_screen(msg, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
	"""Announce that the JSON metadata file *infofn* has been written."""
	msg = u'[info] Video description metadata as JSON to: %s' % infofn
	self.to_screen(msg, ignore_encoding_errors=True)
def report_destination(self, filename):
	"""Announce the file the download will be written to."""
	msg = u'[download] Destination: %s' % filename
	self.to_screen(msg, ignore_encoding_errors=True)
# NOTE(review): elided listing — the `return` after the noprogress
# check is missing from this copy.
636 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
637 """Report download progress."""
638 if self.params.get('noprogress', False):
# \r + skip_eol redraws the same console line on every update.
640 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
641 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
# Mirror the progress into the terminal title bar when enabled.
642 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
643 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
	"""Announce that the download resumes at byte offset *resume_len*."""
	msg = u'[download] Resuming download at byte %s' % resume_len
	self.to_screen(msg)
def report_retry(self, count, retries):
	"""Announce retry *count* of *retries* after a server 5xx error."""
	msg = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_screen(msg)
# NOTE(review): elided listing — the try: line is missing from this copy.
653 def report_file_already_downloaded(self, file_name):
654 """Report file has already been fully downloaded."""
656 self.to_screen(u'[download] %s has already been downloaded' % file_name)
# Fall back to a name-free message if the filename cannot be encoded.
657 except (UnicodeEncodeError), err:
658 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Announce that resuming the partial download was not possible."""
	self.to_screen(u'[download] Unable to resume')
# NOTE(review): elided listing — the else branch (newline after the
# progress line) is missing from this copy.
664 def report_finish(self):
665 """Report download finished."""
# With the progress bar disabled, print an explicit completion line.
666 if self.params.get('noprogress', False):
667 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
	"""Advance the per-run download counter (feeds %(autonumber)s)."""
	self._num_downloads = self._num_downloads + 1
# NOTE(review): elided listing — the try: line, the return of the
# formatted filename and the `return None` in the except branch are
# missing from this copy.
675 def prepare_filename(self, info_dict):
676 """Generate the output filename."""
677 template_dict = dict(info_dict)
# Extra template fields beyond what the IE provides.
679 template_dict['epoch'] = unicode(long(time.time()))
680 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
681 filename = self.params['outtmpl'] % template_dict
683 except (ValueError, KeyError), err:
684 self.trouble(u'ERROR: invalid system charset or erroneous output template')
# Orchestrates one download: forced printing, overwrite check,
# directory creation, sidecar files, the actual download, post-processing.
# NOTE(review): elided listing — several try:/return/close() lines
# between the numbered lines are missing from this copy.
687 def process_info(self, info_dict):
688 """Process a single dictionary returned by an InfoExtractor."""
689 filename = self.prepare_filename(info_dict)
690 # Do nothing else if in simulate mode
691 if self.params.get('simulate', False):
# --get-title/--get-url/... style forced printing to stdout.
693 if self.params.get('forcetitle', False):
694 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
695 if self.params.get('forceurl', False):
696 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
697 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
698 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
699 if self.params.get('forcedescription', False) and 'description' in info_dict:
700 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
701 if self.params.get('forcefilename', False) and filename is not None:
702 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
708 if self.params.get('nooverwrites', False) and os.path.exists(filename):
709 self.to_stderr(u'WARNING: file exists and will be skipped')
713 self.pmkdir(filename)
714 except (OSError, IOError), err:
715 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
# Optional .description sidecar file (UTF-8 encoded).
718 if self.params.get('writedescription', False):
720 descfn = filename + '.description'
721 self.report_writedescription(descfn)
722 descfile = open(descfn, 'wb')
724 descfile.write(info_dict['description'].encode('utf-8'))
727 except (OSError, IOError):
728 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
# Optional .info.json sidecar; `json` may be the trivialjson fallback.
731 if self.params.get('writeinfojson', False):
732 infofn = filename + '.info.json'
733 self.report_writeinfojson(infofn)
736 except (NameError,AttributeError):
737 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
740 infof = open(infofn, 'wb')
742 json.dump(info_dict, infof)
745 except (OSError, IOError):
746 self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
# The download itself; URL is byte-encoded for urllib2.
750 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
751 except (OSError, IOError), err:
752 raise UnavailableVideoError
753 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
754 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
756 except (ContentTooShortError, ), err:
757 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
762 self.post_process(filename, info_dict)
763 except (PostProcessingError), err:
764 self.trouble(u'ERROR: postprocessing: %s' % str(err))
# NOTE(review): elided listing — the `for url in url_list:` /
# `for ie in self._ies:` loop headers, `continue`, and the
# ie.extract(url) call are missing from this copy.
767 def download(self, url_list):
768 """Download a given list of URLs."""
# A fixed (field-free) template cannot hold more than one file.
769 if len(url_list) > 1 and self.fixed_template():
770 raise SameFileError(self.params['outtmpl'])
773 suitable_found = False
775 # Go to next InfoExtractor if not suitable
776 if not ie.suitable(url):
779 # Suitable InfoExtractor found
780 suitable_found = True
782 # Extract information from URL and process it
785 # Suitable InfoExtractor had been found; go to next URL
788 if not suitable_found:
789 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
# 0 on success, 1 if trouble() was hit with ignoreerrors set.
791 return self._download_retcode
# NOTE(review): elided listing — the copy of ie_info into `info`, the
# loop over self._pps and the pp.run() call are missing from this copy.
793 def post_process(self, filename, ie_info):
794 """Run the postprocessing chain on the given file."""
796 info['filepath'] = filename
# Delegate RTMP URLs to the external rtmpdump binary, resuming with
# '-e' until it stops making progress.
# NOTE(review): elided listing — the try:/return lines, the break on
# stalled size, and the success return are missing from this copy.
802 def _download_with_rtmpdump(self, filename, url, player_url):
803 self.report_destination(filename)
804 tmpfilename = self.temp_name(filename)
806 # Check for rtmpdump first
# 'rtmpdump -h' as an availability probe; output discarded.
808 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
809 except (OSError, IOError):
810 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
813 # Download using rtmpdump. rtmpdump returns exit code 2 when
814 # the connection was interrumpted and resuming appears to be
815 # possible. This is part of rtmpdump's normal usage, AFAIK.
# [cond] indexing picks the optional argument lists (py2 idiom).
816 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
817 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
818 while retval == 2 or retval == 1:
819 prevsize = os.path.getsize(tmpfilename)
820 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
821 time.sleep(5.0) # This seems to be needed
822 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
823 cursize = os.path.getsize(tmpfilename)
# No growth and exit code 1 means rtmpdump is done/stuck — stop retrying.
824 if prevsize == cursize and retval == 1:
827 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
828 self.try_rename(tmpfilename, filename)
831 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
834 def _do_download(self, filename, url, player_url):
835 # Check file already present
836 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
837 self.report_file_already_downloaded(filename)
840 # Attempt to download using rtmpdump
841 if url.startswith('rtmp'):
842 return self._download_with_rtmpdump(filename, url, player_url)
844 tmpfilename = self.temp_name(filename)
848 # Do not include the Accept-Encoding header
849 headers = {'Youtubedl-no-compression': 'True'}
850 basic_request = urllib2.Request(url, None, headers)
851 request = urllib2.Request(url, None, headers)
853 # Establish possible resume length
854 if os.path.isfile(tmpfilename):
855 resume_len = os.path.getsize(tmpfilename)
859 # Request parameters in case of being able to resume
860 if self.params.get('continuedl', False) and resume_len != 0:
861 self.report_resuming_byte(resume_len)
862 request.add_header('Range','bytes=%d-' % resume_len)
866 retries = self.params.get('retries', 0)
867 while count <= retries:
868 # Establish connection
870 data = urllib2.urlopen(request)
872 except (urllib2.HTTPError, ), err:
873 if (err.code < 500 or err.code >= 600) and err.code != 416:
874 # Unexpected HTTP error
876 elif err.code == 416:
877 # Unable to resume (requested range not satisfiable)
879 # Open the connection again without the range header
880 data = urllib2.urlopen(basic_request)
881 content_length = data.info()['Content-Length']
882 except (urllib2.HTTPError, ), err:
883 if err.code < 500 or err.code >= 600:
886 # Examine the reported length
887 if (content_length is not None and
888 (resume_len - 100 < long(content_length) < resume_len + 100)):
889 # The file had already been fully downloaded.
890 # Explanation to the above condition: in issue #175 it was revealed that
891 # YouTube sometimes adds or removes a few bytes from the end of the file,
892 # changing the file size slightly and causing problems for some users. So
893 # I decided to implement a suggested change and consider the file
894 # completely downloaded if the file size differs less than 100 bytes from
895 # the one in the hard drive.
896 self.report_file_already_downloaded(filename)
897 self.try_rename(tmpfilename, filename)
900 # The length does not match, we start the download over
901 self.report_unable_to_resume()
907 self.report_retry(count, retries)
910 self.trouble(u'ERROR: giving up after %s retries' % retries)
913 data_len = data.info().get('Content-length', None)
914 if data_len is not None:
915 data_len = long(data_len) + resume_len
916 data_len_str = self.format_bytes(data_len)
917 byte_counter = 0 + resume_len
923 data_block = data.read(block_size)
925 if len(data_block) == 0:
927 byte_counter += len(data_block)
929 # Open file just in time
932 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
933 filename = self.undo_temp_name(tmpfilename)
934 self.report_destination(filename)
935 except (OSError, IOError), err:
936 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
939 stream.write(data_block)
940 except (IOError, OSError), err:
941 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
943 block_size = self.best_block_size(after - before, len(data_block))
946 percent_str = self.calc_percent(byte_counter, data_len)
947 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
948 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
949 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
952 self.slow_down(start, byte_counter - resume_len)
956 if data_len is not None and byte_counter != data_len:
957 raise ContentTooShortError(byte_counter, long(data_len))
958 self.try_rename(tmpfilename, filename)
960 # Update file modification time
961 if self.params.get('updatetime', True):
962 self.try_utime(filename, data.info().get('last-modified', None))
966 class InfoExtractor(object):
967 """Information Extractor class.
969 Information extractors are the classes that, given a URL, extract
970 information from the video (or videos) the URL refers to. This
971 information includes the real video URL, the video title and simplified
972 title, author and others. The information is stored in a dictionary
973 which is then passed to the FileDownloader. The FileDownloader
974 processes this information possibly downloading the video to the file
975 system, among other possible outcomes. The dictionaries must include
976 the following fields:
978 id: Video identifier.
979 url: Final video URL.
980 uploader: Nickname of the video uploader.
981 title: Literal title.
982 stitle: Simplified title.
983 ext: Video filename extension.
984 format: Video format.
985 player_url: SWF Player URL (may be None).
987 The following fields are optional. Their primary purpose is to allow
988 youtube-dl to serve as the backend for a video search function, such
989 as the one in youtube2mp3. They are only used when their respective
990 forced printing functions are called:
992 thumbnail: Full URL to a video thumbnail image.
993 description: One-line video description.
995 Subclasses of this one should re-define the _real_initialize() and
996 _real_extract() methods, as well as the suitable() static method.
997 Probably, they should also be instantiated and added to the main
# NOTE(review): elided listing — the `self._ready = False` line is
# missing from this copy.
1004 def __init__(self, downloader=None):
1005 """Constructor. Receives an optional downloader."""
1007 self.set_downloader(downloader)
# NOTE(review): elided listing — the `def suitable(url):` line above
# this docstring, the _ready guard in initialize() and the
# `self._ready = True` line are missing from this copy.
1011 """Receives a URL and returns True if suitable for this IE."""
1014 def initialize(self):
1015 """Initializes an instance (authentication, etc)."""
1017 self._real_initialize()
# NOTE(review): elided listing — the `self.initialize()` call before
# the extraction is missing from this copy.
1020 def extract(self, url):
1021 """Extracts URL information and returns it in list of dicts."""
1023 return self._real_extract(url)
def set_downloader(self, downloader):
	"""Remember *downloader* as the FileDownloader this extractor
	reports progress and results to (may be None)."""
	self._downloader = downloader
# Template-method hooks overridden by concrete extractors.
# NOTE(review): elided listing — the `pass` bodies are missing from
# this copy.
1029 def _real_initialize(self):
1030 """Real initialization process. Redefine in subclasses."""
1033 def _real_extract(self, url):
1034 """Real extraction process. Redefine in subclasses."""
1037 class YoutubeIE(InfoExtractor):
1038 """Information extractor for youtube.com."""
# Conditional group (?(1)...) requires the trailing junk only when the
# scheme/host group matched, so bare 11-char IDs are also accepted.
1040 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1041 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1042 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1043 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1044 _NETRC_MACHINE = 'youtube'
1045 # Listed in order of quality
1046 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
# NOTE(review): elided listing — most _video_extensions entries, the
# closing brace and the `def suitable(url):` line are missing here.
1047 _video_extensions = {
1053 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1060 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
	"""Announce the attempt to switch the site language."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] Setting language')
def report_login(self):
	"""Announce the login attempt."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] Logging in')
def report_age_confirmation(self):
	"""Announce the age-confirmation attempt."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
	"""Announce that the watch page for *video_id* is being fetched."""
	msg = u'[youtube] %s: Downloading video webpage' % video_id
	self._downloader.to_screen(msg)
def report_video_info_webpage_download(self, video_id):
	"""Announce that the video-info page for *video_id* is being fetched."""
	msg = u'[youtube] %s: Downloading video info webpage' % video_id
	self._downloader.to_screen(msg)
def report_information_extraction(self, video_id):
	"""Announce that metadata extraction for *video_id* has started."""
	msg = u'[youtube] %s: Extracting video information' % video_id
	self._downloader.to_screen(msg)
def report_unavailable_format(self, video_id, format):
	"""Announce that *format* is not offered for *video_id*."""
	msg = u'[youtube] %s: Format %s not available' % (video_id, format)
	self._downloader.to_screen(msg)
def report_rtmp_download(self):
	"""Announce that the video will be fetched over RTMP."""
	downloader = self._downloader
	downloader.to_screen(u'[youtube] RTMP download detected')
# Pre-download setup: resolve credentials, force English pages, log in,
# confirm age. All network failures degrade to warnings except age
# confirmation, which goes through trouble().
# NOTE(review): elided listing — returns, try: lines, the netrc
# credential unpacking and parts of the form dicts are missing from
# this copy.
1094 def _real_initialize(self):
1095 if self._downloader is None:
1100 downloader_params = self._downloader.params
1102 # Attempt to use provided username and password or .netrc data
1103 if downloader_params.get('username', None) is not None:
1104 username = downloader_params['username']
1105 password = downloader_params['password']
1106 elif downloader_params.get('usenetrc', False):
1108 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1109 if info is not None:
1113 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1114 except (IOError, netrc.NetrcParseError), err:
1115 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Hit the language URL so subsequent pages come back in English.
1119 request = urllib2.Request(self._LANG_URL)
1122 urllib2.urlopen(request).read()
1123 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1124 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1127 # No authentication to be performed
1128 if username is None:
# POST the login form; failure is detected by the form re-appearing.
1133 'current_form': 'loginForm',
1135 'action_login': 'Log In',
1136 'username': username,
1137 'password': password,
1139 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1142 login_results = urllib2.urlopen(request).read()
1143 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1144 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1147 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Age confirmation form so age-restricted videos are reachable.
1153 'action_confirm': 'Confirm',
1155 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1157 self.report_age_confirmation()
1158 age_results = urllib2.urlopen(request).read()
1159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1160 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main YouTube extraction: download the watch page and get_video_info,
# pick the format(s) to fetch, and hand each one to process_info().
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `return`s, `else:` branches) are elided from view.
1163 def _real_extract(self, url):
1164 # Extract video id from URL
1165 mobj = re.match(self._VALID_URL, url)
1167 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1169 video_id = mobj.group(2)
1172 self.report_video_webpage_download(video_id)
1173 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1175 video_webpage = urllib2.urlopen(request).read()
1176 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1177 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The SWF URL appears JS-escaped ("http:\/\/...") in the page, so the
# backslash-escapes are stripped after matching.
1180 # Attempt to extract SWF player URL
1181 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1182 if mobj is not None:
1183 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' variants of get_video_info until one returns a 'token'.
1188 self.report_video_info_webpage_download(video_id)
1189 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1190 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1191 % (video_id, el_type))
1192 request = urllib2.Request(video_info_url)
1194 video_info_webpage = urllib2.urlopen(request).read()
1195 video_info = parse_qs(video_info_webpage)
1196 if 'token' in video_info:
1198 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1199 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# No token after all attempts: surface YouTube's own 'reason' if present.
1201 if 'token' not in video_info:
1202 if 'reason' in video_info:
1203 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1205 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1208 # Start extracting information
1209 self.report_information_extraction(video_id)
1212 if 'author' not in video_info:
1213 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1215 video_uploader = urllib.unquote_plus(video_info['author'][0])
1218 if 'title' not in video_info:
1219 self._downloader.trouble(u'ERROR: unable to extract video title')
1221 video_title = urllib.unquote_plus(video_info['title'][0])
1222 video_title = video_title.decode('utf-8')
1223 video_title = sanitize_title(video_title)
# simple_title: collapse anything outside simple_title_chars to '_'.
1226 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1227 simple_title = simple_title.strip(ur'_')
1230 if 'thumbnail_url' not in video_info:
1231 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1232 video_thumbnail = ''
1233 else: # don't panic if we can't find it
1234 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and normalized to YYYYMMDD by
# trying several textual date formats.
1238 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1239 if mobj is not None:
1240 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1241 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1242 for expression in format_expressions:
1244 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# Description: only fetched when the user asked for it; falls back from the
# <meta> tag to an lxml XPath over the page body.
1252 video_description = u'No description available.'
1253 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1254 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1255 if mobj is not None:
1256 video_description = mobj.group(1).decode('utf-8')
1258 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1259 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1260 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1261 # TODO use another parser
1264 video_token = urllib.unquote_plus(video_info['token'][0])
# Format selection: RTMP 'conn' wins outright; otherwise build an
# itag -> URL map from url_encoded_fmt_stream_map and apply -f / --max-quality.
1266 # Decide which formats to download
1267 req_format = self._downloader.params.get('format', None)
1269 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1270 self.report_rtmp_download()
1271 video_url_list = [(None, video_info['conn'][0])]
1272 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1273 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1274 url_data = [parse_qs(uds) for uds in url_data_strs]
1275 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1276 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1278 format_limit = self._downloader.params.get('format_limit', None)
1279 if format_limit is not None and format_limit in self._available_formats:
1280 format_list = self._available_formats[self._available_formats.index(format_limit):]
1282 format_list = self._available_formats
1283 existing_formats = [x for x in format_list if x in url_map]
1284 if len(existing_formats) == 0:
1285 self._downloader.trouble(u'ERROR: no known formats available for video')
1287 if req_format is None:
1288 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1289 elif req_format == '-1':
1290 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1293 if req_format not in url_map:
1294 self._downloader.trouble(u'ERROR: requested format not available')
1296 video_url_list = [(req_format, url_map[req_format])] # Specific format
1298 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# One process_info() call per selected format; each counts as a download.
1301 for format_param, video_real_url in video_url_list:
1302 # At this point we have a new video
1303 self._downloader.increment_downloads()
1306 video_extension = self._video_extensions.get(format_param, 'flv')
1309 # Process video information
1310 self._downloader.process_info({
1311 'id': video_id.decode('utf-8'),
1312 'url': video_real_url.decode('utf-8'),
1313 'uploader': video_uploader.decode('utf-8'),
1314 'upload_date': upload_date,
1315 'title': video_title,
1316 'stitle': simple_title,
1317 'ext': video_extension.decode('utf-8'),
# RTMP entries carry format_param=None, hence the and/or fallback to u'NA'.
1318 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1319 'thumbnail': video_thumbnail.decode('utf-8'),
1320 'description': video_description,
1321 'player_url': player_url,
1323 except UnavailableVideoError, err:
1324 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1327 class MetacafeIE(InfoExtractor):
1328 """Information Extractor for metacafe.com."""
# URL groups: (1) video id, (2) simplified title slug.
1330 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1331 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1332 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# A YoutubeIE instance is injected so 'yt-' prefixed ids can be delegated.
1335 def __init__(self, youtube_ie, downloader=None):
1336 InfoExtractor.__init__(self, downloader)
1337 self._youtube_ie = youtube_ie
1341 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1343 def report_disclaimer(self):
1344 """Report disclaimer retrieval."""
1345 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1347 def report_age_confirmation(self):
1348 """Report attempt to confirm age."""
1349 self._downloader.to_screen(u'[metacafe] Confirming age')
1351 def report_download_webpage(self, video_id):
1352 """Report webpage download."""
1353 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1355 def report_extraction(self, video_id):
1356 """Report information extraction."""
1357 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Session setup: fetch the family-filter disclaimer page, then POST the
# over-18 confirmation so later video pages are not filtered.
1359 def _real_initialize(self):
1360 # Retrieve disclaimer
1361 request = urllib2.Request(self._DISCLAIMER)
1363 self.report_disclaimer()
1364 disclaimer = urllib2.urlopen(request).read()
1365 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1366 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1372 'submit': "Continue - I'm over 18",
1374 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1376 self.report_age_confirmation()
1377 disclaimer = urllib2.urlopen(request).read()
1378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1379 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1382 def _real_extract(self, url):
1383 # Extract id and simplified title from URL
1384 mobj = re.match(self._VALID_URL, url)
1386 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1389 video_id = mobj.group(1)
# 'yt-XXXX' ids are YouTube embeds: hand them to the injected YoutubeIE.
1391 # Check if video comes from YouTube
1392 mobj2 = re.match(r'^yt-(.*)$', video_id)
1393 if mobj2 is not None:
1394 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1397 # At this point we have a new video
1398 self._downloader.increment_downloads()
1400 simple_title = mobj.group(2).decode('utf-8')
1402 # Retrieve video webpage to extract further information
1403 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1405 self.report_download_webpage(video_id)
1406 webpage = urllib2.urlopen(request).read()
1407 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1408 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
# Media URL: first try the plain &mediaURL= parameter (optionally signed
# with gdaKey); otherwise fall back to the flashvars 'mediaData' JSON blob.
1411 # Extract URL, uploader and title from webpage
1412 self.report_extraction(video_id)
1413 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1414 if mobj is not None:
1415 mediaURL = urllib.unquote(mobj.group(1))
1416 video_extension = mediaURL[-3:]
1418 # Extract gdaKey if available
1419 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1421 video_url = mediaURL
1423 gdaKey = mobj.group(1)
1424 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1426 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1428 self._downloader.trouble(u'ERROR: unable to extract media URL')
1430 vardict = parse_qs(mobj.group(1))
1431 if 'mediaData' not in vardict:
1432 self._downloader.trouble(u'ERROR: unable to extract media URL')
1434 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1436 self._downloader.trouble(u'ERROR: unable to extract media URL')
1438 mediaURL = mobj.group(1).replace('\\/', '/')
1439 video_extension = mediaURL[-3:]
1440 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1442 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1444 self._downloader.trouble(u'ERROR: unable to extract title')
1446 video_title = mobj.group(1).decode('utf-8')
1447 video_title = sanitize_title(video_title)
1449 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1451 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1453 video_uploader = mobj.group(1)
1456 # Process video information
1457 self._downloader.process_info({
1458 'id': video_id.decode('utf-8'),
1459 'url': video_url.decode('utf-8'),
1460 'uploader': video_uploader.decode('utf-8'),
1461 'upload_date': u'NA',
1462 'title': video_title,
1463 'stitle': simple_title,
1464 'ext': video_extension.decode('utf-8'),
1468 except UnavailableVideoError:
1469 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1472 class DailymotionIE(InfoExtractor):
1473 """Information Extractor for Dailymotion"""
# URL groups: (1) video id (before '_'), (2) simplified title slug.
1475 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1477 def __init__(self, downloader=None):
1478 InfoExtractor.__init__(self, downloader)
1482 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1484 def report_download_webpage(self, video_id):
1485 """Report webpage download."""
1486 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1488 def report_extraction(self, video_id):
1489 """Report information extraction."""
1490 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No session setup required for Dailymotion (body elided here).
1492 def _real_initialize(self):
1495 def _real_extract(self, url):
1496 # Extract id and simplified title from URL
1497 mobj = re.match(self._VALID_URL, url)
1499 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1502 # At this point we have a new video
1503 self._downloader.increment_downloads()
1504 video_id = mobj.group(1)
1506 simple_title = mobj.group(2).decode('utf-8')
1507 video_extension = 'flv'
1509 # Retrieve video webpage to extract further information
1510 request = urllib2.Request(url)
1512 self.report_download_webpage(video_id)
1513 webpage = urllib2.urlopen(request).read()
1514 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1515 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
# The media URL is pulled out of the page's addVariable("video", ...) call.
1518 # Extract URL, uploader and title from webpage
1519 self.report_extraction(video_id)
1520 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1522 self._downloader.trouble(u'ERROR: unable to extract media URL')
1524 mediaURL = urllib.unquote(mobj.group(1))
1526 # if needed add http://www.dailymotion.com/ if relative URL
1528 video_url = mediaURL
1530 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1531 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1533 self._downloader.trouble(u'ERROR: unable to extract title')
1535 video_title = mobj.group(1).decode('utf-8')
1536 video_title = sanitize_title(video_title)
1538 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1540 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1542 video_uploader = mobj.group(1)
1545 # Process video information
1546 self._downloader.process_info({
1547 'id': video_id.decode('utf-8'),
1548 'url': video_url.decode('utf-8'),
1549 'uploader': video_uploader.decode('utf-8'),
1550 'upload_date': u'NA',
1551 'title': video_title,
1552 'stitle': simple_title,
1553 'ext': video_extension.decode('utf-8'),
1557 except UnavailableVideoError:
1558 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1560 class GoogleIE(InfoExtractor):
1561 """Information extractor for video.google.com."""
# Group (1) is the (possibly negative) numeric docid.
1563 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1565 def __init__(self, downloader=None):
1566 InfoExtractor.__init__(self, downloader)
1570 return (re.match(GoogleIE._VALID_URL, url) is not None)
1572 def report_download_webpage(self, video_id):
1573 """Report webpage download."""
1574 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1576 def report_extraction(self, video_id):
1577 """Report information extraction."""
1578 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1580 def _real_initialize(self):
1583 def _real_extract(self, url):
1584 # Extract id from URL
1585 mobj = re.match(self._VALID_URL, url)
1587 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1590 # At this point we have a new video
1591 self._downloader.increment_downloads()
1592 video_id = mobj.group(1)
1594 video_extension = 'mp4'
1596 # Retrieve video webpage to extract further information
1597 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1599 self.report_download_webpage(video_id)
1600 webpage = urllib2.urlopen(request).read()
1601 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1602 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Prefer the mp4 download_url; when absent, fall back to the flv videoUrl
# which is hex-escaped (\x3d, \x26) and needs unescaping.
1605 # Extract URL, uploader, and title from webpage
1606 self.report_extraction(video_id)
1607 mobj = re.search(r"download_url:'([^']+)'", webpage)
1609 video_extension = 'flv'
1610 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1612 self._downloader.trouble(u'ERROR: unable to extract media URL')
1614 mediaURL = urllib.unquote(mobj.group(1))
1615 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1616 mediaURL = mediaURL.replace('\\x26', '\x26')
1618 video_url = mediaURL
1620 mobj = re.search(r'<title>(.*)</title>', webpage)
1622 self._downloader.trouble(u'ERROR: unable to extract title')
1624 video_title = mobj.group(1).decode('utf-8')
1625 video_title = sanitize_title(video_title)
1626 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1628 # Extract video description
1629 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1631 self._downloader.trouble(u'ERROR: unable to extract video description')
1633 video_description = mobj.group(1).decode('utf-8')
1634 if not video_description:
1635 video_description = 'No description available.'
# Thumbnail requires a second request to the search page; abs(int(...))
# because the docid can be a negative integer string — TODO confirm.
1637 # Extract video thumbnail
1638 if self._downloader.params.get('forcethumbnail', False):
1639 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1641 webpage = urllib2.urlopen(request).read()
1642 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1643 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1645 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1647 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1649 video_thumbnail = mobj.group(1)
1650 else: # we need something to pass to process_info
1651 video_thumbnail = ''
1655 # Process video information
1656 self._downloader.process_info({
1657 'id': video_id.decode('utf-8'),
1658 'url': video_url.decode('utf-8'),
1660 'upload_date': u'NA',
1661 'title': video_title,
1662 'stitle': simple_title,
1663 'ext': video_extension.decode('utf-8'),
1667 except UnavailableVideoError:
1668 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1671 class PhotobucketIE(InfoExtractor):
1672 """Information extractor for photobucket.com."""
# Group (1) is the .flv filename from the ?current= query parameter.
1674 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1676 def __init__(self, downloader=None):
1677 InfoExtractor.__init__(self, downloader)
1681 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1683 def report_download_webpage(self, video_id):
1684 """Report webpage download."""
1685 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1687 def report_extraction(self, video_id):
1688 """Report information extraction."""
1689 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1691 def _real_initialize(self):
1694 def _real_extract(self, url):
1695 # Extract id from URL
1696 mobj = re.match(self._VALID_URL, url)
1698 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1701 # At this point we have a new video
1702 self._downloader.increment_downloads()
1703 video_id = mobj.group(1)
1705 video_extension = 'flv'
1707 # Retrieve video webpage to extract further information
1708 request = urllib2.Request(url)
1710 self.report_download_webpage(video_id)
1711 webpage = urllib2.urlopen(request).read()
1712 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1713 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Media URL comes from the page's video_src <link> element.
1716 # Extract URL, uploader, and title from webpage
1717 self.report_extraction(video_id)
1718 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1720 self._downloader.trouble(u'ERROR: unable to extract media URL')
1722 mediaURL = urllib.unquote(mobj.group(1))
1724 video_url = mediaURL
# One <title> regex yields both the title (group 1) and uploader (group 2).
1726 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1728 self._downloader.trouble(u'ERROR: unable to extract title')
1730 video_title = mobj.group(1).decode('utf-8')
1731 video_title = sanitize_title(video_title)
1732 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1734 video_uploader = mobj.group(2).decode('utf-8')
1737 # Process video information
1738 self._downloader.process_info({
1739 'id': video_id.decode('utf-8'),
1740 'url': video_url.decode('utf-8'),
1741 'uploader': video_uploader,
1742 'upload_date': u'NA',
1743 'title': video_title,
1744 'stitle': simple_title,
1745 'ext': video_extension.decode('utf-8'),
1749 except UnavailableVideoError:
1750 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1753 class YahooIE(InfoExtractor):
1754 """Information extractor for video.yahoo.com."""
1756 # _VALID_URL matches all Yahoo! Video URLs
1757 # _VPAGE_URL matches only the extractable '/watch/' URLs
1758 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1759 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1761 def __init__(self, downloader=None):
1762 InfoExtractor.__init__(self, downloader)
1766 return (re.match(YahooIE._VALID_URL, url) is not None)
1768 def report_download_webpage(self, video_id):
1769 """Report webpage download."""
1770 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1772 def report_extraction(self, video_id):
1773 """Report information extraction."""
1774 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1776 def _real_initialize(self):
# new_video=False marks the recursive second pass after URL rewriting,
# preventing a double increment_downloads() — TODO confirm, the guard
# using it is elided from this view.
1779 def _real_extract(self, url, new_video=True):
1780 # Extract ID from URL
1781 mobj = re.match(self._VALID_URL, url)
1783 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1786 # At this point we have a new video
1787 self._downloader.increment_downloads()
1788 video_id = mobj.group(2)
1789 video_extension = 'flv'
# Non-/watch/ URLs: scrape the real id/vid pair and recurse on the
# canonical English /watch/ URL.
1791 # Rewrite valid but non-extractable URLs as
1792 # extractable English language /watch/ URLs
1793 if re.match(self._VPAGE_URL, url) is None:
1794 request = urllib2.Request(url)
1796 webpage = urllib2.urlopen(request).read()
1797 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1798 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1801 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1803 self._downloader.trouble(u'ERROR: Unable to extract id field')
1805 yahoo_id = mobj.group(1)
1807 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1809 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1811 yahoo_vid = mobj.group(1)
1813 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1814 return self._real_extract(url, new_video=False)
1816 # Retrieve video webpage to extract further information
1817 request = urllib2.Request(url)
1819 self.report_download_webpage(video_id)
1820 webpage = urllib2.urlopen(request).read()
1821 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1822 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1825 # Extract uploader and title from webpage
1826 self.report_extraction(video_id)
1827 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1829 self._downloader.trouble(u'ERROR: unable to extract video title')
1831 video_title = mobj.group(1).decode('utf-8')
1832 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1834 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1836 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is 'people'/'profile' from the alternation,
# not the uploader name in group(2) — looks like an off-by-one; verify.
1838 video_uploader = mobj.group(1).decode('utf-8')
1840 # Extract video thumbnail
1841 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1843 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1845 video_thumbnail = mobj.group(1).decode('utf-8')
1847 # Extract video description
1848 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1850 self._downloader.trouble(u'ERROR: unable to extract video description')
1852 video_description = mobj.group(1).decode('utf-8')
1853 if not video_description: video_description = 'No description available.'
# Width/height feed the playlist request below; the server 401s without them.
1855 # Extract video height and width
1856 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1858 self._downloader.trouble(u'ERROR: unable to extract video height')
1860 yv_video_height = mobj.group(1)
1862 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1864 self._downloader.trouble(u'ERROR: unable to extract video width')
1866 yv_video_width = mobj.group(1)
1868 # Retrieve video playlist to extract media URL
1869 # I'm not completely sure what all these options are, but we
1870 # seem to need most of them, otherwise the server sends a 401.
1871 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1872 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1873 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1874 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1875 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1877 self.report_download_webpage(video_id)
1878 webpage = urllib2.urlopen(request).read()
1879 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1880 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1883 # Extract media URL from playlist XML
1884 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1886 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1888 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1889 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1892 # Process video information
1893 self._downloader.process_info({
1894 'id': video_id.decode('utf-8'),
1896 'uploader': video_uploader,
1897 'upload_date': u'NA',
1898 'title': video_title,
1899 'stitle': simple_title,
1900 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later (un-decoded) entries win. Likely a copy-paste slip —
# deduplicate when touching this code.
1901 'thumbnail': video_thumbnail.decode('utf-8'),
1902 'description': video_description,
1903 'thumbnail': video_thumbnail,
1904 'description': video_description,
1907 except UnavailableVideoError:
1908 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): original-file line numbers are embedded below and several
# lines (`try:` headers, `if mobj is None:` guards, `return`s) are elided.
1911 class VimeoIE(InfoExtractor):
1912 """Information extractor for vimeo.com."""
1914 # _VALID_URL matches Vimeo URLs
# NOTE(review): the dot in '(?:www|player).' is unescaped and matches any
# character; presumably '\.'  was intended — verify before relying on it.
1915 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1917 def __init__(self, downloader=None):
1918 InfoExtractor.__init__(self, downloader)
1922 return (re.match(VimeoIE._VALID_URL, url) is not None)
1924 def report_download_webpage(self, video_id):
1925 """Report webpage download."""
1926 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1928 def report_extraction(self, video_id):
1929 """Report information extraction."""
1930 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1932 def _real_initialize(self):
1935 def _real_extract(self, url, new_video=True):
1936 # Extract ID from URL
1937 mobj = re.match(self._VALID_URL, url)
1939 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1942 # At this point we have a new video
1943 self._downloader.increment_downloads()
1944 video_id = mobj.group(1)
# Metadata comes from the moogaloop XML endpoint, not the HTML page;
# std_headers supplies the browser-like User-Agent defined at file top.
1946 # Retrieve video webpage to extract further information
1947 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1949 self.report_download_webpage(video_id)
1950 webpage = urllib2.urlopen(request).read()
1951 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1952 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1955 # Now we begin extracting as much information as we can from what we
1956 # retrieved. First we extract the information common to all extractors,
1957 # and latter we extract those that are Vimeo specific.
1958 self.report_extraction(video_id)
1961 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1963 self._downloader.trouble(u'ERROR: unable to extract video title')
1965 video_title = mobj.group(1).decode('utf-8')
1966 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1969 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1971 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1973 video_uploader = mobj.group(1).decode('utf-8')
1975 # Extract video thumbnail
1976 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1978 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1980 video_thumbnail = mobj.group(1).decode('utf-8')
# Description extraction is disabled (commented out) and replaced by a
# placeholder value.
1982 # # Extract video description
1983 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1985 # self._downloader.trouble(u'ERROR: unable to extract video description')
1987 # video_description = mobj.group(1).decode('utf-8')
1988 # if not video_description: video_description = 'No description available.'
1989 video_description = 'Foo.'
# The play URL must carry the request signature and its expiry scraped
# from the same XML document.
1991 # Vimeo specific: extract request signature
1992 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1994 self._downloader.trouble(u'ERROR: unable to extract request signature')
1996 sig = mobj.group(1).decode('utf-8')
1998 # Vimeo specific: Extract request signature expiration
1999 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2001 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2003 sig_exp = mobj.group(1).decode('utf-8')
2005 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2008 # Process video information
2009 self._downloader.process_info({
2010 'id': video_id.decode('utf-8'),
2012 'uploader': video_uploader,
2013 'upload_date': u'NA',
2014 'title': video_title,
2015 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later entries win. Likely a copy-paste slip — deduplicate.
2017 'thumbnail': video_thumbnail.decode('utf-8'),
2018 'description': video_description,
2019 'thumbnail': video_thumbnail,
2020 'description': video_description,
2023 except UnavailableVideoError:
2024 self._downloader.trouble(u'ERROR: unable to download video')
2027 class GenericIE(InfoExtractor):
2028 """Generic last-resort information extractor."""
# Standard extractor constructor: just delegates to the InfoExtractor base.
# NOTE(review): original-file lines 2032-2036 are elided from this view.
2030 def __init__(self, downloader=None):
2031 InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
	"""Warn that the generic fallback is in use, then announce the page fetch."""
	screen = self._downloader.to_screen
	screen(u'WARNING: Falling back on generic information extractor.')
	screen(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
	"""Announce that generic metadata extraction for *video_id* has begun."""
	self._downloader.to_screen(
		u'[generic] %s: Extracting information' % video_id)
2046 def _real_initialize(self):
2049 def _real_extract(self, url):
2050 # At this point we have a new video
2051 self._downloader.increment_downloads()
2053 video_id = url.split('/')[-1]
2054 request = urllib2.Request(url)
2056 self.report_download_webpage(video_id)
2057 webpage = urllib2.urlopen(request).read()
2058 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2059 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2061 except ValueError, err:
2062 # since this is the last-resort InfoExtractor, if
2063 # this error is thrown, it'll be thrown here
2064 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2067 self.report_extraction(video_id)
2068 # Start with something easy: JW Player in SWFObject
2069 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2071 # Broaden the search a little bit
2072 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2074 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2077 # It's possible that one of the regexes
2078 # matched, but returned an empty group:
2079 if mobj.group(1) is None:
2080 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2083 video_url = urllib.unquote(mobj.group(1))
2084 video_id = os.path.basename(video_url)
2086 # here's a fun little line of code for you:
2087 video_extension = os.path.splitext(video_id)[1][1:]
2088 video_id = os.path.splitext(video_id)[0]
2090 # it's tempting to parse this further, but you would
2091 # have to take into account all the variations like
2092 # Video Title - Site Name
2093 # Site Name | Video Title
2094 # Video Title - Tagline | Site Name
2095 # and so on and so forth; it's just not practical
2096 mobj = re.search(r'<title>(.*)</title>', webpage)
2098 self._downloader.trouble(u'ERROR: unable to extract title')
2100 video_title = mobj.group(1).decode('utf-8')
2101 video_title = sanitize_title(video_title)
2102 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2104 # video uploader is domain name
2105 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2107 self._downloader.trouble(u'ERROR: unable to extract title')
2109 video_uploader = mobj.group(1).decode('utf-8')
2112 # Process video information
2113 self._downloader.process_info({
2114 'id': video_id.decode('utf-8'),
2115 'url': video_url.decode('utf-8'),
2116 'uploader': video_uploader,
2117 'upload_date': u'NA',
2118 'title': video_title,
2119 'stitle': simple_title,
2120 'ext': video_extension.decode('utf-8'),
2124 except UnavailableVideoError, err:
2125 self._downloader.trouble(u'\nERROR: unable to download video')
2128 class YoutubeSearchIE(InfoExtractor):
2129 """Information Extractor for YouTube search queries."""
# Query syntax: 'ytsearch:<terms>' (first hit), 'ytsearchN:<terms>' (N hits),
# or 'ytsearchall:<terms>' (capped at _max_youtube_results).
2130 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2131 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2132 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2133 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2135 _max_youtube_results = 1000
# Delegates the per-video work to an already-constructed YoutubeIE.
2137 def __init__(self, youtube_ie, downloader=None):
2138 InfoExtractor.__init__(self, downloader)
2139 self._youtube_ie = youtube_ie
# (def suitable(url) header elided in this listing; this is its body.)
2143 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2145 def report_download_page(self, query, pagenum):
2146 """Report attempt to download playlist page with given number."""
2147 query = query.decode(preferredencoding())
2148 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2150 def _real_initialize(self):
2151 self._youtube_ie.initialize()
# Parse the 'ytsearch[N|all]:' prefix and dispatch to _download_n_results.
2153 def _real_extract(self, query):
2154 mobj = re.match(self._VALID_QUERY, query)
2156 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2159 prefix, query = query.split(':')
2161 query = query.encode('utf-8')
# Empty prefix ('ytsearch:'): download just the first result.
2163 self._download_n_results(query, 1)
2165 elif prefix == 'all':
2166 self._download_n_results(query, self._max_youtube_results)
# (else-branch 'n = int(prefix)' and 'if n <= 0' lines elided in listing)
2172 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2174 elif n > self._max_youtube_results:
2175 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2176 n = self._max_youtube_results
2177 self._download_n_results(query, n)
2179 except ValueError: # parsing prefix as integer fails
2180 self._download_n_results(query, 1)
2183 def _download_n_results(self, query, n):
2184 """Downloads a specified number of results for a query"""
# Dedup across result pages; YouTube repeats ids between pages.
2187 already_seen = set()
# (video_ids/pagenum init and 'while True:' elided in this listing)
2191 self.report_download_page(query, pagenum)
2192 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2193 request = urllib2.Request(result_url)
2195 page = urllib2.urlopen(request).read()
2196 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2197 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2200 # Extract video identifiers
2201 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched 'href="/watch?v=..."' and carve out the id; [:-1] drops
# the trailing quote.
2202 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2203 if video_id not in already_seen:
2204 video_ids.append(video_id)
2205 already_seen.add(video_id)
2206 if len(video_ids) == n:
2207 # Specified n videos reached
# NOTE(review): 'id' shadows the builtin; left as-is to keep bytes identical.
2208 for id in video_ids:
2209 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: last results page -- extract what we collected and stop.
2212 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2213 for id in video_ids:
2214 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2217 pagenum = pagenum + 1
2219 class GoogleSearchIE(InfoExtractor):
2220 """Information Extractor for Google Video search queries."""
# Same shape as YoutubeSearchIE, with the 'gvsearch' prefix and Google Video
# URLs; delegates per-video extraction to a GoogleIE instance.
2221 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2222 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2223 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2224 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2226 _max_google_results = 1000
2228 def __init__(self, google_ie, downloader=None):
2229 InfoExtractor.__init__(self, downloader)
2230 self._google_ie = google_ie
# (def suitable(url) header elided in this listing; this is its body.)
2234 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2236 def report_download_page(self, query, pagenum):
2237 """Report attempt to download playlist page with given number."""
2238 query = query.decode(preferredencoding())
2239 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2241 def _real_initialize(self):
2242 self._google_ie.initialize()
# Parse the 'gvsearch[N|all]:' prefix and dispatch to _download_n_results.
2244 def _real_extract(self, query):
2245 mobj = re.match(self._VALID_QUERY, query)
2247 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2250 prefix, query = query.split(':')
2252 query = query.encode('utf-8')
2254 self._download_n_results(query, 1)
2256 elif prefix == 'all':
2257 self._download_n_results(query, self._max_google_results)
# (else-branch 'n = int(prefix)' and 'if n <= 0' lines elided in listing)
2263 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2265 elif n > self._max_google_results:
2266 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2267 n = self._max_google_results
2268 self._download_n_results(query, n)
2270 except ValueError: # parsing prefix as integer fails
2271 self._download_n_results(query, 1)
2274 def _download_n_results(self, query, n):
2275 """Downloads a specified number of results for a query"""
2278 already_seen = set()
# (video_ids/pagenum init and 'while True:' elided in this listing)
2282 self.report_download_page(query, pagenum)
2283 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2284 request = urllib2.Request(result_url)
2286 page = urllib2.urlopen(request).read()
2287 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2288 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2291 # Extract video identifiers
2292 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Here the id comes straight from the capturing group, unlike the
# span-slicing done in YoutubeSearchIE.
2293 video_id = mobj.group(1)
2294 if video_id not in already_seen:
2295 video_ids.append(video_id)
2296 already_seen.add(video_id)
2297 if len(video_ids) == n:
2298 # Specified n videos reached
2299 for id in video_ids:
2300 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker: last results page -- flush what we have and stop.
2303 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2304 for id in video_ids:
2305 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2308 pagenum = pagenum + 1
2310 class YahooSearchIE(InfoExtractor):
2311 """Information Extractor for Yahoo! Video search queries."""
# Third copy of the search-IE pattern ('yvsearch' prefix, Yahoo! Video URLs).
# NOTE(review): YoutubeSearchIE / GoogleSearchIE / YahooSearchIE are near
# clones; a shared base class would remove the triplication, but that is a
# larger refactor than this listing supports.
2312 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2313 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2314 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2315 _MORE_PAGES_INDICATOR = r'\s*Next'
2317 _max_yahoo_results = 1000
2319 def __init__(self, yahoo_ie, downloader=None):
2320 InfoExtractor.__init__(self, downloader)
2321 self._yahoo_ie = yahoo_ie
# (def suitable(url) header elided in this listing; this is its body.)
2325 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2327 def report_download_page(self, query, pagenum):
2328 """Report attempt to download playlist page with given number."""
2329 query = query.decode(preferredencoding())
2330 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2332 def _real_initialize(self):
2333 self._yahoo_ie.initialize()
# Parse the 'yvsearch[N|all]:' prefix and dispatch to _download_n_results.
2335 def _real_extract(self, query):
2336 mobj = re.match(self._VALID_QUERY, query)
2338 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2341 prefix, query = query.split(':')
2343 query = query.encode('utf-8')
2345 self._download_n_results(query, 1)
2347 elif prefix == 'all':
2348 self._download_n_results(query, self._max_yahoo_results)
# (else-branch 'n = int(prefix)' and 'if n <= 0' lines elided in listing)
2354 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2356 elif n > self._max_yahoo_results:
2357 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2358 n = self._max_yahoo_results
2359 self._download_n_results(query, n)
2361 except ValueError: # parsing prefix as integer fails
2362 self._download_n_results(query, 1)
2365 def _download_n_results(self, query, n):
2366 """Downloads a specified number of results for a query"""
2369 already_seen = set()
# (video_ids/pagenum init and 'while True:' elided in this listing)
2373 self.report_download_page(query, pagenum)
2374 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2375 request = urllib2.Request(result_url)
2377 page = urllib2.urlopen(request).read()
2378 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2379 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2382 # Extract video identifiers
2383 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2384 video_id = mobj.group(1)
2385 if video_id not in already_seen:
2386 video_ids.append(video_id)
2387 already_seen.add(video_id)
2388 if len(video_ids) == n:
2389 # Specified n videos reached
2390 for id in video_ids:
2391 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker: last results page -- flush what we have and stop.
2394 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2395 for id in video_ids:
2396 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2399 pagenum = pagenum + 1
2401 class YoutubePlaylistIE(InfoExtractor):
2402 """Information Extractor for YouTube playlists."""
# Handles view_play_list/my_playlists/artist/p/user-channel URL variants;
# group(1) is the playlist-type prefix ('p' or 'a'), group(2) the playlist id,
# group(3) an optional single-video id embedded in the URL.
2404 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2405 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2406 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2407 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2410 def __init__(self, youtube_ie, downloader=None):
2411 InfoExtractor.__init__(self, downloader)
2412 self._youtube_ie = youtube_ie
# (def suitable(url) header elided in this listing; this is its body.)
2416 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2418 def report_download_page(self, playlist_id, pagenum):
2419 """Report attempt to download playlist page with given number."""
2420 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2422 def _real_initialize(self):
2423 self._youtube_ie.initialize()
2425 def _real_extract(self, url):
2426 # Extract playlist id
2427 mobj = re.match(self._VALID_URL, url)
2429 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL points at a single video inside the playlist: extract just that one.
2433 if mobj.group(3) is not None:
2434 self._youtube_ie.extract(mobj.group(3))
2437 # Download playlist pages
2438 # prefix is 'p' as default for playlists but there are other types that need extra care
2439 playlist_prefix = mobj.group(1)
2440 if playlist_prefix == 'a':
2441 playlist_access = 'artist'
# (else: line elided in listing)
2443 playlist_prefix = 'p'
2444 playlist_access = 'view_play_list'
2445 playlist_id = mobj.group(2)
# (video_ids/pagenum init and 'while True:' elided in this listing)
2450 self.report_download_page(playlist_id, pagenum)
2451 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2453 page = urllib2.urlopen(request).read()
2454 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2455 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2458 # Extract video identifiers
# ids_in_page preserves playlist order while deduplicating within a page.
2460 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2461 if mobj.group(1) not in ids_in_page:
2462 ids_in_page.append(mobj.group(1))
2463 video_ids.extend(ids_in_page)
2465 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2467 pagenum = pagenum + 1
# Honor --playlist-start / --playlist-end (1-based options -> 0-based slice).
2469 playliststart = self._downloader.params.get('playliststart', 1) - 1
2470 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend of -1 this slice drops the final
# video; YoutubeUserIE below special-cases -1. Looks like a real off-by-one --
# confirm against option semantics before changing.
2471 video_ids = video_ids[playliststart:playlistend]
2473 for id in video_ids:
2474 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2477 class YoutubeUserIE(InfoExtractor):
2478 """Information Extractor for YouTube users."""
# Accepts 'youtube.com/user/NAME' URLs or the 'ytuser:NAME' shorthand and
# enumerates the user's uploads via the GData API, page by page.
2480 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2481 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2482 _GDATA_PAGE_SIZE = 50
2483 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2484 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2487 def __init__(self, youtube_ie, downloader=None):
2488 InfoExtractor.__init__(self, downloader)
2489 self._youtube_ie = youtube_ie
# (def suitable(url) header elided in this listing; this is its body.)
2493 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2495 def report_download_page(self, username, start_index):
2496 """Report attempt to download user page."""
2497 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2498 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2500 def _real_initialize(self):
2501 self._youtube_ie.initialize()
2503 def _real_extract(self, url):
# Extract username from the URL/shorthand.
2505 mobj = re.match(self._VALID_URL, url)
2507 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2510 username = mobj.group(1)
2512 # Download video ids using YouTube Data API. Result size per
2513 # query is limited (currently to 50 videos) so we need to query
2514 # page by page until there are no video ids - it means we got
# (video_ids/pagenum init and 'while True:' elided; GData start-index is
# 1-based, hence the +1 below)
2521 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2522 self.report_download_page(username, start_index)
2524 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2527 page = urllib2.urlopen(request).read()
2528 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2529 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2532 # Extract video identifiers
2535 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2536 if mobj.group(1) not in ids_in_page:
2537 ids_in_page.append(mobj.group(1))
2539 video_ids.extend(ids_in_page)
2541 # A little optimization - if current page is not
2542 # "full", ie. does not contain PAGE_SIZE video ids then
2543 # we can assume that this page is the last one - there
2544 # are no more ids on further pages - no need to query
2547 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# (break / pagenum increment elided in this listing)
2552 all_ids_count = len(video_ids)
2553 playliststart = self._downloader.params.get('playliststart', 1) - 1
2554 playlistend = self._downloader.params.get('playlistend', -1)
# Unlike YoutubePlaylistIE, the default playlistend of -1 is special-cased
# here so the last video is not sliced off.
2556 if playlistend == -1:
2557 video_ids = video_ids[playliststart:]
# (else: line elided in listing)
2559 video_ids = video_ids[playliststart:playlistend]
2561 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2562 (username, all_ids_count, len(video_ids)))
2564 for video_id in video_ids:
2565 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2568 class DepositFilesIE(InfoExtractor):
2569 """Information extractor for depositfiles.com"""
# The (?#locale) inline comment documents that the optional '../' path
# segment is a two-letter locale prefix.
2571 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2573 def __init__(self, downloader=None):
2574 InfoExtractor.__init__(self, downloader)
# (def suitable(url) header elided in this listing; this is its body.)
2578 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2580 def report_download_webpage(self, file_id):
2581 """Report webpage download."""
2582 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2584 def report_extraction(self, file_id):
2585 """Report information extraction."""
2586 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# No initialization needed (body elided in listing).
2588 def _real_initialize(self):
2591 def _real_extract(self, url):
2592 # At this point we have a new file
2593 self._downloader.increment_downloads()
2595 file_id = url.split('/')[-1]
2596 # Rebuild url in english locale
2597 url = 'http://depositfiles.com/en/files/' + file_id
2599 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
2600 free_download_indication = { 'gateway_result' : '1' }
2601 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2603 self.report_download_webpage(file_id)
2604 webpage = urllib2.urlopen(request).read()
2605 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2606 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2609 # Search for the real file URL
2610 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2611 if (mobj is None) or (mobj.group(1) is None):
2612 # Try to figure out reason of the error.
2613 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2614 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' should be a raw string (r'\s+') by convention; it works
# in Python 2 because \s is not a recognized string escape -- confirm before
# normalizing.
2615 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2616 self._downloader.trouble(u'ERROR: %s' % restriction_message)
# (else: line elided in listing)
2618 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2621 file_url = mobj.group(1)
2622 file_extension = os.path.splitext(file_url)[1][1:]
2624 # Search for file title
2625 mobj = re.search(r'<b title="(.*?)">', webpage)
2627 self._downloader.trouble(u'ERROR: unable to extract title')
2629 file_title = mobj.group(1).decode('utf-8')
2632 # Process file information
2633 self._downloader.process_info({
2634 'id': file_id.decode('utf-8'),
2635 'url': file_url.decode('utf-8'),
2637 'upload_date': u'NA',
2638 'title': file_title,
2639 'stitle': file_title,
2640 'ext': file_extension.decode('utf-8'),
2644 except UnavailableVideoError, err:
2645 self._downloader.trouble(u'ERROR: unable to download file')
2647 class FacebookIE(InfoExtractor):
2648 """Information Extractor for Facebook"""
# Requires login (command-line credentials or ~/.netrc machine 'facebook');
# scrapes video metadata and per-quality URLs out of JS on the video page.
2650 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2651 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2652 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this ordering.
2653 _available_formats = ['highqual', 'lowqual']
# (dict contents elided in this listing)
2654 _video_extensions = {
2659 def __init__(self, downloader=None):
2660 InfoExtractor.__init__(self, downloader)
# (def suitable(url) header elided in this listing; this is its body.)
2664 return (re.match(FacebookIE._VALID_URL, url) is not None)
2666 def _reporter(self, message):
2667 """Add header and report message."""
2668 self._downloader.to_screen(u'[facebook] %s' % message)
2670 def report_login(self):
2671 """Report attempt to log in."""
2672 self._reporter(u'Logging in')
2674 def report_video_webpage_download(self, video_id):
2675 """Report attempt to download video webpage."""
2676 self._reporter(u'%s: Downloading video webpage' % video_id)
2678 def report_information_extraction(self, video_id):
2679 """Report attempt to extract video information."""
2680 self._reporter(u'%s: Extracting video information' % video_id)
2682 def _parse_page(self, video_webpage):
2683 """Extract video information from page"""
# Map of metadata field -> scraping regex; missing fields are simply left
# out of the resulting dict (callers must check with 'in').
2685 data = {'title': r'class="video_title datawrap">(.*?)</',
2686 'description': r'<div class="datawrap">(.*?)</div>',
2687 'owner': r'\("video_owner_name", "(.*?)"\)',
2688 'upload_date': r'data-date="(.*?)"',
2689 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2692 for piece in data.keys():
2693 mobj = re.search(data[piece], video_webpage)
2694 if mobj is not None:
2695 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# (video_urls init elided in this listing)
2699 for fmt in self._available_formats:
2700 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2701 if mobj is not None:
2702 # URL is in a Javascript segment inside an escaped Unicode format within
2703 # the generally utf-8 page
2704 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2705 video_info['video_urls'] = video_urls
# Log in to Facebook before any extraction; aborts politely when no
# credentials are available.
2709 def _real_initialize(self):
2710 if self._downloader is None:
# (return and useremail/password init elided in this listing)
2715 downloader_params = self._downloader.params
2717 # Attempt to use provided username and password or .netrc data
2718 if downloader_params.get('username', None) is not None:
2719 useremail = downloader_params['username']
2720 password = downloader_params['password']
2721 elif downloader_params.get('usenetrc', False):
# (try: elided in listing)
2723 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2724 if info is not None:
# (credential unpacking / else elided in listing)
2728 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2729 except (IOError, netrc.NetrcParseError), err:
2730 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2733 if useremail is None:
# (login form construction elided in this listing)
2742 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2745 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication failed.
# NOTE(review): 'exceded' in the message below is a typo ('exceeded'); it is a
# runtime string, so it is only flagged here, not changed.
2746 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2747 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2749 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2750 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2753 def _real_extract(self, url):
2754 mobj = re.match(self._VALID_URL, url)
2756 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2758 video_id = mobj.group('ID')
2761 self.report_video_webpage_download(video_id)
2762 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2764 page = urllib2.urlopen(request)
2765 video_webpage = page.read()
2766 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2767 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2770 # Start extracting information
2771 self.report_information_extraction(video_id)
2773 # Extract information
2774 video_info = self._parse_page(video_webpage)
# uploader (mandatory)
2777 if 'owner' not in video_info:
2778 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2780 video_uploader = video_info['owner']
# title (mandatory)
2783 if 'title' not in video_info:
2784 self._downloader.trouble(u'ERROR: unable to extract video title')
2786 video_title = video_info['title']
2787 video_title = video_title.decode('utf-8')
2788 video_title = sanitize_title(video_title)
2791 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2792 simple_title = simple_title.strip(ur'_')
# thumbnail (optional -- warn and fall back to empty string)
2795 if 'thumbnail' not in video_info:
2796 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2797 video_thumbnail = ''
# (else: line elided in listing)
2799 video_thumbnail = video_info['thumbnail']
# upload date (optional; parsed as an RFC-2822 date into YYYYMMDD)
2803 if 'upload_date' in video_info:
2804 upload_time = video_info['upload_date']
2805 timetuple = email.utils.parsedate_tz(upload_time)
2806 if timetuple is not None:
# (try: elided in listing)
2808 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2813 video_description = video_info.get('description', 'No description available.')
2815 url_map = video_info['video_urls']
2816 if len(url_map.keys()) > 0:
2817 # Decide which formats to download
2818 req_format = self._downloader.params.get('format', None)
2819 format_limit = self._downloader.params.get('format_limit', None)
# With a format limit, only consider formats at or below that quality.
2821 if format_limit is not None and format_limit in self._available_formats:
2822 format_list = self._available_formats[self._available_formats.index(format_limit):]
# (else: line elided in listing)
2824 format_list = self._available_formats
2825 existing_formats = [x for x in format_list if x in url_map]
2826 if len(existing_formats) == 0:
2827 self._downloader.trouble(u'ERROR: no known formats available for video')
2829 if req_format is None:
2830 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2831 elif req_format == '-1':
2832 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# (else: line elided in listing)
2835 if req_format not in url_map:
2836 self._downloader.trouble(u'ERROR: requested format not available')
2838 video_url_list = [(req_format, url_map[req_format])] # Specific format
2840 for format_param, video_real_url in video_url_list:
2842 # At this point we have a new video
2843 self._downloader.increment_downloads()
# Extension is derived per-format; mp4 is used when the format is unknown.
2846 video_extension = self._video_extensions.get(format_param, 'mp4')
2849 # Process video information
2850 self._downloader.process_info({
2851 'id': video_id.decode('utf-8'),
2852 'url': video_real_url.decode('utf-8'),
2853 'uploader': video_uploader.decode('utf-8'),
2854 'upload_date': upload_date,
2855 'title': video_title,
2856 'stitle': simple_title,
2857 'ext': video_extension.decode('utf-8'),
2858 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2859 'thumbnail': video_thumbnail.decode('utf-8'),
2860 'description': video_description.decode('utf-8'),
2863 except UnavailableVideoError, err:
2864 self._downloader.trouble(u'\nERROR: unable to download video')
2866 class BlipTVIE(InfoExtractor):
2867 """Information extractor for blip.tv"""
# Uses blip.tv's JSON API (skin=json) instead of scraping HTML.
2869 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2870 _URL_EXT = r'^.*\.([a-z0-9]+)$'
# (def suitable(url) header elided in this listing; this is its body.)
2874 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2876 def report_extraction(self, file_id):
2877 """Report information extraction."""
2878 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
# Collapse runs of non-simple characters into '_' and strip edge underscores.
2880 def _simplify_title(self, title):
2881 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2882 res = res.strip(ur'_')
# (return res elided in this listing)
2885 def _real_extract(self, url):
2886 mobj = re.match(self._VALID_URL, url)
2888 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# (cchar selection -- '&' if the URL already has a query string, else '?' --
# elided in this listing)
2895 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2896 request = urllib2.Request(json_url)
2897 self.report_extraction(mobj.group(1))
2899 json_code = urllib2.urlopen(request).read()
2900 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2901 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# (try: elided in listing; json is either stdlib json or the trivialjson
# fallback imported at the top of the file)
2904 json_data = json.loads(json_code)
2905 if 'Post' in json_data:
2906 data = json_data['Post']
# (else: data = json_data elided in listing)
# blip.tv timestamps look like '11-28-09 12:00AM'; normalized to YYYYMMDD.
2910 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2911 video_url = data['media']['url']
2912 umobj = re.match(self._URL_EXT, video_url)
2914 raise ValueError('Can not determine filename extension')
2915 ext = umobj.group(1)
2917 self._downloader.increment_downloads()
# (info = { elided in listing)
2920 'id': data['item_id'],
2922 'uploader': data['display_name'],
2923 'upload_date': upload_date,
2924 'title': data['title'],
2925 'stitle': self._simplify_title(data['title']),
2927 'format': data['media']['mimeType'],
2928 'thumbnail': data['thumbnailUrl'],
2929 'description': data['description'],
2930 'player_url': data['embedUrl']
2932 except (ValueError,KeyError), err:
2933 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2937 self._downloader.process_info(info)
2938 except UnavailableVideoError, err:
2939 self._downloader.trouble(u'\nERROR: unable to download video')
2942 class PostProcessor(object):
2943 """Post Processor class.
2945 PostProcessor objects can be added to downloaders with their
2946 add_post_processor() method. When the downloader has finished a
2947 successful download, it will take its internal chain of PostProcessors
2948 and start calling the run() method on each one of them, first with
2949 an initial argument and then with the returned value of the previous
2952 The chain will be stopped if one of them ever returns None or the end
2953 of the chain is reached.
2955 PostProcessor objects follow a "mutual registration" process similar
2956 to InfoExtractor objects.
# Constructor: optionally bind a downloader at creation time.
2961 def __init__(self, downloader=None):
2962 self._downloader = downloader
2964 def set_downloader(self, downloader):
2965 """Sets the downloader for this PP."""
2966 self._downloader = downloader
2968 def run(self, information):
2969 """Run the PostProcessor.
2971 The "information" argument is a dictionary like the ones
2972 composed by InfoExtractors. The only difference is that this
2973 one has an extra field called "filepath" that points to the
2976 When this method returns None, the postprocessing chain is
2977 stopped. However, this method may return an information
2978 dictionary that will be passed to the next postprocessing
2979 object in the chain. It can be the one it received after
2980 changing some fields.
2982 In addition, this method may raise a PostProcessingError
2983 exception that will be taken into account by the downloader
# Base-class default: pass the info dict through unchanged.
2986 return information # by default, do nothing
2988 class FFmpegExtractAudioPP(PostProcessor):
# Post-processor that extracts the audio track of a downloaded video into a
# standalone file using the external ffmpeg/ffprobe binaries.
2990 def __init__(self, downloader=None, preferredcodec=None):
2991 PostProcessor.__init__(self, downloader)
2992 if preferredcodec is None:
2993 preferredcodec = 'best'
2994 self._preferredcodec = preferredcodec
# Probe the file's audio codec with ffprobe; returns the codec name, or None
# when ffprobe fails or no audio stream is found.
# NOTE(review): no 'self' parameter -- presumably decorated @staticmethod on
# the elided preceding line; confirm.
2997 def get_audio_codec(path):
# (try: elided in listing)
2999 cmd = ['ffprobe', '-show_streams', '--', path]
3000 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3001 output = handle.communicate()[0]
3002 if handle.wait() != 0:
# (return None elided in listing)
3004 except (IOError, OSError):
# (return None and audio_codec init elided in listing)
# ffprobe prints key=value lines per stream; remember the last codec_name
# seen and report it once an audio stream is confirmed.
3007 for line in output.split('\n'):
3008 if line.startswith('codec_name='):
3009 audio_codec = line.split('=')[1].strip()
3010 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# (return audio_codec elided in listing)
# Run ffmpeg to transcode/copy audio; returns True on exit status 0.
# NOTE(review): likewise presumably a @staticmethod -- decorator line elided.
3015 def run_ffmpeg(path, out_path, codec, more_opts):
3017 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3018 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
# (return (ret == 0) elided in listing)
3020 except (IOError, OSError):
# (return False elided in listing)
3023 def run(self, information):
3024 path = information['filepath']
3026 filecodec = self.get_audio_codec(path)
3027 if filecodec is None:
3028 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# (return None elided in listing)
# Choose target codec/extension: copy losslessly when the source already
# matches the request (or 'best'), otherwise transcode at 128k.
3032 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3033 if filecodec == 'aac' or filecodec == 'mp3':
3034 # Lossless if possible
# (acodec = 'copy' elided in listing)
3036 extension = filecodec
3037 if filecodec == 'aac':
# Raw AAC needs the ADTS container to be playable standalone.
3038 more_opts = ['-f', 'adts']
# (else-branch header lines elided in listing)
3041 acodec = 'libmp3lame'
3043 more_opts = ['-ab', '128k']
# (else: elided in listing)
3045 # We convert the audio (lossy)
3046 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3047 extension = self._preferredcodec
3048 more_opts = ['-ab', '128k']
3049 if self._preferredcodec == 'aac':
3050 more_opts += ['-f', 'adts']
3052 (prefix, ext) = os.path.splitext(path)
3053 new_path = prefix + '.' + extension
3054 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3055 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
# (if not status: elided in listing)
3058 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# (return None / 'try: os.remove(path)' elided in listing)
3063 except (IOError, OSError):
3064 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the new audio file for the rest of the PP chain.
3067 information['filepath'] = new_path
# (return information elided in listing)
3071 def updateSelf(downloader, filename):
3072 ''' Update the program file with the latest version from the repository '''
3073 # Note: downloader only used for options
# Bail out early when the script file itself is not writable.
3074 if not os.access(filename, os.W_OK):
3075 sys.exit('ERROR: no write permissions on %s' % filename)
3077 downloader.to_screen('Updating to latest version...')
# (try: elided in this listing; UPDATE_URL is defined at the top of the file)
3081 urlh = urllib.urlopen(UPDATE_URL)
3082 newcontent = urlh.read()
# (finally: urlh.close() elided in listing)
3085 except (IOError, OSError), err:
3086 sys.exit('ERROR: unable to download latest version')
# Overwrite the running script in binary mode with the downloaded content.
3089 outf = open(filename, 'wb')
3091 outf.write(newcontent)
# (finally: outf.close() elided in listing)
3094 except (IOError, OSError), err:
3095 sys.exit('ERROR: unable to overwrite current version')
3097 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# optparse helper: render an option's short+long spellings plus metavar on
# one line, e.g. "-o, --output TEMPLATE". Reaches into optparse internals
# (_short_opts/_long_opts). NOTE(review): the `opts = []` initialisation
# (original lines 3106-3108) is elided from this view.
3104 def _format_option_string(option):
3105 ''' ('-o', '--option') -> -o, --format METAVAR'''
3109 if option._short_opts: opts.append(option._short_opts[0])
3110 if option._long_opts: opts.append(option._long_opts[0])
# Insert the separator only when both spellings are present.
3111 if len(opts) > 1: opts.insert(1, ', ')
3113 if option.takes_value(): opts.append(' %s' % option.metavar)
3115 return "".join(opts)
# Best-effort terminal-width detection: prefer the COLUMNS environment
# variable, fall back to running `stty size`. The intermediate return/try
# lines (original 3119-3122) are elided from this view.
3117 def _find_term_columns():
3118 columns = os.environ.get('COLUMNS', None)
3123 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3124 out,err = sp.communicate()
# `stty size` prints "rows cols"; field [1] is the column count.
3125 return int(out.split()[1])
# Build the OptionParser: widen the help formatter to the console width
# (so help text is not wrapped on wide terminals) and plug in the custom
# option-string renderer above.
3131 max_help_position = 80
3133 # No need to wrap help messages if we're on a wide console
3134 columns = _find_term_columns()
# NOTE(review): the default max_width assignment is elided from this view.
3135 if columns: max_width = columns
3137 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3138 fmt.format_option_strings = _format_option_string
# OptionParser keyword arguments. The dict's opening/closing lines (and
# presumably a formatter entry) are elided from this view.
3141 'version' : __version__,
3143 'usage' : '%prog [options] url...',
# 'resolve' lets later add_option calls override built-ins such as -h/-v.
3144 'conflict_handler' : 'resolve',
3147 parser = optparse.OptionParser(**kw)
# Option groups keep --help output organised by topic; they are populated
# below and registered with the parser at the end of this function.
3150 general = optparse.OptionGroup(parser, 'General Options')
3151 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3152 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3153 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3154 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3155 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# General options. conflict_handler='resolve' (set on the parser) allows
# re-registering -h/--help and -v/--version here.
3157 general.add_option('-h', '--help',
3158 action='help', help='print this help text and exit')
3159 general.add_option('-v', '--version',
3160 action='version', help='print program version and exit')
3161 general.add_option('-U', '--update',
3162 action='store_true', dest='update_self', help='update this program to latest version')
3163 general.add_option('-i', '--ignore-errors',
3164 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
# Rate limit and retries arrive as strings; both are validated and
# converted later in the main program.
3165 general.add_option('-r', '--rate-limit',
3166 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3167 general.add_option('-R', '--retries',
3168 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
# Playlist bounds: -1 for playlistend means "until the last entry".
3169 general.add_option('--playlist-start',
3170 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3171 general.add_option('--playlist-end',
3172 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3173 general.add_option('--dump-user-agent',
3174 action='store_true', dest='dump_user_agent',
3175 help='display the current browser identification', default=False)
# Site-login credentials; -n pulls them from ~/.netrc instead.
3177 authentication.add_option('-u', '--username',
3178 dest='username', metavar='USERNAME', help='account username')
3179 authentication.add_option('-p', '--password',
3180 dest='password', metavar='PASSWORD', help='account password')
3181 authentication.add_option('-n', '--netrc',
3182 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# Video format selection; the string '-1' is a sentinel meaning "all
# available formats" (it also changes the default output template below).
3185 video_format.add_option('-f', '--format',
3186 action='store', dest='format', metavar='FORMAT', help='video format code')
3187 video_format.add_option('--all-formats',
3188 action='store_const', dest='format', help='download all available video formats', const='-1')
3189 video_format.add_option('--max-quality',
3190 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# Verbosity / simulation flags. Each --get-* flag implies both quiet and
# simulate; that wiring happens when the FileDownloader options are built.
3193 verbosity.add_option('-q', '--quiet',
3194 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3195 verbosity.add_option('-s', '--simulate',
3196 action='store_true', dest='simulate', help='do not download video', default=False)
3197 verbosity.add_option('-g', '--get-url',
3198 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3199 verbosity.add_option('-e', '--get-title',
3200 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3201 verbosity.add_option('--get-thumbnail',
3202 action='store_true', dest='getthumbnail',
3203 help='simulate, quiet but print thumbnail URL', default=False)
3204 verbosity.add_option('--get-description',
3205 action='store_true', dest='getdescription',
3206 help='simulate, quiet but print video description', default=False)
3207 verbosity.add_option('--get-filename',
3208 action='store_true', dest='getfilename',
3209 help='simulate, quiet but print output filename', default=False)
3210 verbosity.add_option('--no-progress',
3211 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3212 verbosity.add_option('--console-title',
3213 action='store_true', dest='consoletitle',
3214 help='display progress in console titlebar', default=False)
# Filesystem options: naming (-t/-l/-A/-o are mutually constrained --
# validated later in the main program), batch input, overwrite/resume
# behaviour, cookies, .part files, mtime, and metadata side-files.
3217 filesystem.add_option('-t', '--title',
3218 action='store_true', dest='usetitle', help='use title in file name', default=False)
3219 filesystem.add_option('-l', '--literal',
3220 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3221 filesystem.add_option('-A', '--auto-number',
3222 action='store_true', dest='autonumber',
3223 help='number downloaded files starting from 00000', default=False)
3224 filesystem.add_option('-o', '--output',
3225 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3226 filesystem.add_option('-a', '--batch-file',
3227 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3228 filesystem.add_option('-w', '--no-overwrites',
3229 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3230 filesystem.add_option('-c', '--continue',
3231 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3232 filesystem.add_option('--cookies',
3233 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3234 filesystem.add_option('--no-part',
3235 action='store_true', dest='nopart', help='do not use .part files', default=False)
# Note: store_false -- passing --no-mtime turns the (default-on) mtime
# update off.
3236 filesystem.add_option('--no-mtime',
3237 action='store_false', dest='updatetime',
3238 help='do not use the Last-modified header to set the file modification time', default=True)
3239 filesystem.add_option('--write-description',
3240 action='store_true', dest='writedescription',
3241 help='write video description to a .description file', default=False)
3242 filesystem.add_option('--write-info-json',
3243 action='store_true', dest='writeinfojson',
3244 help='write video metadata to a .info.json file', default=False)
# Post-processing: audio extraction via the FFmpegExtractAudioPP hook.
# The audioformat value is validated ('best'/'aac'/'mp3') in main.
3247 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3248 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3249 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3250 help='"best", "aac" or "mp3"; best by default')
# Registration order here controls the order of groups in --help output.
3253 parser.add_option_group(general)
3254 parser.add_option_group(filesystem)
3255 parser.add_option_group(verbosity)
3256 parser.add_option_group(video_format)
3257 parser.add_option_group(authentication)
3258 parser.add_option_group(postproc)
3260 opts, args = parser.parse_args()
# The parser itself is returned too so callers can use parser.error().
3262 return parser, opts, args
# --- main program body (the enclosing def line is elided from this view) ---
3265 parser, opts, args = parseOpts()
3267 # Open appropriate CookieJar
# No --cookies file: use an in-memory jar that is discarded on exit.
3268 if opts.cookiefile is None:
3269 jar = cookielib.CookieJar()
# Otherwise use a persistent Mozilla-format jar, pre-loaded if the file
# already exists and is readable (the else:/try:/.load() lines are elided
# from this view).
3272 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3273 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3275 except (IOError, OSError), err:
3276 sys.exit(u'ERROR: unable to open cookie file')
3279 if opts.dump_user_agent:
# Python 2 print statement; std_headers is the module-level default header dict.
3280 print std_headers['User-Agent']
3283 # General configuration
3284 cookie_processor = urllib2.HTTPCookieProcessor(jar)
# Install a global opener so every urllib2 request in the program shares
# proxy handling, the cookie jar, and the custom YoutubeDLHandler.
3285 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3286 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3288 # Batch file verification
# Collect URLs from the batch file ('-' means stdin; the stdin/try lines
# are elided from this view). Lines starting with '#', '/', or ';' are
# treated as comments and dropped.
3290 if opts.batchfile is not None:
3292 if opts.batchfile == '-':
3295 batchfd = open(opts.batchfile, 'r')
3296 batchurls = batchfd.readlines()
3297 batchurls = [x.strip() for x in batchurls]
3298 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3300 sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs come first, then the positional command-line arguments.
3301 all_urls = batchurls + args
3303 # Conflicting, missing and erroneous options
# Cross-option consistency checks; parser.error() prints usage and exits.
3304 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3305 parser.error(u'using .netrc conflicts with giving username/password')
3306 if opts.password is not None and opts.username is None:
3307 parser.error(u'account username missing')
3308 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3309 parser.error(u'using output template conflicts with using title, literal title or auto number')
3310 if opts.usetitle and opts.useliteral:
3311 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively instead of erroring.
3312 if opts.username is not None and opts.password is None:
3313 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalise the human-readable rate limit (e.g. '50k') to a numeric value.
3314 if opts.ratelimit is not None:
3315 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3316 if numeric_limit is None:
3317 parser.error(u'invalid rate limit specified')
3318 opts.ratelimit = numeric_limit
# Python 2 long(); the surrounding try: line is elided from this view.
3319 if opts.retries is not None:
3321 opts.retries = long(opts.retries)
3322 except (TypeError, ValueError), err:
3323 parser.error(u'invalid retry count specified')
# Playlist bounds: start must be >= 1; end is -1 (open-ended) or a value
# not smaller than start. The try: lines are elided from this view.
3325 opts.playliststart = int(opts.playliststart)
3326 if opts.playliststart <= 0:
3327 raise ValueError(u'Playlist start must be positive')
3328 except (TypeError, ValueError), err:
3329 parser.error(u'invalid playlist start number specified')
3331 opts.playlistend = int(opts.playlistend)
3332 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3333 raise ValueError(u'Playlist end must be greater than playlist start')
3334 except (TypeError, ValueError), err:
3335 parser.error(u'invalid playlist end number specified')
# --audio-format only accepts the codecs FFmpegExtractAudioPP supports.
3336 if opts.extractaudio:
3337 if opts.audioformat not in ['best', 'aac', 'mp3']:
3338 parser.error(u'invalid audio format specified')
3340 # Information extractors
# Instantiate one extractor per supported site. The YouTube playlist/user/
# search extractors (and Metacafe) delegate individual videos to the shared
# youtube_ie instance; likewise the Google/Yahoo search extractors wrap
# their site extractors.
3341 youtube_ie = YoutubeIE()
3342 metacafe_ie = MetacafeIE(youtube_ie)
3343 dailymotion_ie = DailymotionIE()
3344 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3345 youtube_user_ie = YoutubeUserIE(youtube_ie)
3346 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3347 google_ie = GoogleIE()
3348 google_search_ie = GoogleSearchIE(google_ie)
3349 photobucket_ie = PhotobucketIE()
3350 yahoo_ie = YahooIE()
3351 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3352 deposit_files_ie = DepositFilesIE()
3353 facebook_ie = FacebookIE()
3354 bliptv_ie = BlipTVIE()
3355 vimeo_ie = VimeoIE()
# Last-resort extractor for URLs no specific extractor claims.
3356 generic_ie = GenericIE()
# Build the FileDownloader with all resolved options. NOTE(review): the
# dict's closing brace line is elided from this view.
3359 fd = FileDownloader({
3360 'usenetrc': opts.usenetrc,
3361 'username': opts.username,
3362 'password': opts.password,
# Any --get-* flag implies quiet mode...
3363 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3364 'forceurl': opts.geturl,
3365 'forcetitle': opts.gettitle,
3366 'forcethumbnail': opts.getthumbnail,
3367 'forcedescription': opts.getdescription,
3368 'forcefilename': opts.getfilename,
# ...and simulation (no actual download).
3369 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3370 'format': opts.format,
3371 'format_limit': opts.format_limit,
# Output template: an explicit -o wins; otherwise the chained `or`s pick
# the first template matching the flag combination (--all-formats adds
# %(format)s, title/literal/autonumber add their fields), ending at the
# bare id default.
3372 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3373 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3374 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3375 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3376 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3377 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3378 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3379 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3380 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3381 or u'%(id)s.%(ext)s'),
3382 'ignoreerrors': opts.ignoreerrors,
3383 'ratelimit': opts.ratelimit,
3384 'nooverwrites': opts.nooverwrites,
3385 'retries': opts.retries,
3386 'continuedl': opts.continue_dl,
3387 'noprogress': opts.noprogress,
3388 'playliststart': opts.playliststart,
3389 'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') means logs must go to stderr.
3390 'logtostderr': opts.outtmpl == '-',
3391 'consoletitle': opts.consoletitle,
3392 'nopart': opts.nopart,
3393 'updatetime': opts.updatetime,
3394 'writedescription': opts.writedescription,
3395 'writeinfojson': opts.writeinfojson,
# Register extractors; order matters because the first extractor whose URL
# pattern matches wins (e.g. the search/playlist/user extractors must be
# tried before the plain YouTube one).
3397 fd.add_info_extractor(youtube_search_ie)
3398 fd.add_info_extractor(youtube_pl_ie)
3399 fd.add_info_extractor(youtube_user_ie)
3400 fd.add_info_extractor(metacafe_ie)
3401 fd.add_info_extractor(dailymotion_ie)
3402 fd.add_info_extractor(youtube_ie)
3403 fd.add_info_extractor(google_ie)
3404 fd.add_info_extractor(google_search_ie)
3405 fd.add_info_extractor(photobucket_ie)
3406 fd.add_info_extractor(yahoo_ie)
3407 fd.add_info_extractor(yahoo_search_ie)
3408 fd.add_info_extractor(deposit_files_ie)
3409 fd.add_info_extractor(facebook_ie)
3410 fd.add_info_extractor(bliptv_ie)
3411 fd.add_info_extractor(vimeo_ie)
3413 # This must come last since it's the
3414 # fallback if none of the others work
3415 fd.add_info_extractor(generic_ie)
# Attach the optional audio-extraction post-processor.
3418 if opts.extractaudio:
3419 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update replaces the running script (sys.argv[0]) in place.
3422 if opts.update_self:
3423 updateSelf(fd, sys.argv[0])
# No URLs is only an error when we weren't asked merely to self-update
# (the branch taken when update_self is set is elided from this view).
3426 if len(all_urls) < 1:
3427 if not opts.update_self:
3428 parser.error(u'you must provide at least one URL')
# Download everything; retcode presumably becomes the process exit status
# (the surrounding lines are elided) -- TODO confirm.
3431 retcode = fd.download(all_urls)
3433 # Dump cookie jar if requested
# Persist cookies back to --cookies FILE; the try:/jar.save() lines are
# elided from this view.
3434 if opts.cookiefile is not None:
3437 except (IOError, OSError), err:
3438 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: translate the expected top-level exceptions into
# clean exit messages (the try:/main-call lines and the DownloadError
# handler body are elided from this view).
3443 if __name__ == '__main__':
3446 except DownloadError:
3448 except SameFileError:
3449 sys.exit(u'ERROR: fixed output name but more than one file to download')
3450 except KeyboardInterrupt:
3451 sys.exit(u'\nERROR: Interrupted by user')
3453 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: