2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
16 __license__ = 'Public Domain'
17 __version__ = '2011.08.28-phihag'
19 UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl'
47 except ImportError: # Python 2.4
50 import cStringIO as StringIO
54 # parse_qs was moved from the cgi module to the urlparse module recently.
56 from urlparse import parse_qs
58 from cgi import parse_qs
# Fragment of the std_headers dict: default HTTP headers attached to every
# outgoing request (see YoutubeDLHandler.http_request below).  The dict's
# opening line is elided in this listing.
66 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
67 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
68 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
69 'Accept-Encoding': 'gzip, deflate',
70 'Accept-Language': 'en-us,en;q=0.5',
# Characters considered "safe" when building simplified titles.
# .decode('ascii') yields unicode objects under Python 2.
73 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# Fallback for Python < 2.6, where the stdlib `json` module does not exist:
# define a minimal pure-Python JSON decoder ("trivialjson").  Many interior
# lines are elided in this listing; only fragments of the nested parser
# functions (which close over the input string `s` and an index `i`) are
# visible.  Each parse* helper presumably returns (next_index, value) —
# TODO confirm against the full source.
77 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# Raise a ValueError carrying position and remaining-input context.
83 def raiseError(msg, i):
84 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
# Advance `i` past JSON whitespace; errors out on premature end when
# more input is expected.
85 def skipSpace(i, expectMore=True):
86 while i < len(s) and s[i] in ' \t\r\n':
90 raiseError('Premature end', i)
# Decode a single backslash escape matched by `rexp` below, including
# UTF-16 surrogate pairs (the \uDxxx\uDyyy branch).
92 def decodeEscape(match):
108 return unichr(int(esc[1:5], 16))
# len 5+6 == a high surrogate escape followed by "\uXXXX" (the low half).
109 if len(esc) == 5+6 and esc[5:7] == '\\u':
110 hi = int(esc[1:5], 16)
111 low = int(esc[7:11], 16)
# Standard surrogate-pair combination into a single code point.
112 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
113 raise ValueError('Unknown escape ' + str(esc))
# Find the true closing quote: count trailing backslashes before a
# candidate quote; an odd count means the quote itself is escaped.
120 while s[e-bslashes-1] == '\\':
122 if bslashes % 2 == 1:
# Matches surrogate-pair escapes first, then plain \uXXXX, then any
# single escaped char ('.' — or '$' for a dangling backslash).
126 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
127 stri = rexp.sub(decodeEscape, s[i:e])
# --- object parsing (fragment) ---
133 if s[i] == '}': # Empty dictionary
137 raiseError('Expected a string object key', i)
138 i,key = parseString(i)
140 if i >= len(s) or s[i] != ':':
141 raiseError('Expected a colon', i)
148 raiseError('Expected comma or closing curly brace', i)
# --- array parsing (fragment) ---
153 if s[i] == ']': # Empty array
158 i = skipSpace(i) # Raise exception if premature end
162 raiseError('Expected a comma or closing bracket', i)
# Parse the literals true/false/null by prefix match.
164 def parseDiscrete(i):
165 for k,v in {'true': True, 'false': False, 'null': None}.items():
166 if s.startswith(k, i):
168 raiseError('Not a boolean (or null)', i)
# --- number parsing: JSON number grammar; float when a '.' or exponent
# is present, int otherwise ---
170 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
172 raiseError('Not a number', i)
174 if '.' in nums or 'e' in nums or 'E' in nums:
175 return (i+len(nums), float(nums))
176 return (i+len(nums), int(nums))
# Dispatch table: first character of a value selects its parser;
# anything else is treated as a number.
177 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
180 i,res = CHARMAP.get(s[i], parseNumber)(i)
181 i = skipSpace(i, False)
# Trailing non-whitespace after the top-level value is an error.
185 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
# Returns the system's preferred text encoding.  Implemented via an inner
# generator whose first yield is taken with .next() (Python 2 idiom);
# the lines that yield the value / handle a broken locale are elided here.
188 def preferredencoding():
189 	"""Get preferred encoding.
191 	Returns the best encoding scheme for the system, based on
192 	locale.getpreferredencoding() and some further tweaks.
194 	def yield_preferredencoding():
196 		pref = locale.getpreferredencoding()
202 	return yield_preferredencoding().next()
# re.sub() callback: converts one HTML entity match to the corresponding
# unicode character.  Used by sanitize_title() below.
204 def htmlentity_transform(matchobj):
205 	"""Transforms an HTML entity to a Unicode character.
207 	This function receives a match object and is intended to be used with
208 	the re.sub() function.
210 	entity = matchobj.group(1)
212 	# Known non-numeric HTML entity
213 	if entity in htmlentitydefs.name2codepoint:
214 		return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: '#NNN' or hex '#xNNN'.  For hex, 'x...' is rewritten
# to '0x...' so long() can parse it with the chosen base (base selection
# line is elided in this listing).
217 	mobj = re.match(ur'(?u)#(x?\d+)', entity)
219 		numstr = mobj.group(1)
220 		if numstr.startswith(u'x'):
222 			numstr = u'0%s' % numstr
225 		return unichr(long(numstr, base))
227 	# Unknown entity in name, return its literal representation
228 	return (u'&%s;' % entity)
# Decode HTML entities, then replace the path separator so the title
# cannot escape into sub-directories when used in a filename.
230 def sanitize_title(utitle):
231 	"""Sanitizes a video title so it could be used as part of a filename."""
232 	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
233 	return utitle.replace(unicode(os.sep), u'%')
# Opens `filename`; if the first open() fails, retries once after stripping
# Windows-forbidden characters.  The special name '-' maps to stdout
# (switched to binary mode on win32).  Returns (stream, actual_filename).
235 def sanitize_open(filename, open_mode):
236 	"""Try to open the given filename, and slightly tweak it if this fails.
238 	Attempts to open the given filename. If this fails, it tries to change
239 	the filename slightly, step by step, until it's either able to open it
240 	or it fails and raises a final exception, like the standard open()
243 	It returns the tuple (stream, definitive_file_name).
# '-' branch (elided): write to stdout instead of a file.
247 		if sys.platform == 'win32':
# Binary mode on stdout avoids newline translation corrupting video data.
249 			msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
250 			return (sys.stdout, filename)
251 		stream = open(filename, open_mode)
252 		return (stream, filename)
253 	except (IOError, OSError), err:
254 		# In case of error, try to remove win32 forbidden chars
255 		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
257 		# An exception here should be caught in the caller
258 		stream = open(filename, open_mode)
259 		return (stream, filename)
# Parses an RFC 2822 date header into a Unix timestamp; the elided lines
# presumably initialize/return the timestamp (None on parse failure) —
# TODO confirm against the full source.
261 def timeconvert(timestr):
262 	"""Convert RFC 2822 defined time string into system timestamp"""
264 	timetuple = email.utils.parsedate_tz(timestr)
265 	if timetuple is not None:
266 		timestamp = email.utils.mktime_tz(timetuple)
# Raised by FileDownloader.trouble() when 'ignoreerrors' is not set.
269 class DownloadError(Exception):
270 	"""Download Error exception.
272 	This exception may be thrown by FileDownloader objects if they are not
273 	configured to continue on errors. They will contain the appropriate
# Raised by FileDownloader.download() when multiple URLs would all be
# written to one fixed (non-templated) output filename.
278 class SameFileError(Exception):
279 	"""Same File exception.
281 	This exception will be thrown by FileDownloader objects if they detect
282 	multiple files would have to be downloaded to the same file on disk.
# Raised by a PostProcessor's run() to signal failure; caught in
# FileDownloader.process_info().
286 class PostProcessingError(Exception):
287 	"""Post Processing exception.
289 	This exception may be raised by PostProcessor's .run() method to
290 	indicate an error in the postprocessing task.
# Raised (see process_info) when downloading the actual video data fails
# with an OS-level error, i.e. the requested format is not obtainable.
294 class UnavailableVideoError(Exception):
295 	"""Unavailable Format exception.
297 	This exception will be thrown when a video is requested
298 	in a format that is not available for that video.
# Raised by _do_download() when fewer bytes arrive than Content-Length
# announced.  Carries both counts so callers can report the difference.
302 class ContentTooShortError(Exception):
303 	"""Content Too Short exception.
305 	This exception may be raised by FileDownloader objects when a file they
306 	download is too small for what the server announced first, indicating
307 	the connection was probably interrupted.
# downloaded: bytes actually received; expected: server-announced length.
313 	def __init__(self, downloaded, expected):
314 		self.downloaded = downloaded
315 		self.expected = expected
# urllib2 handler that injects std_headers into every request and
# transparently decompresses gzip/deflate response bodies.
317 class YoutubeDLHandler(urllib2.HTTPHandler):
318 	"""Handler for HTTP requests and responses.
320 	This class, when installed with an OpenerDirector, automatically adds
321 	the standard headers to every HTTP request and handles gzipped and
322 	deflated responses from web servers. If compression is to be avoided in
323 	a particular request, the original request in the program code only has
324 	to include the HTTP header "Youtubedl-No-Compression", which will be
325 	removed before making the real request.
327 	Part of this code was copied from:
329 	http://techknack.net/python-urllib2-handlers/
331 	Andrew Rowls, the author of that code, agreed to release it to the
# deflate helper (def line elided): try raw deflate first (-MAX_WBITS,
# i.e. no zlib header), fall back to zlib-wrapped data.
338 			return zlib.decompress(data, -zlib.MAX_WBITS)
340 			return zlib.decompress(data)
# Build an addinfourl response object compatibly across Python versions:
# older addinfourl lacks the `code` constructor argument / getcode().
343 	def addinfourl_wrapper(stream, headers, url, code):
344 		if hasattr(urllib2.addinfourl, 'getcode'):
345 			return urllib2.addinfourl(stream, headers, url, code)
346 		ret = urllib2.addinfourl(stream, headers, url)
# Add std_headers to the outgoing request, then honor the internal
# Youtubedl-no-compression marker by stripping Accept-encoding.
350 	def http_request(self, req):
351 		for h in std_headers:
354 			req.add_header(h, std_headers[h])
355 		if 'Youtubedl-no-compression' in req.headers:
356 			if 'Accept-encoding' in req.headers:
357 				del req.headers['Accept-encoding']
358 			del req.headers['Youtubedl-no-compression']
# Rewrap the response with a decompressing stream when the server sent
# a compressed body; preserves the original status line (msg).
361 	def http_response(self, req, resp):
364 		if resp.headers.get('Content-encoding', '') == 'gzip':
365 			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
366 			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
367 			resp.msg = old_resp.msg
369 		if resp.headers.get('Content-encoding', '') == 'deflate':
370 			gz = StringIO.StringIO(self.deflate(resp.read()))
371 			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
372 			resp.msg = old_resp.msg
# Core downloader: owns the InfoExtractor chain, the output-template
# logic, the HTTP (and rtmpdump) download loops, and the post-processing
# chain.  Many interior lines are elided in this listing.
375 class FileDownloader(object):
376 	"""File Downloader class.
378 	File downloader objects are the ones responsible of downloading the
379 	actual video file and writing it to disk if the user has requested
380 	it, among some other tasks. In most cases there should be one per
381 	program. As, given a video URL, the downloader doesn't know how to
382 	extract all the needed information, task that InfoExtractors do, it
383 	has to pass the URL to one of them.
385 	For this, file downloader objects have a method that allows
386 	InfoExtractors to be registered in a given order. When it is passed
387 	a URL, the file downloader handles it to the first InfoExtractor it
388 	finds that reports being able to handle it. The InfoExtractor extracts
389 	all the information about the video or videos the URL refers to, and
390 	asks the FileDownloader to process the video information, possibly
391 	downloading the video.
393 	File downloaders accept a lot of parameters. In order not to saturate
394 	the object constructor with arguments, it receives a dictionary of
395 	options instead. These options are available through the params
396 	attribute for the InfoExtractors to use. The FileDownloader also
397 	registers itself as the downloader in charge for the InfoExtractors
398 	that are added to it, so this is a "mutual registration".
402 	username:	Username for authentication purposes.
403 	password:	Password for authentication purposes.
404 	usenetrc:	Use netrc for authentication instead.
405 	quiet:		Do not print messages to stdout.
406 	forceurl:	Force printing final URL.
407 	forcetitle:	Force printing title.
408 	forcethumbnail:	Force printing thumbnail URL.
409 	forcedescription:	Force printing description.
410 	forcefilename:	Force printing final filename.
411 	simulate:	Do not download the video files.
412 	format:		Video format code.
413 	format_limit:	Highest quality format to try.
414 	outtmpl:	Template for output names.
415 	ignoreerrors:	Do not stop on download errors.
416 	ratelimit:	Download speed limit, in bytes/sec.
417 	nooverwrites:	Prevent overwriting files.
418 	retries:	Number of times to retry for HTTP error 5xx
419 	continuedl:	Try to continue downloads if possible.
420 	noprogress:	Do not print the progress bar.
421 	playliststart:	Playlist item to start at.
422 	playlistend:	Playlist item to end at.
423 	logtostderr:	Log messages to stderr instead of stdout.
424 	consoletitle:	Display progress in console window's titlebar.
425 	nopart:		Do not use temporary .part files.
426 	updatetime:	Use the Last-modified header to set output file timestamps.
427 	writedescription:	Write the video description to a .description file
428 	writeinfojson:	Write the video description to a .info.json file
	# Class-level placeholders; real values are set per instance in __init__.
434 	_download_retcode = None
435 	_num_downloads = None
	# params: the options dictionary described in the class docstring.
438 	def __init__(self, params):
439 		"""Create a FileDownloader object with the given options."""
442 		self._download_retcode = 0
443 		self._num_downloads = 0
		# Boolean-index trick: stderr when logtostderr is truthy, else stdout.
444 		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
	# Create each parent directory of `filename` in turn ("mkdir -p").
448 	def pmkdir(filename):
449 		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
450 		components = filename.split(os.sep)
451 		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
452 		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
453 		for dir in aggregate:
454 			if not os.path.exists(dir):
	# Human-readable byte count, e.g. '1.21M' (suffixes b,k,M,G,...).
458 	def format_bytes(bytes):
461 		if type(bytes) is str:
466 			exponent = long(math.log(bytes, 1024.0))
467 		suffix = 'bkMGTPEZY'[exponent]
468 		converted = float(bytes) / float(1024**exponent)
469 		return '%.2f%s' % (converted, suffix)
	# Right-aligned percentage string for the progress bar.
472 	def calc_percent(byte_counter, data_len):
475 		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
	# ETA as 'MM:SS' based on the observed average rate; the guard below
	# avoids division by (near-)zero elapsed time.
478 	def calc_eta(start, now, total, current):
482 		if current == 0 or dif < 0.001: # One millisecond
484 		rate = float(current) / dif
485 		eta = long((float(total) - float(current)) / rate)
486 		(eta_mins, eta_secs) = divmod(eta, 60)
489 		return '%02d:%02d' % (eta_mins, eta_secs)
	# Download speed as a right-aligned '<bytes>/s' string.
492 	def calc_speed(start, now, bytes):
494 		if bytes == 0 or dif < 0.001: # One millisecond
495 			return '%10s' % '---b/s'
496 		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
	# Adaptive read size: keep each read roughly in a fixed time window,
	# clamped to [bytes/2, min(bytes*2, 4MB)].
499 	def best_block_size(elapsed_time, bytes):
500 		new_min = max(bytes / 2.0, 1.0)
501 		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
502 		if elapsed_time < 0.001:
504 		rate = bytes / elapsed_time
	# Inverse of format_bytes: '50k' -> 51200L, etc.
512 	def parse_bytes(bytestr):
513 		"""Parse a string indicating a byte quantity into a long integer."""
514 		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
517 		number = float(matchobj.group(1))
		# Empty suffix group indexes 'b' (exponent 0) via .lower() == ''?
		# NOTE(review): ''.index would raise; the elided lines presumably
		# normalize an empty suffix — confirm against the full source.
518 		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
519 		return long(round(number * multiplier))
	# Mutual registration: the IE learns about its downloader here.
521 	def add_info_extractor(self, ie):
522 		"""Add an InfoExtractor object to the end of the list."""
524 		ie.set_downloader(self)
526 	def add_post_processor(self, pp):
527 		"""Add a PostProcessor object to the end of the chain."""
529 		pp.set_downloader(self)
	# All console output funnels through here so 'quiet'/'logtostderr'
	# are honored consistently; encoding errors optionally suppressed.
531 	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
532 		"""Print message to stdout if not in quiet mode."""
534 		if not self.params.get('quiet', False):
535 			terminator = [u'\n', u''][skip_eol]
536 			print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
537 			self._screen_file.flush()
538 		except (UnicodeEncodeError), err:
539 			if not ignore_encoding_errors:
542 	def to_stderr(self, message):
543 		"""Print message to stderr."""
544 		print >>sys.stderr, message.encode(preferredencoding())
	# Progress-in-titlebar support: Windows console API on NT, xterm
	# escape sequence elsewhere.
546 	def to_cons_title(self, message):
547 		"""Set console/terminal window title to message."""
548 		if not self.params.get('consoletitle', False):
550 		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
551 			# c_wchar_p() might not be necessary if `message` is
552 			# already of type unicode()
553 			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
554 		elif 'TERM' in os.environ:
555 			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
	# True when outtmpl contains no %(...)s placeholder, i.e. every
	# download would hit the same file (see SameFileError).
557 	def fixed_template(self):
558 		"""Checks if the output template is fixed."""
559 		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
561 	def trouble(self, message=None):
562 		"""Determine action to take when a download problem appears.
564 		Depending on if the downloader has been configured to ignore
565 		download errors or not, this method may throw an exception or
566 		not when errors are found, after printing the message.
568 		if message is not None:
569 			self.to_stderr(message)
570 		if not self.params.get('ignoreerrors', False):
571 			raise DownloadError(message)
		# ignoreerrors: remember the failure in the exit code and carry on.
572 		self._download_retcode = 1
	# Throttling: sleep long enough that the average speed since
	# start_time drops back to the configured ratelimit.
574 	def slow_down(self, start_time, byte_counter):
575 		"""Sleep if the download speed is over the rate limit."""
576 		rate_limit = self.params.get('ratelimit', None)
577 		if rate_limit is None or byte_counter == 0:
580 		elapsed = now - start_time
583 		speed = float(byte_counter) / elapsed
584 		if speed > rate_limit:
585 			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
	# '.part' naming for in-progress downloads; disabled for stdout ('-'),
	# with nopart, or when `filename` exists but is not a regular file.
587 	def temp_name(self, filename):
588 		"""Returns a temporary filename for the given filename."""
589 		if self.params.get('nopart', False) or filename == u'-' or \
590 				(os.path.exists(filename) and not os.path.isfile(filename)):
592 		return filename + u'.part'
594 	def undo_temp_name(self, filename):
595 		if filename.endswith(u'.part'):
596 			return filename[:-len(u'.part')]
	# Rename .part -> final name; failure is reported via trouble().
599 	def try_rename(self, old_filename, new_filename):
601 		if old_filename == new_filename:
603 			os.rename(old_filename, new_filename)
604 		except (IOError, OSError), err:
605 			self.trouble(u'ERROR: unable to rename file')
	# Best-effort mtime update from the HTTP Last-modified header
	# (used when the 'updatetime' option is on).
607 	def try_utime(self, filename, last_modified_hdr):
608 		"""Try to set the last-modified time of the given file."""
609 		if last_modified_hdr is None:
611 		if not os.path.isfile(filename):
613 		timestr = last_modified_hdr
616 		filetime = timeconvert(timestr)
620 			os.utime(filename,(time.time(), filetime))
	# --- user-facing status reporters; all route through to_screen() ---
624 	def report_writedescription(self, descfn):
625 		""" Report that the description file is being written """
626 		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
628 	def report_writeinfojson(self, infofn):
629 		""" Report that the metadata file has been written """
630 		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
632 	def report_destination(self, filename):
633 		"""Report destination filename."""
634 		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
636 	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
637 		"""Report download progress."""
638 		if self.params.get('noprogress', False):
640 		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
641 				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
642 		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
643 				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
645 	def report_resuming_byte(self, resume_len):
646 		"""Report attempt to resume at given byte."""
647 		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
649 	def report_retry(self, count, retries):
650 		"""Report retry in case of HTTP error 5xx"""
651 		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
653 	def report_file_already_downloaded(self, file_name):
654 		"""Report file has already been fully downloaded."""
656 			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		# Fall back to a generic message when the name cannot be encoded.
657 		except (UnicodeEncodeError), err:
658 			self.to_screen(u'[download] The file has already been downloaded')
660 	def report_unable_to_resume(self):
661 		"""Report it was impossible to resume download."""
662 		self.to_screen(u'[download] Unable to resume')
664 	def report_finish(self):
665 		"""Report download finished."""
666 		if self.params.get('noprogress', False):
667 			self.to_screen(u'[download] Download completed')
	# Ordinal used for the %(autonumber)s output-template field.
671 	def increment_downloads(self):
672 		"""Increment the ordinal that assigns a number to each file."""
673 		self._num_downloads += 1
	# Expand outtmpl with the info_dict plus the synthetic fields
	# 'epoch' and 'autonumber'.  Returns None on template errors
	# (presumably — the return lines are elided; TODO confirm).
675 	def prepare_filename(self, info_dict):
676 		"""Generate the output filename."""
678 			template_dict = dict(info_dict)
679 			template_dict['epoch'] = unicode(long(time.time()))
680 			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
681 			filename = self.params['outtmpl'] % template_dict
683 		except (ValueError, KeyError), err:
684 			self.trouble(u'ERROR: invalid system charset or erroneous output template')
	# Main per-video pipeline: forced printing / simulate short-circuit,
	# overwrite check, directory creation, optional description and
	# .info.json sidecar files, the actual download, post-processing.
687 	def process_info(self, info_dict):
688 		"""Process a single dictionary returned by an InfoExtractor."""
689 		filename = self.prepare_filename(info_dict)
690 		# Do nothing else if in simulate mode
691 		if self.params.get('simulate', False):
693 			if self.params.get('forcetitle', False):
694 				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
695 			if self.params.get('forceurl', False):
696 				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
697 			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
698 				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
699 			if self.params.get('forcedescription', False) and 'description' in info_dict:
700 				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
701 			if self.params.get('forcefilename', False) and filename is not None:
702 				print filename.encode(preferredencoding(), 'xmlcharrefreplace')
708 		if self.params.get('nooverwrites', False) and os.path.exists(filename):
709 			self.to_stderr(u'WARNING: file exists and will be skipped')
713 			self.pmkdir(filename)
714 		except (OSError, IOError), err:
715 			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
718 		if self.params.get('writedescription', False):
720 				descfn = filename + '.description'
721 				self.report_writedescription(descfn)
722 				descfile = open(descfn, 'wb')
724 					descfile.write(info_dict['description'].encode('utf-8'))
727 			except (OSError, IOError):
728 				self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
731 		if self.params.get('writeinfojson', False):
732 			infofn = filename + '.info.json'
733 			self.report_writeinfojson(infofn)
			# `json` may be the trivialjson fallback, which has no encoder.
736 			except (NameError,AttributeError):
737 				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
740 				infof = open(infofn, 'wb')
742 					json.dump(info_dict, infof)
745 			except (OSError, IOError):
746 				self.trouble(u'ERROR: Cannot write metadata to JSON file: %s' % str(infofn))
750 				success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
751 			except (OSError, IOError), err:
752 				raise UnavailableVideoError
753 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
754 				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
756 			except (ContentTooShortError, ), err:
757 				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
762 					self.post_process(filename, info_dict)
763 				except (PostProcessingError), err:
764 					self.trouble(u'ERROR: postprocessing: %s' % str(err))
	# Entry point: hand each URL to the first registered IE that claims
	# it; returns the accumulated exit code.
767 	def download(self, url_list):
768 		"""Download a given list of URLs."""
769 		if len(url_list) > 1 and self.fixed_template():
770 			raise SameFileError(self.params['outtmpl'])
773 			suitable_found = False
775 				# Go to next InfoExtractor if not suitable
776 				if not ie.suitable(url):
779 				# Suitable InfoExtractor found
780 				suitable_found = True
782 				# Extract information from URL and process it
785 				# Suitable InfoExtractor had been found; go to next URL
788 			if not suitable_found:
789 				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
791 		return self._download_retcode
793 	def post_process(self, filename, ie_info):
794 		"""Run the postprocessing chain on the given file."""
796 		info['filepath'] = filename
	# Delegate RTMP URLs to the external `rtmpdump` binary, retrying
	# with its resume flags while it keeps making progress.
802 	def _download_with_rtmpdump(self, filename, url, player_url):
803 		self.report_destination(filename)
804 		tmpfilename = self.temp_name(filename)
806 		# Check for rtmpdump first
808 			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
809 		except (OSError, IOError):
810 			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
813 		# Download using rtmpdump. rtmpdump returns exit code 2 when
814 		# the connection was interrumpted and resuming appears to be
815 		# possible. This is part of rtmpdump's normal usage, AFAIK.
816 		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
817 		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
818 		while retval == 2 or retval == 1:
819 			prevsize = os.path.getsize(tmpfilename)
820 			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
821 			time.sleep(5.0) # This seems to be needed
822 			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
823 			cursize = os.path.getsize(tmpfilename)
			# No growth and exit code 1: treat as a final (failed) state.
824 			if prevsize == cursize and retval == 1:
827 			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
828 			self.try_rename(tmpfilename, filename)
831 			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
	# HTTP download with resume (Range header), 5xx retry loop, adaptive
	# block size, rate limiting, and .part-file handling.
834 	def _do_download(self, filename, url, player_url):
835 		# Check file already present
836 		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
837 			self.report_file_already_downloaded(filename)
840 		# Attempt to download using rtmpdump
841 		if url.startswith('rtmp'):
842 			return self._download_with_rtmpdump(filename, url, player_url)
844 		tmpfilename = self.temp_name(filename)
848 		# Do not include the Accept-Encoding header
849 		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request: identical but never gets the Range header; used
		# to probe the full length when a resume request is rejected.
850 		basic_request = urllib2.Request(url, None, headers)
851 		request = urllib2.Request(url, None, headers)
853 		# Establish possible resume length
854 		if os.path.isfile(tmpfilename):
855 			resume_len = os.path.getsize(tmpfilename)
859 		# Request parameters in case of being able to resume
860 		if self.params.get('continuedl', False) and resume_len != 0:
861 			self.report_resuming_byte(resume_len)
862 			request.add_header('Range','bytes=%d-' % resume_len)
866 		retries = self.params.get('retries', 0)
867 		while count <= retries:
868 			# Establish connection
870 				data = urllib2.urlopen(request)
872 			except (urllib2.HTTPError, ), err:
				# Retry only 5xx; 416 gets special resume handling below.
873 				if (err.code < 500 or err.code >= 600) and err.code != 416:
874 					# Unexpected HTTP error
876 				elif err.code == 416:
877 					# Unable to resume (requested range not satisfiable)
879 						# Open the connection again without the range header
880 						data = urllib2.urlopen(basic_request)
881 						content_length = data.info()['Content-Length']
882 					except (urllib2.HTTPError, ), err:
883 						if err.code < 500 or err.code >= 600:
886 						# Examine the reported length
887 						if (content_length is not None and
888 								(resume_len - 100 < long(content_length) < resume_len + 100)):
889 							# The file had already been fully downloaded.
890 							# Explanation to the above condition: in issue #175 it was revealed that
891 							# YouTube sometimes adds or removes a few bytes from the end of the file,
892 							# changing the file size slightly and causing problems for some users. So
893 							# I decided to implement a suggested change and consider the file
894 							# completely downloaded if the file size differs less than 100 bytes from
895 							# the one in the hard drive.
896 							self.report_file_already_downloaded(filename)
897 							self.try_rename(tmpfilename, filename)
900 							# The length does not match, we start the download over
901 							self.report_unable_to_resume()
907 				self.report_retry(count, retries)
910 			self.trouble(u'ERROR: giving up after %s retries' % retries)
		# Content-Length of the *remaining* range; add resume_len to get
		# the final expected file size.
913 		data_len = data.info().get('Content-length', None)
914 		if data_len is not None:
915 			data_len = long(data_len) + resume_len
916 		data_len_str = self.format_bytes(data_len)
917 		byte_counter = 0 + resume_len
		# --- read loop (header lines elided) ---
923 			data_block = data.read(block_size)
925 			if len(data_block) == 0:
927 			byte_counter += len(data_block)
929 			# Open file just in time
932 					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
933 					assert stream is not None
					# sanitize_open may have altered the name; recompute.
934 					filename = self.undo_temp_name(tmpfilename)
935 					self.report_destination(filename)
936 				except (OSError, IOError), err:
937 					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
940 				stream.write(data_block)
941 			except (IOError, OSError), err:
942 				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
944 			block_size = self.best_block_size(after - before, len(data_block))
			# Progress is computed relative to the resumed portion.
947 			percent_str = self.calc_percent(byte_counter, data_len)
948 			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
949 			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
950 			self.report_progress(percent_str, data_len_str, speed_str, eta_str)
953 			self.slow_down(start, byte_counter - resume_len)
956 			self.trouble(u'\nERROR: Did not get any data blocks')
960 		if data_len is not None and byte_counter != data_len:
961 			raise ContentTooShortError(byte_counter, long(data_len))
962 		self.try_rename(tmpfilename, filename)
964 		# Update file modification time
965 		if self.params.get('updatetime', True):
966 			self.try_utime(filename, data.info().get('last-modified', None))
# Abstract base for all site-specific extractors.  Subclasses override
# the _real_* hooks; initialize()/extract() are the public wrappers.
970 class InfoExtractor(object):
971 	"""Information Extractor class.
973 	Information extractors are the classes that, given a URL, extract
974 	information from the video (or videos) the URL refers to. This
975 	information includes the real video URL, the video title and simplified
976 	title, author and others. The information is stored in a dictionary
977 	which is then passed to the FileDownloader. The FileDownloader
978 	processes this information possibly downloading the video to the file
979 	system, among other possible outcomes. The dictionaries must include
980 	the following fields:
982 	id:		Video identifier.
983 	url:		Final video URL.
984 	uploader:	Nickname of the video uploader.
985 	title:		Literal title.
986 	stitle:		Simplified title.
987 	ext:		Video filename extension.
988 	format:		Video format.
989 	player_url:	SWF Player URL (may be None).
991 	The following fields are optional. Their primary purpose is to allow
992 	youtube-dl to serve as the backend for a video search function, such
993 	as the one in youtube2mp3. They are only used when their respective
994 	forced printing functions are called:
996 	thumbnail:	Full URL to a video thumbnail image.
997 	description:	One-line video description.
999 	Subclasses of this one should re-define the _real_initialize() and
1000 	_real_extract() methods, as well as the suitable() static method.
1001 	Probably, they should also be instantiated and added to the main
1008 	def __init__(self, downloader=None):
1009 		"""Constructor. Receives an optional downloader."""
1011 		self.set_downloader(downloader)
	# suitable() fragment (def line elided in this listing).
1015 		"""Receives a URL and returns True if suitable for this IE."""
	# Public wrapper around _real_initialize(); the elided lines
	# presumably guard against double initialization — TODO confirm.
1018 	def initialize(self):
1019 		"""Initializes an instance (authentication, etc)."""
1021 		self._real_initialize()
	# Public wrapper: ensures initialization, then delegates.
1024 	def extract(self, url):
1025 		"""Extracts URL information and returns it in list of dicts."""
1027 		return self._real_extract(url)
1029 	def set_downloader(self, downloader):
1030 		"""Sets the downloader for this IE."""
1031 		self._downloader = downloader
	# --- subclass hooks ---
1033 	def _real_initialize(self):
1034 		"""Real initialization process. Redefine in subclasses."""
1037 	def _real_extract(self, url):
1038 		"""Real extraction process. Redefine in subclasses."""
1041 class YoutubeIE(InfoExtractor):
1042 """Information extractor for youtube.com."""
1044 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1045 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1046 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1047 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1048 _NETRC_MACHINE = 'youtube'
1049 # Listed in order of quality
1050 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1051 _video_extensions = {
1057 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1064 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1066 def report_lang(self):
1067 """Report attempt to set language."""
1068 self._downloader.to_screen(u'[youtube] Setting language')
1070 def report_login(self):
1071 """Report attempt to log in."""
1072 self._downloader.to_screen(u'[youtube] Logging in')
1074 def report_age_confirmation(self):
1075 """Report attempt to confirm age."""
1076 self._downloader.to_screen(u'[youtube] Confirming age')
1078 def report_video_webpage_download(self, video_id):
1079 """Report attempt to download video webpage."""
1080 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1082 def report_video_info_webpage_download(self, video_id):
1083 """Report attempt to download video info webpage."""
1084 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1086 def report_information_extraction(self, video_id):
1087 """Report attempt to extract video information."""
1088 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1090 def report_unavailable_format(self, video_id, format):
1091 """Report extracted video URL."""
1092 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1094 def report_rtmp_download(self):
1095 """Indicate the download will use the RTMP protocol."""
1096 self._downloader.to_screen(u'[youtube] RTMP download detected')
1098 def _real_initialize(self):
1099 if self._downloader is None:
1104 downloader_params = self._downloader.params
1106 # Attempt to use provided username and password or .netrc data
1107 if downloader_params.get('username', None) is not None:
1108 username = downloader_params['username']
1109 password = downloader_params['password']
1110 elif downloader_params.get('usenetrc', False):
1112 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1113 if info is not None:
1117 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1118 except (IOError, netrc.NetrcParseError), err:
1119 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1123 request = urllib2.Request(self._LANG_URL)
1126 urllib2.urlopen(request).read()
1127 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1128 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1131 # No authentication to be performed
1132 if username is None:
1137 'current_form': 'loginForm',
1139 'action_login': 'Log In',
1140 'username': username,
1141 'password': password,
1143 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1146 login_results = urllib2.urlopen(request).read()
1147 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1148 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1150 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1151 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1157 'action_confirm': 'Confirm',
1159 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1161 self.report_age_confirmation()
1162 age_results = urllib2.urlopen(request).read()
1163 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1164 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1167 def _real_extract(self, url):
1168 # Extract video id from URL
1169 mobj = re.match(self._VALID_URL, url)
1171 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1173 video_id = mobj.group(2)
1176 self.report_video_webpage_download(video_id)
1177 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1179 video_webpage = urllib2.urlopen(request).read()
1180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1184 # Attempt to extract SWF player URL
1185 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1186 if mobj is not None:
1187 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1192 self.report_video_info_webpage_download(video_id)
1193 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1194 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1195 % (video_id, el_type))
1196 request = urllib2.Request(video_info_url)
1198 video_info_webpage = urllib2.urlopen(request).read()
1199 video_info = parse_qs(video_info_webpage)
1200 if 'token' in video_info:
1202 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1203 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1205 if 'token' not in video_info:
1206 if 'reason' in video_info:
1207 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1209 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1212 # Start extracting information
1213 self.report_information_extraction(video_id)
1216 if 'author' not in video_info:
1217 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1219 video_uploader = urllib.unquote_plus(video_info['author'][0])
1222 if 'title' not in video_info:
1223 self._downloader.trouble(u'ERROR: unable to extract video title')
1225 video_title = urllib.unquote_plus(video_info['title'][0])
1226 video_title = video_title.decode('utf-8')
1227 video_title = sanitize_title(video_title)
1230 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1231 simple_title = simple_title.strip(ur'_')
1234 if 'thumbnail_url' not in video_info:
1235 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1236 video_thumbnail = ''
1237 else: # don't panic if we can't find it
1238 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1242 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1243 if mobj is not None:
1244 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1245 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1246 for expression in format_expressions:
1248 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1256 video_description = u'No description available.'
1257 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1258 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1259 if mobj is not None:
1260 video_description = mobj.group(1).decode('utf-8')
1262 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1263 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1264 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1265 # TODO use another parser
1268 video_token = urllib.unquote_plus(video_info['token'][0])
1270 # Decide which formats to download
1271 req_format = self._downloader.params.get('format', None)
1273 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1274 self.report_rtmp_download()
1275 video_url_list = [(None, video_info['conn'][0])]
1276 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1277 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1278 url_data = [parse_qs(uds) for uds in url_data_strs]
1279 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1280 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1282 format_limit = self._downloader.params.get('format_limit', None)
1283 if format_limit is not None and format_limit in self._available_formats:
1284 format_list = self._available_formats[self._available_formats.index(format_limit):]
1286 format_list = self._available_formats
1287 existing_formats = [x for x in format_list if x in url_map]
1288 if len(existing_formats) == 0:
1289 self._downloader.trouble(u'ERROR: no known formats available for video')
1291 if req_format is None:
1292 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1293 elif req_format == '-1':
1294 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1297 if req_format not in url_map:
1298 self._downloader.trouble(u'ERROR: requested format not available')
1300 video_url_list = [(req_format, url_map[req_format])] # Specific format
1302 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1305 for format_param, video_real_url in video_url_list:
1306 # At this point we have a new video
1307 self._downloader.increment_downloads()
1310 video_extension = self._video_extensions.get(format_param, 'flv')
1313 # Process video information
1314 self._downloader.process_info({
1315 'id': video_id.decode('utf-8'),
1316 'url': video_real_url.decode('utf-8'),
1317 'uploader': video_uploader.decode('utf-8'),
1318 'upload_date': upload_date,
1319 'title': video_title,
1320 'stitle': simple_title,
1321 'ext': video_extension.decode('utf-8'),
1322 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1323 'thumbnail': video_thumbnail.decode('utf-8'),
1324 'description': video_description,
1325 'player_url': player_url,
1327 except UnavailableVideoError, err:
1328 self._downloader.trouble(u'\nERROR: unable to download video')
1331 class MetacafeIE(InfoExtractor):
1332 """Information Extractor for metacafe.com."""
1334 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1335 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1336 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
    # Delegate common setup to the InfoExtractor base class, then keep a
    # reference to the YoutubeIE instance: Metacafe ids prefixed with
    # "yt-" are handed over to it in _real_extract.
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
1345 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Announce retrieval of the Metacafe family-filter disclaimer page."""
    message = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Announce that the age-confirmation form is being submitted."""
    message = u'[metacafe] Confirming age'
    self._downloader.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the Metacafe watch page is being downloaded."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has begun for video_id."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1363 def _real_initialize(self):
1364 # Retrieve disclaimer
1365 request = urllib2.Request(self._DISCLAIMER)
1367 self.report_disclaimer()
1368 disclaimer = urllib2.urlopen(request).read()
1369 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1370 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1376 'submit': "Continue - I'm over 18",
1378 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1380 self.report_age_confirmation()
1381 disclaimer = urllib2.urlopen(request).read()
1382 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1383 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1386 def _real_extract(self, url):
1387 # Extract id and simplified title from URL
1388 mobj = re.match(self._VALID_URL, url)
1390 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1393 video_id = mobj.group(1)
1395 # Check if video comes from YouTube
1396 mobj2 = re.match(r'^yt-(.*)$', video_id)
1397 if mobj2 is not None:
1398 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1401 # At this point we have a new video
1402 self._downloader.increment_downloads()
1404 simple_title = mobj.group(2).decode('utf-8')
1406 # Retrieve video webpage to extract further information
1407 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1409 self.report_download_webpage(video_id)
1410 webpage = urllib2.urlopen(request).read()
1411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1415 # Extract URL, uploader and title from webpage
1416 self.report_extraction(video_id)
1417 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1418 if mobj is not None:
1419 mediaURL = urllib.unquote(mobj.group(1))
1420 video_extension = mediaURL[-3:]
1422 # Extract gdaKey if available
1423 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1425 video_url = mediaURL
1427 gdaKey = mobj.group(1)
1428 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1430 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1432 self._downloader.trouble(u'ERROR: unable to extract media URL')
1434 vardict = parse_qs(mobj.group(1))
1435 if 'mediaData' not in vardict:
1436 self._downloader.trouble(u'ERROR: unable to extract media URL')
1438 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1440 self._downloader.trouble(u'ERROR: unable to extract media URL')
1442 mediaURL = mobj.group(1).replace('\\/', '/')
1443 video_extension = mediaURL[-3:]
1444 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1446 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1448 self._downloader.trouble(u'ERROR: unable to extract title')
1450 video_title = mobj.group(1).decode('utf-8')
1451 video_title = sanitize_title(video_title)
1453 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1455 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1457 video_uploader = mobj.group(1)
1460 # Process video information
1461 self._downloader.process_info({
1462 'id': video_id.decode('utf-8'),
1463 'url': video_url.decode('utf-8'),
1464 'uploader': video_uploader.decode('utf-8'),
1465 'upload_date': u'NA',
1466 'title': video_title,
1467 'stitle': simple_title,
1468 'ext': video_extension.decode('utf-8'),
1472 except UnavailableVideoError:
1473 self._downloader.trouble(u'\nERROR: unable to download video')
1476 class DailymotionIE(InfoExtractor):
1477 """Information Extractor for Dailymotion"""
1479 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
def __init__(self, downloader=None):
    # No extractor-specific state: all setup lives in the base class.
    InfoExtractor.__init__(self, downloader)
1486 return (re.match(DailymotionIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the Dailymotion video page is being downloaded."""
    message = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has begun for video_id."""
    message = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1496 def _real_initialize(self):
1499 def _real_extract(self, url):
1500 # Extract id and simplified title from URL
1501 mobj = re.match(self._VALID_URL, url)
1503 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1506 # At this point we have a new video
1507 self._downloader.increment_downloads()
1508 video_id = mobj.group(1)
1510 simple_title = mobj.group(2).decode('utf-8')
1511 video_extension = 'flv'
1513 # Retrieve video webpage to extract further information
1514 request = urllib2.Request(url)
1516 self.report_download_webpage(video_id)
1517 webpage = urllib2.urlopen(request).read()
1518 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1519 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1522 # Extract URL, uploader and title from webpage
1523 self.report_extraction(video_id)
1524 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1526 self._downloader.trouble(u'ERROR: unable to extract media URL')
1528 mediaURL = urllib.unquote(mobj.group(1))
1530 # if needed add http://www.dailymotion.com/ if relative URL
1532 video_url = mediaURL
1534 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1535 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1537 self._downloader.trouble(u'ERROR: unable to extract title')
1539 video_title = mobj.group(1).decode('utf-8')
1540 video_title = sanitize_title(video_title)
1542 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1544 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1546 video_uploader = mobj.group(1)
1549 # Process video information
1550 self._downloader.process_info({
1551 'id': video_id.decode('utf-8'),
1552 'url': video_url.decode('utf-8'),
1553 'uploader': video_uploader.decode('utf-8'),
1554 'upload_date': u'NA',
1555 'title': video_title,
1556 'stitle': simple_title,
1557 'ext': video_extension.decode('utf-8'),
1561 except UnavailableVideoError:
1562 self._downloader.trouble(u'\nERROR: unable to download video')
1564 class GoogleIE(InfoExtractor):
1565 """Information extractor for video.google.com."""
1567 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
    # No extractor-specific state: all setup lives in the base class.
    InfoExtractor.__init__(self, downloader)
1574 return (re.match(GoogleIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the Google Video page is being downloaded."""
    message = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has begun for video_id."""
    message = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1584 def _real_initialize(self):
1587 def _real_extract(self, url):
1588 # Extract id from URL
1589 mobj = re.match(self._VALID_URL, url)
1591 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1594 # At this point we have a new video
1595 self._downloader.increment_downloads()
1596 video_id = mobj.group(1)
1598 video_extension = 'mp4'
1600 # Retrieve video webpage to extract further information
1601 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1603 self.report_download_webpage(video_id)
1604 webpage = urllib2.urlopen(request).read()
1605 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1606 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1609 # Extract URL, uploader, and title from webpage
1610 self.report_extraction(video_id)
1611 mobj = re.search(r"download_url:'([^']+)'", webpage)
1613 video_extension = 'flv'
1614 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1616 self._downloader.trouble(u'ERROR: unable to extract media URL')
1618 mediaURL = urllib.unquote(mobj.group(1))
1619 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1620 mediaURL = mediaURL.replace('\\x26', '\x26')
1622 video_url = mediaURL
1624 mobj = re.search(r'<title>(.*)</title>', webpage)
1626 self._downloader.trouble(u'ERROR: unable to extract title')
1628 video_title = mobj.group(1).decode('utf-8')
1629 video_title = sanitize_title(video_title)
1630 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1632 # Extract video description
1633 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1635 self._downloader.trouble(u'ERROR: unable to extract video description')
1637 video_description = mobj.group(1).decode('utf-8')
1638 if not video_description:
1639 video_description = 'No description available.'
1641 # Extract video thumbnail
1642 if self._downloader.params.get('forcethumbnail', False):
1643 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1645 webpage = urllib2.urlopen(request).read()
1646 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1647 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1649 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1651 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1653 video_thumbnail = mobj.group(1)
1654 else: # we need something to pass to process_info
1655 video_thumbnail = ''
1659 # Process video information
1660 self._downloader.process_info({
1661 'id': video_id.decode('utf-8'),
1662 'url': video_url.decode('utf-8'),
1664 'upload_date': u'NA',
1665 'title': video_title,
1666 'stitle': simple_title,
1667 'ext': video_extension.decode('utf-8'),
1671 except UnavailableVideoError:
1672 self._downloader.trouble(u'\nERROR: unable to download video')
1675 class PhotobucketIE(InfoExtractor):
1676 """Information extractor for photobucket.com."""
1678 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
def __init__(self, downloader=None):
    # No extractor-specific state: all setup lives in the base class.
    InfoExtractor.__init__(self, downloader)
1685 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the Photobucket page is being downloaded."""
    message = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has begun for video_id."""
    message = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1695 def _real_initialize(self):
1698 def _real_extract(self, url):
1699 # Extract id from URL
1700 mobj = re.match(self._VALID_URL, url)
1702 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1705 # At this point we have a new video
1706 self._downloader.increment_downloads()
1707 video_id = mobj.group(1)
1709 video_extension = 'flv'
1711 # Retrieve video webpage to extract further information
1712 request = urllib2.Request(url)
1714 self.report_download_webpage(video_id)
1715 webpage = urllib2.urlopen(request).read()
1716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1720 # Extract URL, uploader, and title from webpage
1721 self.report_extraction(video_id)
1722 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1724 self._downloader.trouble(u'ERROR: unable to extract media URL')
1726 mediaURL = urllib.unquote(mobj.group(1))
1728 video_url = mediaURL
1730 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1732 self._downloader.trouble(u'ERROR: unable to extract title')
1734 video_title = mobj.group(1).decode('utf-8')
1735 video_title = sanitize_title(video_title)
1736 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1738 video_uploader = mobj.group(2).decode('utf-8')
1741 # Process video information
1742 self._downloader.process_info({
1743 'id': video_id.decode('utf-8'),
1744 'url': video_url.decode('utf-8'),
1745 'uploader': video_uploader,
1746 'upload_date': u'NA',
1747 'title': video_title,
1748 'stitle': simple_title,
1749 'ext': video_extension.decode('utf-8'),
1753 except UnavailableVideoError:
1754 self._downloader.trouble(u'\nERROR: unable to download video')
1757 class YahooIE(InfoExtractor):
1758 """Information extractor for video.yahoo.com."""
1760 # _VALID_URL matches all Yahoo! Video URLs
1761 # _VPAGE_URL matches only the extractable '/watch/' URLs
1762 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1763 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
def __init__(self, downloader=None):
    # No extractor-specific state: all setup lives in the base class.
    InfoExtractor.__init__(self, downloader)
1770 return (re.match(YahooIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the Yahoo! Video page is being downloaded."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has begun for video_id."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1780 def _real_initialize(self):
1783 def _real_extract(self, url, new_video=True):
1784 # Extract ID from URL
1785 mobj = re.match(self._VALID_URL, url)
1787 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1790 # At this point we have a new video
1791 self._downloader.increment_downloads()
1792 video_id = mobj.group(2)
1793 video_extension = 'flv'
1795 # Rewrite valid but non-extractable URLs as
1796 # extractable English language /watch/ URLs
1797 if re.match(self._VPAGE_URL, url) is None:
1798 request = urllib2.Request(url)
1800 webpage = urllib2.urlopen(request).read()
1801 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1802 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1805 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1807 self._downloader.trouble(u'ERROR: Unable to extract id field')
1809 yahoo_id = mobj.group(1)
1811 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1813 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1815 yahoo_vid = mobj.group(1)
1817 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1818 return self._real_extract(url, new_video=False)
1820 # Retrieve video webpage to extract further information
1821 request = urllib2.Request(url)
1823 self.report_download_webpage(video_id)
1824 webpage = urllib2.urlopen(request).read()
1825 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1826 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1829 # Extract uploader and title from webpage
1830 self.report_extraction(video_id)
1831 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1833 self._downloader.trouble(u'ERROR: unable to extract video title')
1835 video_title = mobj.group(1).decode('utf-8')
1836 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1838 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1840 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1842 video_uploader = mobj.group(1).decode('utf-8')
1844 # Extract video thumbnail
1845 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1847 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1849 video_thumbnail = mobj.group(1).decode('utf-8')
1851 # Extract video description
1852 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1854 self._downloader.trouble(u'ERROR: unable to extract video description')
1856 video_description = mobj.group(1).decode('utf-8')
1857 if not video_description: video_description = 'No description available.'
1859 # Extract video height and width
1860 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1862 self._downloader.trouble(u'ERROR: unable to extract video height')
1864 yv_video_height = mobj.group(1)
1866 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1868 self._downloader.trouble(u'ERROR: unable to extract video width')
1870 yv_video_width = mobj.group(1)
1872 # Retrieve video playlist to extract media URL
1873 # I'm not completely sure what all these options are, but we
1874 # seem to need most of them, otherwise the server sends a 401.
1875 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1876 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1877 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1878 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1879 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1881 self.report_download_webpage(video_id)
1882 webpage = urllib2.urlopen(request).read()
1883 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1884 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1887 # Extract media URL from playlist XML
1888 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1890 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1892 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1893 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1896 # Process video information
1897 self._downloader.process_info({
1898 'id': video_id.decode('utf-8'),
1900 'uploader': video_uploader,
1901 'upload_date': u'NA',
1902 'title': video_title,
1903 'stitle': simple_title,
1904 'ext': video_extension.decode('utf-8'),
1905 'thumbnail': video_thumbnail.decode('utf-8'),
1906 'description': video_description,
1907 'thumbnail': video_thumbnail,
1908 'description': video_description,
1911 except UnavailableVideoError:
1912 self._downloader.trouble(u'\nERROR: unable to download video')
1915 class VimeoIE(InfoExtractor):
1916 """Information extractor for vimeo.com."""
1918 # _VALID_URL matches Vimeo URLs
1919 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
def __init__(self, downloader=None):
    # No extractor-specific state: all setup lives in the base class.
    InfoExtractor.__init__(self, downloader)
1926 return (re.match(VimeoIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the Vimeo clip data is being downloaded."""
    message = u'[vimeo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction has begun for video_id."""
    message = u'[vimeo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1936 def _real_initialize(self):
1939 def _real_extract(self, url, new_video=True):
1940 # Extract ID from URL
1941 mobj = re.match(self._VALID_URL, url)
1943 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1946 # At this point we have a new video
1947 self._downloader.increment_downloads()
1948 video_id = mobj.group(1)
1950 # Retrieve video webpage to extract further information
1951 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1953 self.report_download_webpage(video_id)
1954 webpage = urllib2.urlopen(request).read()
1955 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1956 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1959 # Now we begin extracting as much information as we can from what we
1960 # retrieved. First we extract the information common to all extractors,
1961 # and latter we extract those that are Vimeo specific.
1962 self.report_extraction(video_id)
1965 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1967 self._downloader.trouble(u'ERROR: unable to extract video title')
1969 video_title = mobj.group(1).decode('utf-8')
1970 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1973 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1975 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1977 video_uploader = mobj.group(1).decode('utf-8')
1979 # Extract video thumbnail
1980 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1982 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1984 video_thumbnail = mobj.group(1).decode('utf-8')
1986 # # Extract video description
1987 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1989 # self._downloader.trouble(u'ERROR: unable to extract video description')
1991 # video_description = mobj.group(1).decode('utf-8')
1992 # if not video_description: video_description = 'No description available.'
1993 video_description = 'Foo.'
1995 # Vimeo specific: extract request signature
1996 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1998 self._downloader.trouble(u'ERROR: unable to extract request signature')
2000 sig = mobj.group(1).decode('utf-8')
2002 # Vimeo specific: Extract request signature expiration
2003 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2005 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2007 sig_exp = mobj.group(1).decode('utf-8')
2009 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2012 # Process video information
2013 self._downloader.process_info({
2014 'id': video_id.decode('utf-8'),
2016 'uploader': video_uploader,
2017 'upload_date': u'NA',
2018 'title': video_title,
2019 'stitle': simple_title,
2021 'thumbnail': video_thumbnail.decode('utf-8'),
2022 'description': video_description,
2023 'thumbnail': video_thumbnail,
2024 'description': video_description,
2027 except UnavailableVideoError:
2028 self._downloader.trouble(u'ERROR: unable to download video')
2031 class GenericIE(InfoExtractor):
2032 """Generic last-resort information extractor."""
def __init__(self, downloader=None):
    # No extractor-specific state: all setup lives in the base class.
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Warn that the generic fallback is in use, then announce the download."""
    # The warning is unconditional: reaching this extractor means no
    # site-specific extractor claimed the URL.
    self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
    self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Announce that metadata extraction has begun for video_id."""
    message = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
2050 def _real_initialize(self):
2053 def _real_extract(self, url):
2054 # At this point we have a new video
2055 self._downloader.increment_downloads()
2057 video_id = url.split('/')[-1]
2058 request = urllib2.Request(url)
2060 self.report_download_webpage(video_id)
2061 webpage = urllib2.urlopen(request).read()
2062 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2063 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2065 except ValueError, err:
2066 # since this is the last-resort InfoExtractor, if
2067 # this error is thrown, it'll be thrown here
2068 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2071 self.report_extraction(video_id)
2072 # Start with something easy: JW Player in SWFObject
2073 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2075 # Broaden the search a little bit
2076 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2078 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2081 # It's possible that one of the regexes
2082 # matched, but returned an empty group:
2083 if mobj.group(1) is None:
2084 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2087 video_url = urllib.unquote(mobj.group(1))
2088 video_id = os.path.basename(video_url)
2090 # here's a fun little line of code for you:
2091 video_extension = os.path.splitext(video_id)[1][1:]
2092 video_id = os.path.splitext(video_id)[0]
2094 # it's tempting to parse this further, but you would
2095 # have to take into account all the variations like
2096 # Video Title - Site Name
2097 # Site Name | Video Title
2098 # Video Title - Tagline | Site Name
2099 # and so on and so forth; it's just not practical
2100 mobj = re.search(r'<title>(.*)</title>', webpage)
2102 self._downloader.trouble(u'ERROR: unable to extract title')
2104 video_title = mobj.group(1).decode('utf-8')
2105 video_title = sanitize_title(video_title)
2106 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2108 # video uploader is domain name
2109 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2111 self._downloader.trouble(u'ERROR: unable to extract title')
2113 video_uploader = mobj.group(1).decode('utf-8')
2116 # Process video information
2117 self._downloader.process_info({
2118 'id': video_id.decode('utf-8'),
2119 'url': video_url.decode('utf-8'),
2120 'uploader': video_uploader,
2121 'upload_date': u'NA',
2122 'title': video_title,
2123 'stitle': simple_title,
2124 'ext': video_extension.decode('utf-8'),
2128 except UnavailableVideoError, err:
2129 self._downloader.trouble(u'\nERROR: unable to download video')
# YoutubeSearchIE: handles "ytsearch[N|all]:<query>" pseudo-URLs. Scrapes
# YouTube's HTML results pages and delegates each found id to the real
# YoutubeIE (self._youtube_ie).
# NOTE(review): partial line-numbered dump — some control-flow lines are
# missing from view; comments describe only the visible statements.
2132 class YoutubeSearchIE(InfoExtractor):
2133 """Information Extractor for YouTube search queries."""
2134 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2135 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2136 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2137 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on how many results one query may request.
2139 _max_youtube_results = 1000
2141 def __init__(self, youtube_ie, downloader=None):
2142 InfoExtractor.__init__(self, downloader)
2143 self._youtube_ie = youtube_ie
2147 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2149 def report_download_page(self, query, pagenum):
2150 """Report attempt to download playlist page with given number."""
2151 query = query.decode(preferredencoding())
2152 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2154 def _real_initialize(self):
2155 self._youtube_ie.initialize()
2157 def _real_extract(self, query):
2158 mobj = re.match(self._VALID_QUERY, query)
2160 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is '' (default 1 result), 'all', or a decimal count.
2163 prefix, query = query.split(':')
2165 query = query.encode('utf-8')
2167 self._download_n_results(query, 1)
2169 elif prefix == 'all':
2170 self._download_n_results(query, self._max_youtube_results)
2176 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2178 elif n > self._max_youtube_results:
# Clamp over-large requests to the cap, with a warning.
2179 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2180 n = self._max_youtube_results
2181 self._download_n_results(query, n)
2183 except ValueError: # parsing prefix as integer fails
2184 self._download_n_results(query, 1)
2187 def _download_n_results(self, query, n):
2188 """Downloads a specified number of results for a query"""
# Dedup set: result pages may repeat ids across pagination.
2191 already_seen = set()
2195 self.report_download_page(query, pagenum)
2196 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2197 request = urllib2.Request(result_url)
2199 page = urllib2.urlopen(request).read()
2200 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2201 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2204 # Extract video identifiers
2205 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href text and pull out the v= value (drop closing quote).
2206 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2207 if video_id not in already_seen:
2208 video_ids.append(video_id)
2209 already_seen.add(video_id)
2210 if len(video_ids) == n:
2211 # Specified n videos reached
2212 for id in video_ids:
2213 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: last results page — extract what we collected.
2216 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2217 for id in video_ids:
2218 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2221 pagenum = pagenum + 1
# GoogleSearchIE: handles "gvsearch[N|all]:<query>" pseudo-URLs. Same
# pagination/clamping structure as YoutubeSearchIE, delegating each found
# docid to self._google_ie.
# NOTE(review): partial line-numbered dump — some control-flow lines are
# missing from view; comments describe only the visible statements.
2223 class GoogleSearchIE(InfoExtractor):
2224 """Information Extractor for Google Video search queries."""
2225 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2226 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2227 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2228 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2230 _max_google_results = 1000
2232 def __init__(self, google_ie, downloader=None):
2233 InfoExtractor.__init__(self, downloader)
2234 self._google_ie = google_ie
2238 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2240 def report_download_page(self, query, pagenum):
2241 """Report attempt to download playlist page with given number."""
2242 query = query.decode(preferredencoding())
2243 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2245 def _real_initialize(self):
2246 self._google_ie.initialize()
2248 def _real_extract(self, query):
2249 mobj = re.match(self._VALID_QUERY, query)
2251 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is '' (default 1 result), 'all', or a decimal count.
2254 prefix, query = query.split(':')
2256 query = query.encode('utf-8')
2258 self._download_n_results(query, 1)
2260 elif prefix == 'all':
2261 self._download_n_results(query, self._max_google_results)
2267 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2269 elif n > self._max_google_results:
2270 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2271 n = self._max_google_results
2272 self._download_n_results(query, n)
2274 except ValueError: # parsing prefix as integer fails
2275 self._download_n_results(query, 1)
2278 def _download_n_results(self, query, n):
2279 """Downloads a specified number of results for a query"""
2282 already_seen = set()
2286 self.report_download_page(query, pagenum)
2287 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2288 request = urllib2.Request(result_url)
2290 page = urllib2.urlopen(request).read()
2291 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2292 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2295 # Extract video identifiers
2296 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Here the regex captures the docid directly (unlike the YouTube slicing).
2297 video_id = mobj.group(1)
2298 if video_id not in already_seen:
2299 video_ids.append(video_id)
2300 already_seen.add(video_id)
2301 if len(video_ids) == n:
2302 # Specified n videos reached
2303 for id in video_ids:
2304 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker: final results page — flush collected ids.
2307 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2308 for id in video_ids:
2309 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2312 pagenum = pagenum + 1
# YahooSearchIE: handles "yvsearch[N|all]:<query>" pseudo-URLs. Third copy
# of the search-IE pattern (see YoutubeSearchIE/GoogleSearchIE), delegating
# found ids to self._yahoo_ie.
# NOTE(review): partial line-numbered dump — some control-flow lines are
# missing from view; comments describe only the visible statements.
2314 class YahooSearchIE(InfoExtractor):
2315 """Information Extractor for Yahoo! Video search queries."""
2316 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2317 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2318 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2319 _MORE_PAGES_INDICATOR = r'\s*Next'
2321 _max_yahoo_results = 1000
2323 def __init__(self, yahoo_ie, downloader=None):
2324 InfoExtractor.__init__(self, downloader)
2325 self._yahoo_ie = yahoo_ie
2329 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2331 def report_download_page(self, query, pagenum):
2332 """Report attempt to download playlist page with given number."""
2333 query = query.decode(preferredencoding())
2334 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2336 def _real_initialize(self):
2337 self._yahoo_ie.initialize()
2339 def _real_extract(self, query):
2340 mobj = re.match(self._VALID_QUERY, query)
2342 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is '' (default 1 result), 'all', or a decimal count.
2345 prefix, query = query.split(':')
2347 query = query.encode('utf-8')
2349 self._download_n_results(query, 1)
2351 elif prefix == 'all':
2352 self._download_n_results(query, self._max_yahoo_results)
2358 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2360 elif n > self._max_yahoo_results:
2361 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2362 n = self._max_yahoo_results
2363 self._download_n_results(query, n)
2365 except ValueError: # parsing prefix as integer fails
2366 self._download_n_results(query, 1)
2369 def _download_n_results(self, query, n):
2370 """Downloads a specified number of results for a query"""
2373 already_seen = set()
2377 self.report_download_page(query, pagenum)
2378 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2379 request = urllib2.Request(result_url)
2381 page = urllib2.urlopen(request).read()
2382 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2383 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2386 # Extract video identifiers
2387 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Yahoo ids are two-part "NNN/NNN" path fragments captured by the regex.
2388 video_id = mobj.group(1)
2389 if video_id not in already_seen:
2390 video_ids.append(video_id)
2391 already_seen.add(video_id)
2392 if len(video_ids) == n:
2393 # Specified n videos reached
2394 for id in video_ids:
2395 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2398 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2399 for id in video_ids:
2400 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2403 pagenum = pagenum + 1
# YoutubePlaylistIE: expands a YouTube playlist/artist/user-page URL into
# individual watch URLs and hands each to self._youtube_ie.
# NOTE(review): partial line-numbered dump — some control-flow lines are
# missing from view; comments describe only the visible statements.
2405 class YoutubePlaylistIE(InfoExtractor):
2406 """Information Extractor for YouTube playlists."""
# Group 1: list-type prefix (p/a/list); group 2: playlist id;
# group 3 (optional): a single video id inside the playlist URL.
2408 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2409 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2410 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2411 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2414 def __init__(self, youtube_ie, downloader=None):
2415 InfoExtractor.__init__(self, downloader)
2416 self._youtube_ie = youtube_ie
2420 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2422 def report_download_page(self, playlist_id, pagenum):
2423 """Report attempt to download playlist page with given number."""
2424 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2426 def _real_initialize(self):
2427 self._youtube_ie.initialize()
2429 def _real_extract(self, url):
2430 # Extract playlist id
2431 mobj = re.match(self._VALID_URL, url)
2433 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video case: the URL names one video within the playlist.
2437 if mobj.group(3) is not None:
2438 self._youtube_ie.extract(mobj.group(3))
2441 # Download playlist pages
2442 # prefix is 'p' as default for playlists but there are other types that need extra care
2443 playlist_prefix = mobj.group(1)
2444 if playlist_prefix == 'a':
2445 playlist_access = 'artist'
2447 playlist_prefix = 'p'
2448 playlist_access = 'view_play_list'
2449 playlist_id = mobj.group(2)
2454 self.report_download_page(playlist_id, pagenum)
2455 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2457 page = urllib2.urlopen(request).read()
2458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2459 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2462 # Extract video identifiers
# Per-page dedup (playlist pages repeat each id in multiple links).
2464 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2465 if mobj.group(1) not in ids_in_page:
2466 ids_in_page.append(mobj.group(1))
2467 video_ids.extend(ids_in_page)
2469 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2471 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end slicing (1-based options).
2473 playliststart = self._downloader.params.get('playliststart', 1) - 1
2474 playlistend = self._downloader.params.get('playlistend', -1)
2475 video_ids = video_ids[playliststart:playlistend]
2477 for id in video_ids:
2478 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# YoutubeUserIE: lists all uploads of a YouTube user via the GData API
# (paged, _GDATA_PAGE_SIZE per request) and hands each video id to
# self._youtube_ie.
# NOTE(review): partial line-numbered dump — some control-flow lines are
# missing from view; comments describe only the visible statements.
2481 class YoutubeUserIE(InfoExtractor):
2482 """Information Extractor for YouTube users."""
2484 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2485 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; we page with start-index.
2486 _GDATA_PAGE_SIZE = 50
2487 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2488 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2491 def __init__(self, youtube_ie, downloader=None):
2492 InfoExtractor.__init__(self, downloader)
2493 self._youtube_ie = youtube_ie
2497 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2499 def report_download_page(self, username, start_index):
2500 """Report attempt to download user page."""
2501 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2502 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2504 def _real_initialize(self):
2505 self._youtube_ie.initialize()
2507 def _real_extract(self, url):
2509 mobj = re.match(self._VALID_URL, url)
2511 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2514 username = mobj.group(1)
2516 # Download video ids using YouTube Data API. Result size per
2517 # query is limited (currently to 50 videos) so we need to query
2518 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2525 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2526 self.report_download_page(username, start_index)
2528 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2531 page = urllib2.urlopen(request).read()
2532 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2533 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2536 # Extract video identifiers
2539 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2540 if mobj.group(1) not in ids_in_page:
2541 ids_in_page.append(mobj.group(1))
2543 video_ids.extend(ids_in_page)
2545 # A little optimization - if current page is not
2546 # "full", ie. does not contain PAGE_SIZE video ids then
2547 # we can assume that this page is the last one - there
2548 # are no more ids on further pages - no need to query
2551 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2556 all_ids_count = len(video_ids)
# Apply --playlist-start/--playlist-end; -1 means "no end limit".
2557 playliststart = self._downloader.params.get('playliststart', 1) - 1
2558 playlistend = self._downloader.params.get('playlistend', -1)
2560 if playlistend == -1:
2561 video_ids = video_ids[playliststart:]
2563 video_ids = video_ids[playliststart:playlistend]
2565 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2566 (username, all_ids_count, len(video_ids)))
2568 for video_id in video_ids:
2569 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# DepositFilesIE: extractor for depositfiles.com file pages. Simulates
# pressing the "Free download" button, then scrapes the real fileshare URL
# and the title out of the returned page.
# NOTE(review): partial line-numbered dump — some control-flow lines are
# missing from view; comments describe only the visible statements.
2572 class DepositFilesIE(InfoExtractor):
2573 """Information extractor for depositfiles.com"""
2575 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2577 def __init__(self, downloader=None):
2578 InfoExtractor.__init__(self, downloader)
2582 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2584 def report_download_webpage(self, file_id):
2585 """Report webpage download."""
2586 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2588 def report_extraction(self, file_id):
2589 """Report information extraction."""
2590 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2592 def _real_initialize(self):
2595 def _real_extract(self, url):
2596 # At this point we have a new file
2597 self._downloader.increment_downloads()
2599 file_id = url.split('/')[-1]
2600 # Rebuild url in english locale
2601 url = 'http://depositfiles.com/en/files/' + file_id
2603 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the "Free download" form submit.
2604 free_download_indication = { 'gateway_result' : '1' }
2605 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2607 self.report_download_webpage(file_id)
2608 webpage = urllib2.urlopen(request).read()
2609 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2610 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2613 # Search for the real file URL
2614 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2615 if (mobj is None) or (mobj.group(1) is None):
2616 # Try to figure out reason of the error.
# The site renders restriction notices in a <strong>Attention...</strong>.
2617 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2618 if (mobj is not None) and (mobj.group(1) is not None):
2619 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2620 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2622 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2625 file_url = mobj.group(1)
2626 file_extension = os.path.splitext(file_url)[1][1:]
2628 # Search for file title
2629 mobj = re.search(r'<b title="(.*?)">', webpage)
2631 self._downloader.trouble(u'ERROR: unable to extract title')
2633 file_title = mobj.group(1).decode('utf-8')
2636 # Process file information
2637 self._downloader.process_info({
2638 'id': file_id.decode('utf-8'),
2639 'url': file_url.decode('utf-8'),
2641 'upload_date': u'NA',
2642 'title': file_title,
2643 'stitle': file_title,
2644 'ext': file_extension.decode('utf-8'),
2648 except UnavailableVideoError, err:
2649 self._downloader.trouble(u'ERROR: unable to download file')
# FacebookIE: extractor for facebook.com/video/video.php?v=<id>. Logs in
# during initialization (credentials from options or .netrc), then scrapes
# metadata and format URLs out of JS fragments in the video page.
# NOTE(review): partial line-numbered dump — some control-flow lines are
# missing from view; comments describe only the visible statements.
2651 class FacebookIE(InfoExtractor):
2652 """Information Extractor for Facebook"""
2654 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2655 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2656 _NETRC_MACHINE = 'facebook'
# Ordered best-first; used for format selection below.
2657 _available_formats = ['highqual', 'lowqual']
2658 _video_extensions = {
2663 def __init__(self, downloader=None):
2664 InfoExtractor.__init__(self, downloader)
2668 return (re.match(FacebookIE._VALID_URL, url) is not None)
2670 def _reporter(self, message):
2671 """Add header and report message."""
2672 self._downloader.to_screen(u'[facebook] %s' % message)
2674 def report_login(self):
2675 """Report attempt to log in."""
2676 self._reporter(u'Logging in')
2678 def report_video_webpage_download(self, video_id):
2679 """Report attempt to download video webpage."""
2680 self._reporter(u'%s: Downloading video webpage' % video_id)
2682 def report_information_extraction(self, video_id):
2683 """Report attempt to extract video information."""
2684 self._reporter(u'%s: Extracting video information' % video_id)
2686 def _parse_page(self, video_webpage):
2687 """Extract video information from page"""
# Map of metadata field -> scraping regex against the raw page HTML/JS.
2689 data = {'title': r'class="video_title datawrap">(.*?)</',
2690 'description': r'<div class="datawrap">(.*?)</div>',
2691 'owner': r'\("video_owner_name", "(.*?)"\)',
2692 'upload_date': r'data-date="(.*?)"',
2693 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# Only fields whose regex matched end up in video_info.
2696 for piece in data.keys():
2697 mobj = re.search(data[piece], video_webpage)
2698 if mobj is not None:
2699 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2703 for fmt in self._available_formats:
2704 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2705 if mobj is not None:
2706 # URL is in a Javascript segment inside an escaped Unicode format within
2707 # the generally utf-8 page
2708 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2709 video_info['video_urls'] = video_urls
2713 def _real_initialize(self):
2714 if self._downloader is None:
2719 downloader_params = self._downloader.params
2721 # Attempt to use provided username and password or .netrc data
2722 if downloader_params.get('username', None) is not None:
2723 useremail = downloader_params['username']
2724 password = downloader_params['password']
2725 elif downloader_params.get('usenetrc', False):
2727 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2728 if info is not None:
2732 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2733 except (IOError, netrc.NetrcParseError), err:
2734 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: initialization proceeds without login.
2737 if useremail is None:
2746 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2749 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means login failed.
2750 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2751 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2753 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2754 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2757 def _real_extract(self, url):
2758 mobj = re.match(self._VALID_URL, url)
2760 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2762 video_id = mobj.group('ID')
2765 self.report_video_webpage_download(video_id)
2766 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2768 page = urllib2.urlopen(request)
2769 video_webpage = page.read()
2770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2771 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2774 # Start extracting information
2775 self.report_information_extraction(video_id)
2777 # Extract information
2778 video_info = self._parse_page(video_webpage)
2781 if 'owner' not in video_info:
2782 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2784 video_uploader = video_info['owner']
2787 if 'title' not in video_info:
2788 self._downloader.trouble(u'ERROR: unable to extract video title')
2790 video_title = video_info['title']
2791 video_title = video_title.decode('utf-8')
2792 video_title = sanitize_title(video_title)
# Filesystem-safe title: non-allowed char runs -> '_', trimmed.
2795 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2796 simple_title = simple_title.strip(ur'_')
2799 if 'thumbnail' not in video_info:
# Thumbnail is optional: warn and continue with an empty value.
2800 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2801 video_thumbnail = ''
2803 video_thumbnail = video_info['thumbnail']
2807 if 'upload_date' in video_info:
# Parse the RFC-2822-style date attribute into YYYYMMDD.
2808 upload_time = video_info['upload_date']
2809 timetuple = email.utils.parsedate_tz(upload_time)
2810 if timetuple is not None:
2812 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2817 video_description = video_info.get('description', 'No description available.')
2819 url_map = video_info['video_urls']
2820 if len(url_map.keys()) > 0:
2821 # Decide which formats to download
2822 req_format = self._downloader.params.get('format', None)
2823 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: keep only formats at or below the limit.
2825 if format_limit is not None and format_limit in self._available_formats:
2826 format_list = self._available_formats[self._available_formats.index(format_limit):]
2828 format_list = self._available_formats
2829 existing_formats = [x for x in format_list if x in url_map]
2830 if len(existing_formats) == 0:
2831 self._downloader.trouble(u'ERROR: no known formats available for video')
2833 if req_format is None:
2834 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2835 elif req_format == '-1':
2836 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2839 if req_format not in url_map:
2840 self._downloader.trouble(u'ERROR: requested format not available')
2842 video_url_list = [(req_format, url_map[req_format])] # Specific format
2844 for format_param, video_real_url in video_url_list:
2846 # At this point we have a new video
2847 self._downloader.increment_downloads()
2850 video_extension = self._video_extensions.get(format_param, 'mp4')
2853 # Process video information
2854 self._downloader.process_info({
2855 'id': video_id.decode('utf-8'),
2856 'url': video_real_url.decode('utf-8'),
2857 'uploader': video_uploader.decode('utf-8'),
2858 'upload_date': upload_date,
2859 'title': video_title,
2860 'stitle': simple_title,
2861 'ext': video_extension.decode('utf-8'),
# Python 2 and-or idiom standing in for a conditional expression.
2862 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2863 'thumbnail': video_thumbnail.decode('utf-8'),
2864 'description': video_description.decode('utf-8'),
2867 except UnavailableVideoError, err:
2868 self._downloader.trouble(u'\nERROR: unable to download video')
# BlipTVIE: extractor for blip.tv. Fetches the page's JSON representation
# (skin=json) instead of scraping HTML, then reads all metadata from the
# 'Post' object.
# NOTE(review): partial line-numbered dump — some control-flow lines are
# missing from view; comments describe only the visible statements.
2870 class BlipTVIE(InfoExtractor):
2871 """Information extractor for blip.tv"""
2873 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to sniff the media file extension off the media URL.
2874 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2878 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2880 def report_extraction(self, file_id):
2881 """Report information extraction."""
2882 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2884 def _simplify_title(self, title):
# Same filesystem-safe-title rule used by the other extractors.
2885 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2886 res = res.strip(ur'_')
2889 def _real_extract(self, url):
2890 mobj = re.match(self._VALID_URL, url)
2892 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar is '&' or '?' depending on whether the URL already has a query.
2899 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2900 request = urllib2.Request(json_url)
2901 self.report_extraction(mobj.group(1))
2903 json_code = urllib2.urlopen(request).read()
2904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2905 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2908 json_data = json.loads(json_code)
2909 if 'Post' in json_data:
2910 data = json_data['Post']
# blip.tv datestamps look like '09-17-11 07:00PM'; normalized to YYYYMMDD.
2914 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2915 video_url = data['media']['url']
2916 umobj = re.match(self._URL_EXT, video_url)
2918 raise ValueError('Can not determine filename extension')
2919 ext = umobj.group(1)
2921 self._downloader.increment_downloads()
2924 'id': data['item_id'],
2926 'uploader': data['display_name'],
2927 'upload_date': upload_date,
2928 'title': data['title'],
2929 'stitle': self._simplify_title(data['title']),
2931 'format': data['media']['mimeType'],
2932 'thumbnail': data['thumbnailUrl'],
2933 'description': data['description'],
2934 'player_url': data['embedUrl']
# KeyError covers missing JSON fields; ValueError covers bad dates/ext.
2936 except (ValueError,KeyError), err:
2937 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2941 self._downloader.process_info(info)
2942 except UnavailableVideoError, err:
2943 self._downloader.trouble(u'\nERROR: unable to download video')
# PostProcessor: abstract base for the post-processing chain; subclasses
# override run(). Mirrors the InfoExtractor mutual-registration pattern.
2946 class PostProcessor(object):
2947 """Post Processor class.
2949 PostProcessor objects can be added to downloaders with their
2950 add_post_processor() method. When the downloader has finished a
2951 successful download, it will take its internal chain of PostProcessors
2952 and start calling the run() method on each one of them, first with
2953 an initial argument and then with the returned value of the previous
2956 The chain will be stopped if one of them ever returns None or the end
2957 of the chain is reached.
2959 PostProcessor objects follow a "mutual registration" process similar
2960 to InfoExtractor objects.
2965 def __init__(self, downloader=None):
2966 self._downloader = downloader
2968 def set_downloader(self, downloader):
2969 """Sets the downloader for this PP."""
2970 self._downloader = downloader
2972 def run(self, information):
2973 """Run the PostProcessor.
2975 The "information" argument is a dictionary like the ones
2976 composed by InfoExtractors. The only difference is that this
2977 one has an extra field called "filepath" that points to the
2980 When this method returns None, the postprocessing chain is
2981 stopped. However, this method may return an information
2982 dictionary that will be passed to the next postprocessing
2983 object in the chain. It can be the one it received after
2984 changing some fields.
2986 In addition, this method may raise a PostProcessingError
2987 exception that will be taken into account by the downloader
# Base implementation: identity pass-through keeps the chain going.
2990 return information # by default, do nothing
# FFmpegExtractAudioPP: post-processor that extracts the audio track from a
# downloaded video using ffprobe (codec detection) and ffmpeg (transcode or
# lossless remux), then replaces 'filepath' in the info dict.
# NOTE(review): partial line-numbered dump — some control-flow lines are
# missing from view; comments describe only the visible statements.
2992 class FFmpegExtractAudioPP(PostProcessor):
2994 def __init__(self, downloader=None, preferredcodec=None):
2995 PostProcessor.__init__(self, downloader)
2996 if preferredcodec is None:
# 'best' means: keep the source codec losslessly when it is aac/mp3.
2997 preferredcodec = 'best'
2998 self._preferredcodec = preferredcodec
3001 def get_audio_codec(path):
# Probe the file's streams; '--' guards against paths starting with '-'.
3003 cmd = ['ffprobe', '-show_streams', '--', path]
3004 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3005 output = handle.communicate()[0]
3006 if handle.wait() != 0:
3008 except (IOError, OSError):
# Scan ffprobe output: remember the last codec_name seen, accept it
# once the matching stream turns out to be the audio stream.
3011 for line in output.split('\n'):
3012 if line.startswith('codec_name='):
3013 audio_codec = line.split('=')[1].strip()
3014 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3019 def run_ffmpeg(path, out_path, codec, more_opts):
3021 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3022 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3024 except (IOError, OSError):
3027 def run(self, information):
3028 path = information['filepath']
3030 filecodec = self.get_audio_codec(path)
3031 if filecodec is None:
3032 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3036 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3037 if filecodec == 'aac' or filecodec == 'mp3':
3038 # Lossless if possible
3040 extension = filecodec
3041 if filecodec == 'aac':
# Raw AAC needs the ADTS container to be playable standalone.
3042 more_opts = ['-f', 'adts']
3045 acodec = 'libmp3lame'
3047 more_opts = ['-ab', '128k']
3049 # We convert the audio (lossy)
3050 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3051 extension = self._preferredcodec
3052 more_opts = ['-ab', '128k']
3053 if self._preferredcodec == 'aac':
3054 more_opts += ['-f', 'adts']
3056 (prefix, ext) = os.path.splitext(path)
3057 new_path = prefix + '.' + extension
3058 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3059 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3062 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Removing the source video is best-effort; failure is only a warning.
3067 except (IOError, OSError):
3068 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the rest of the chain at the extracted audio file.
3071 information['filepath'] = new_path
# updateSelf: self-update — overwrite this script's own file with the latest
# version fetched from UPDATE_URL (see HEAD). Exits the process on failure.
3075 def updateSelf(downloader, filename):
3076 ''' Update the program file with the latest version from the repository '''
3077 # Note: downloader only used for options
# Bail out early if we cannot write to our own file.
3078 if not os.access(filename, os.W_OK):
3079 sys.exit('ERROR: no write permissions on %s' % filename)
3081 downloader.to_screen('Updating to latest version...')
3085 urlh = urllib.urlopen(UPDATE_URL)
3086 newcontent = urlh.read()
3089 except (IOError, OSError), err:
3090 sys.exit('ERROR: unable to download latest version')
# Binary mode: write the downloaded bytes verbatim.
3093 outf = open(filename, 'wb')
3095 outf.write(newcontent)
3098 except (IOError, OSError), err:
3099 sys.exit('ERROR: unable to overwrite current version')
3101 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Helper for the optparse help formatter: renders one option as
# "-s, --long METAVAR" using at most one short and one long flag.
# NOTE(review): the line initialising `opts = []` is missing from this
# elided excerpt; relies on optparse's private _short_opts/_long_opts.
3108 def _format_option_string(option):
3109 ''' ('-o', '--option') -> -o, --format METAVAR'''
3113 if option._short_opts: opts.append(option._short_opts[0])
3114 if option._long_opts: opts.append(option._long_opts[0])
# Separate short and long forms with ", " only when both are present.
3115 if len(opts) > 1: opts.insert(1, ', ')
3117 if option.takes_value(): opts.append(' %s' % option.metavar)
3119 return "".join(opts)
# Best-effort terminal-width detection: $COLUMNS first, then `stty size`.
# NOTE(review): elided excerpt -- the early return for $COLUMNS and the
# try/except around the subprocess call are missing from this view;
# presumably returns None when neither source works.
3121 def _find_term_columns():
3122 columns = os.environ.get('COLUMNS', None)
# `stty size` prints "rows cols"; the second field is the width.
3127 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3128 out,err = sp.communicate()
3129 return int(out.split()[1])
# --- Interior of parseOpts() (its `def` line is outside this excerpt) ---
# Builds the optparse parser with grouped options and returns
# (parser, opts, args).  Listing is elided; some glue lines are missing.
3135 max_help_position = 80
3137 # No need to wrap help messages if we're on a wide console
3138 columns = _find_term_columns()
3139 if columns: max_width = columns
# Plug the custom option-string renderer into the stock formatter.
3141 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3142 fmt.format_option_strings = _format_option_string
3145 'version' : __version__,
3147 'usage' : '%prog [options] url...',
# 'resolve' lets later definitions (e.g. -v as --version) override
# optparse's built-in conflicting options instead of raising.
3148 'conflict_handler' : 'resolve',
3151 parser = optparse.OptionParser(**kw)
# Option groups, in the order they are later attached to the parser.
3154 general = optparse.OptionGroup(parser, 'General Options')
3155 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3156 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3157 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3158 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3159 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3161 general.add_option('-h', '--help',
3162 action='help', help='print this help text and exit')
3163 general.add_option('-v', '--version',
3164 action='version', help='print program version and exit')
3165 general.add_option('-U', '--update',
3166 action='store_true', dest='update_self', help='update this program to latest version')
3167 general.add_option('-i', '--ignore-errors',
3168 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3169 general.add_option('-r', '--rate-limit',
3170 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3171 general.add_option('-R', '--retries',
3172 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3173 general.add_option('--playlist-start',
3174 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
# -1 is the sentinel for "until the end of the playlist".
3175 general.add_option('--playlist-end',
3176 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3177 general.add_option('--dump-user-agent',
3178 action='store_true', dest='dump_user_agent',
3179 help='display the current browser identification', default=False)
3181 authentication.add_option('-u', '--username',
3182 dest='username', metavar='USERNAME', help='account username')
3183 authentication.add_option('-p', '--password',
3184 dest='password', metavar='PASSWORD', help='account password')
3185 authentication.add_option('-n', '--netrc',
3186 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3189 video_format.add_option('-f', '--format',
3190 action='store', dest='format', metavar='FORMAT', help='video format code')
# '-1' is a magic format value meaning "all formats" (checked in main()).
3191 video_format.add_option('--all-formats',
3192 action='store_const', dest='format', help='download all available video formats', const='-1')
3193 video_format.add_option('--max-quality',
3194 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3197 verbosity.add_option('-q', '--quiet',
3198 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3199 verbosity.add_option('-s', '--simulate',
3200 action='store_true', dest='simulate', help='do not download video', default=False)
3201 verbosity.add_option('-g', '--get-url',
3202 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3203 verbosity.add_option('-e', '--get-title',
3204 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3205 verbosity.add_option('--get-thumbnail',
3206 action='store_true', dest='getthumbnail',
3207 help='simulate, quiet but print thumbnail URL', default=False)
3208 verbosity.add_option('--get-description',
3209 action='store_true', dest='getdescription',
3210 help='simulate, quiet but print video description', default=False)
3211 verbosity.add_option('--get-filename',
3212 action='store_true', dest='getfilename',
3213 help='simulate, quiet but print output filename', default=False)
3214 verbosity.add_option('--no-progress',
3215 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3216 verbosity.add_option('--console-title',
3217 action='store_true', dest='consoletitle',
3218 help='display progress in console titlebar', default=False)
3221 filesystem.add_option('-t', '--title',
3222 action='store_true', dest='usetitle', help='use title in file name', default=False)
3223 filesystem.add_option('-l', '--literal',
3224 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3225 filesystem.add_option('-A', '--auto-number',
3226 action='store_true', dest='autonumber',
3227 help='number downloaded files starting from 00000', default=False)
3228 filesystem.add_option('-o', '--output',
3229 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3230 filesystem.add_option('-a', '--batch-file',
3231 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3232 filesystem.add_option('-w', '--no-overwrites',
3233 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3234 filesystem.add_option('-c', '--continue',
3235 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3236 filesystem.add_option('--cookies',
3237 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3238 filesystem.add_option('--no-part',
3239 action='store_true', dest='nopart', help='do not use .part files', default=False)
# store_false: --no-mtime clears the default-True updatetime flag.
3240 filesystem.add_option('--no-mtime',
3241 action='store_false', dest='updatetime',
3242 help='do not use the Last-modified header to set the file modification time', default=True)
3243 filesystem.add_option('--write-description',
3244 action='store_true', dest='writedescription',
3245 help='write video description to a .description file', default=False)
3246 filesystem.add_option('--write-info-json',
3247 action='store_true', dest='writeinfojson',
3248 help='write video metadata to a .info.json file', default=False)
3251 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3252 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3253 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3254 help='"best", "aac" or "mp3"; best by default')
# Group attachment order determines the order in --help output.
3257 parser.add_option_group(general)
3258 parser.add_option_group(filesystem)
3259 parser.add_option_group(verbosity)
3260 parser.add_option_group(video_format)
3261 parser.add_option_group(authentication)
3262 parser.add_option_group(postproc)
3264 opts, args = parser.parse_args()
3266 return parser, opts, args
# --- Interior of main() (its `def` line is outside this excerpt) ---
# Parses options, configures urllib2/cookies, validates option
# combinations, wires up all info extractors and post-processors, then
# runs the download.  Elided listing: try:/else: lines and some
# statements are missing from this view.
3269 parser, opts, args = parseOpts()
3271 # Open appropriate CookieJar
3272 if opts.cookiefile is None:
# No --cookies: in-memory jar, discarded on exit.
3273 jar = cookielib.CookieJar()
3276 jar = cookielib.MozillaCookieJar(opts.cookiefile)
# Only load the file if it already exists and is readable; a missing
# file is fine (it will be created on save).
3277 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3279 except (IOError, OSError), err:
3280 sys.exit(u'ERROR: unable to open cookie file')
3283 if opts.dump_user_agent:
3284 print std_headers['User-Agent']
3287 # General configuration
3288 cookie_processor = urllib2.HTTPCookieProcessor(jar)
# Install a global opener so every urllib2 request shares proxy,
# cookie and YoutubeDLHandler behaviour.
3289 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3290 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3292 # Batch file verification
3294 if opts.batchfile is not None:
# '-' reads the URL list from stdin (elided branch).
3296 if opts.batchfile == '-':
3299 batchfd = open(opts.batchfile, 'r')
3300 batchurls = batchfd.readlines()
3301 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and comment lines starting with '#', '/' or ';'.
3302 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3304 sys.exit(u'ERROR: batch file could not be read')
# Batch-file URLs come before the command-line ones.
3305 all_urls = batchurls + args
3307 # Conflicting, missing and erroneous options
3308 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3309 parser.error(u'using .netrc conflicts with giving username/password')
3310 if opts.password is not None and opts.username is None:
3311 parser.error(u'account username missing')
3312 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3313 parser.error(u'using output template conflicts with using title, literal title or auto number')
3314 if opts.usetitle and opts.useliteral:
3315 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively (never echoed).
3316 if opts.username is not None and opts.password is None:
3317 opts.password = getpass.getpass(u'Type account password and press return:')
3318 if opts.ratelimit is not None:
# parse_bytes understands suffixed values such as 50k / 44.6m.
3319 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3320 if numeric_limit is None:
3321 parser.error(u'invalid rate limit specified')
3322 opts.ratelimit = numeric_limit
3323 if opts.retries is not None:
3325 opts.retries = long(opts.retries)
3326 except (TypeError, ValueError), err:
3327 parser.error(u'invalid retry count specified')
3329 opts.playliststart = int(opts.playliststart)
3330 if opts.playliststart <= 0:
3331 raise ValueError(u'Playlist start must be positive')
3332 except (TypeError, ValueError), err:
3333 parser.error(u'invalid playlist start number specified')
3335 opts.playlistend = int(opts.playlistend)
# -1 means "until the last video"; anything else must be a positive
# index not before the start.
3336 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3337 raise ValueError(u'Playlist end must be greater than playlist start')
3338 except (TypeError, ValueError), err:
3339 parser.error(u'invalid playlist end number specified')
3340 if opts.extractaudio:
3341 if opts.audioformat not in ['best', 'aac', 'mp3']:
3342 parser.error(u'invalid audio format specified')
3344 # Information extractors
# Several extractors wrap the YouTube one to delegate per-video work.
3345 youtube_ie = YoutubeIE()
3346 metacafe_ie = MetacafeIE(youtube_ie)
3347 dailymotion_ie = DailymotionIE()
3348 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3349 youtube_user_ie = YoutubeUserIE(youtube_ie)
3350 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3351 google_ie = GoogleIE()
3352 google_search_ie = GoogleSearchIE(google_ie)
3353 photobucket_ie = PhotobucketIE()
3354 yahoo_ie = YahooIE()
3355 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3356 deposit_files_ie = DepositFilesIE()
3357 facebook_ie = FacebookIE()
3358 bliptv_ie = BlipTVIE()
3359 vimeo_ie = VimeoIE()
3360 generic_ie = GenericIE()
# File downloader: any --get-* flag implies quiet + simulate.
3363 fd = FileDownloader({
3364 'usenetrc': opts.usenetrc,
3365 'username': opts.username,
3366 'password': opts.password,
3367 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3368 'forceurl': opts.geturl,
3369 'forcetitle': opts.gettitle,
3370 'forcethumbnail': opts.getthumbnail,
3371 'forcedescription': opts.getdescription,
3372 'forcefilename': opts.getfilename,
3373 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3374 'format': opts.format,
3375 'format_limit': opts.format_limit,
# Output template: first matching or-clause wins; explicit -o template
# beats the derived ones, and '%(id)s.%(ext)s' is the final fallback.
3376 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3377 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3378 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3379 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3380 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3381 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3382 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3383 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3384 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3385 or u'%(id)s.%(ext)s'),
3386 'ignoreerrors': opts.ignoreerrors,
3387 'ratelimit': opts.ratelimit,
3388 'nooverwrites': opts.nooverwrites,
3389 'retries': opts.retries,
3390 'continuedl': opts.continue_dl,
3391 'noprogress': opts.noprogress,
3392 'playliststart': opts.playliststart,
3393 'playlistend': opts.playlistend,
# -o '-' means write the video to stdout, so logs must go to stderr.
3394 'logtostderr': opts.outtmpl == '-',
3395 'consoletitle': opts.consoletitle,
3396 'nopart': opts.nopart,
3397 'updatetime': opts.updatetime,
3398 'writedescription': opts.writedescription,
3399 'writeinfojson': opts.writeinfojson,
# Registration order matters: more specific extractors (search,
# playlist, user) are tried before the plain YouTube one.
3401 fd.add_info_extractor(youtube_search_ie)
3402 fd.add_info_extractor(youtube_pl_ie)
3403 fd.add_info_extractor(youtube_user_ie)
3404 fd.add_info_extractor(metacafe_ie)
3405 fd.add_info_extractor(dailymotion_ie)
3406 fd.add_info_extractor(youtube_ie)
3407 fd.add_info_extractor(google_ie)
3408 fd.add_info_extractor(google_search_ie)
3409 fd.add_info_extractor(photobucket_ie)
3410 fd.add_info_extractor(yahoo_ie)
3411 fd.add_info_extractor(yahoo_search_ie)
3412 fd.add_info_extractor(deposit_files_ie)
3413 fd.add_info_extractor(facebook_ie)
3414 fd.add_info_extractor(bliptv_ie)
3415 fd.add_info_extractor(vimeo_ie)
3417 # This must come last since it's the
3418 # fallback if none of the others work
3419 fd.add_info_extractor(generic_ie)
3422 if opts.extractaudio:
3423 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# -U replaces sys.argv[0] (this script) with the latest release.
3426 if opts.update_self:
3427 updateSelf(fd, sys.argv[0])
3430 if len(all_urls) < 1:
# -U alone is a valid invocation without URLs.
3431 if not opts.update_self:
3432 parser.error(u'you must provide at least one URL')
3435 retcode = fd.download(all_urls)
3437 # Dump cookie jar if requested
3438 if opts.cookiefile is not None:
3441 except (IOError, OSError), err:
3442 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point.  NOTE(review): the `try:` line and the call to
# main() are elided here.  Known fatal conditions are mapped to clean
# exit messages instead of tracebacks.
3447 if __name__ == '__main__':
3450 except DownloadError:
3452 except SameFileError:
# Multiple downloads would overwrite each other under one fixed -o name.
3453 sys.exit(u'ERROR: fixed output name but more than one file to download')
3454 except KeyboardInterrupt:
3455 sys.exit(u'\nERROR: Interrupted by user')
3457 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: