2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.30'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
# Characters considered safe for simplified titles: ASCII letters and digits.
simple_title_chars = (string.ascii_letters + string.digits).decode('ascii')
try:
	import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
	import re
	class json(object):
		"""Minimal stand-in for the stdlib json module (decoding only)."""
		@staticmethod
		def loads(s):
			"""Decode a UTF-8 encoded JSON document and return the value."""
			s = s.decode('UTF-8')
			def raiseError(msg, i):
				# Include position and remaining input to ease debugging.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				while i < len(s) and s[i] in ' \t\r\n':
					i += 1
				if expectMore:
					if i >= len(s):
						raiseError('Premature end', i)
				return i
			def decodeEscape(match):
				esc = match.group(1)
				_STATIC = {
					'"': '"',
					'\\': '\\',
					'/': '/',
					'b': unichr(0x8),
					'f': unichr(0xc),
					'n': '\n',
					'r': '\r',
					't': '\t',
				}
				if esc in _STATIC:
					return _STATIC[esc]
				if esc[0] == 'u':
					if len(esc) == 1+4:
						return unichr(int(esc[1:5], 16))
					# Surrogate pair (\uD800-\uDBFF followed by \uDC00-\uDFFF)
					if len(esc) == 5+6 and esc[5:7] == '\\u':
						hi = int(esc[1:5], 16)
						low = int(esc[7:11], 16)
						return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			def parseString(i):
				i += 1
				e = i
				# Find the closing quote, skipping backslash-escaped ones.
				while True:
					e = s.index('"', e)
					bslashes = 0
					while s[e-bslashes-1] == '\\':
						bslashes += 1
					if bslashes % 2 == 1:
						e += 1
						continue
					break
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
				return (e+1,stri)
			def parseObj(i):
				i += 1
				res = {}
				i = skipSpace(i)
				if s[i] == '}': # Empty dictionary
					return (i+1,res)
				while True:
					if s[i] != '"':
						raiseError('Expected a string object key', i)
					i,key = parseString(i)
					i = skipSpace(i)
					if i >= len(s) or s[i] != ':':
						raiseError('Expected a colon', i)
					i,val = parse(i+1)
					res[key] = val
					i = skipSpace(i)
					if s[i] == '}':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected comma or closing curly brace', i)
					i = skipSpace(i+1)
			def parseArray(i):
				res = []
				i = skipSpace(i+1)
				if s[i] == ']': # Empty array
					return (i+1,res)
				while True:
					i,val = parse(i)
					res.append(val)
					i = skipSpace(i) # Raise exception if premature end
					if s[i] == ']':
						return (i+1, res)
					if s[i] != ',':
						raiseError('Expected a comma or closing bracket', i)
					i = skipSpace(i+1)
			def parseDiscrete(i):
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						return (i+len(k), v)
				raiseError('Not a boolean (or null)', i)
			def parseNumber(i):
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				if mobj is None:
					raiseError('Not a number', i)
				nums = mobj.group(1)
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			def parse(i):
				# Dispatch on the first significant character; numbers are the fallback.
				i = skipSpace(i)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
				return (i,res)
			i,res = parse(0)
			if i < len(s):
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
			return res
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator and called .next(),
	# which added nothing; a plain try/except is equivalent and simpler.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually exists before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		# Unknown or unusable encoding: fall back to a safe default.
		pref = 'UTF-8'
	return pref
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# Returns None when the string cannot be parsed.
	parsed = email.utils.parsedate_tz(timestr)
	if parsed is None:
		return None
	return email.utils.mktime_tz(parsed)
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects that are not configured to continue
	on errors; carries the relevant error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to indicate an error in the
	postprocessing task.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that is not available
	for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file is smaller
	than what the server announced first, indicating the connection was
	probably interrupted.
	"""

	def __init__(self, downloaded, expected):
		# Both values are byte counts.
		self.downloaded = downloaded
		self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		# Try raw deflate first (no zlib header), then zlib-wrapped deflate.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Newer Pythons accept the response code in the constructor.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Re-add each standard header so our values take precedence.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Honour the sentinel header by stripping Accept-encoding.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
468 def format_bytes(bytes):
471 if type(bytes) is str:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488 def calc_eta(start, now, total, current):
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
552 def to_stderr(self, message):
553 """Print message to stderr."""
554 print >>sys.stderr, message.encode(preferredencoding())
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
571 def trouble(self, message=None):
572 """Determine action to take when a download problem appears.
574 Depending on if the downloader has been configured to ignore
575 download errors or not, this method may throw an exception or
576 not when errors are found, after printing the message.
578 if message is not None:
579 self.to_stderr(message)
580 if not self.params.get('ignoreerrors', False):
581 raise DownloadError(message)
582 self._download_retcode = 1
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
604 def undo_temp_name(self, filename):
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
609 def try_rename(self, old_filename, new_filename):
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
630 os.utime(filename, (time.time(), filetime))
635 def report_writedescription(self, descfn):
636 """ Report that the description file is being written """
637 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
639 def report_writeinfojson(self, infofn):
640 """ Report that the metadata file has been written """
641 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
643 def report_destination(self, filename):
644 """Report destination filename."""
645 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
647 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648 """Report download progress."""
649 if self.params.get('noprogress', False):
651 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
656 def report_resuming_byte(self, resume_len):
657 """Report attempt to resume at given byte."""
658 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
660 def report_retry(self, count, retries):
661 """Report retry in case of HTTP error 5xx"""
662 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
664 def report_file_already_downloaded(self, file_name):
665 """Report file has already been fully downloaded."""
667 self.to_screen(u'[download] %s has already been downloaded' % file_name)
668 except (UnicodeEncodeError), err:
669 self.to_screen(u'[download] The file has already been downloaded')
671 def report_unable_to_resume(self):
672 """Report it was impossible to resume download."""
673 self.to_screen(u'[download] Unable to resume')
675 def report_finish(self):
676 """Report download finished."""
677 if self.params.get('noprogress', False):
678 self.to_screen(u'[download] Download completed')
682 def increment_downloads(self):
683 """Increment the ordinal that assigns a number to each file."""
684 self._num_downloads += 1
686 def prepare_filename(self, info_dict):
687 """Generate the output filename."""
689 template_dict = dict(info_dict)
690 template_dict['epoch'] = unicode(long(time.time()))
691 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692 filename = self.params['outtmpl'] % template_dict
694 except (ValueError, KeyError), err:
695 self.trouble(u'ERROR: invalid system charset or erroneous output template')
698 def process_info(self, info_dict):
699 """Process a single dictionary returned by an InfoExtractor."""
700 filename = self.prepare_filename(info_dict)
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
713 if self.params.get('forceformat', False):
714 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
716 # Do nothing else if in simulate mode
717 if self.params.get('simulate', False):
723 matchtitle=self.params.get('matchtitle',False)
724 rejecttitle=self.params.get('rejecttitle',False)
725 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
726 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
727 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
729 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
730 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
733 if self.params.get('nooverwrites', False) and os.path.exists(filename):
734 self.to_stderr(u'WARNING: file exists and will be skipped')
738 dn = os.path.dirname(filename)
739 if dn != '' and not os.path.exists(dn):
741 except (OSError, IOError), err:
742 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
745 if self.params.get('writedescription', False):
747 descfn = filename + '.description'
748 self.report_writedescription(descfn)
749 descfile = open(descfn, 'wb')
751 descfile.write(info_dict['description'].encode('utf-8'))
754 except (OSError, IOError):
755 self.trouble(u'ERROR: Cannot write description file ' + descfn)
758 if self.params.get('writeinfojson', False):
759 infofn = filename + '.info.json'
760 self.report_writeinfojson(infofn)
763 except (NameError,AttributeError):
764 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
767 infof = open(infofn, 'wb')
769 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
770 json.dump(json_info_dict, infof)
773 except (OSError, IOError):
774 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
777 if not self.params.get('skip_download', False):
779 success = self._do_download(filename, info_dict)
780 except (OSError, IOError), err:
781 raise UnavailableVideoError
782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
783 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
785 except (ContentTooShortError, ), err:
786 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
791 self.post_process(filename, info_dict)
792 except (PostProcessingError), err:
793 self.trouble(u'ERROR: postprocessing: %s' % str(err))
796 def download(self, url_list):
797 """Download a given list of URLs."""
798 if len(url_list) > 1 and self.fixed_template():
799 raise SameFileError(self.params['outtmpl'])
802 suitable_found = False
804 # Go to next InfoExtractor if not suitable
805 if not ie.suitable(url):
808 # Suitable InfoExtractor found
809 suitable_found = True
811 # Extract information from URL and process it
814 # Suitable InfoExtractor had been found; go to next URL
817 if not suitable_found:
818 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
820 return self._download_retcode
822 def post_process(self, filename, ie_info):
823 """Run the postprocessing chain on the given file."""
825 info['filepath'] = filename
831 def _download_with_rtmpdump(self, filename, url, player_url):
832 self.report_destination(filename)
833 tmpfilename = self.temp_name(filename)
835 # Check for rtmpdump first
837 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
838 except (OSError, IOError):
839 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
842 # Download using rtmpdump. rtmpdump returns exit code 2 when
843 # the connection was interrumpted and resuming appears to be
844 # possible. This is part of rtmpdump's normal usage, AFAIK.
845 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
846 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
847 while retval == 2 or retval == 1:
848 prevsize = os.path.getsize(tmpfilename)
849 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
850 time.sleep(5.0) # This seems to be needed
851 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
852 cursize = os.path.getsize(tmpfilename)
853 if prevsize == cursize and retval == 1:
855 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
856 if prevsize == cursize and retval == 2 and cursize > 1024:
857 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
861 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
862 self.try_rename(tmpfilename, filename)
865 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
868 def _do_download(self, filename, info_dict):
869 url = info_dict['url']
870 player_url = info_dict.get('player_url', None)
872 # Check file already present
873 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
874 self.report_file_already_downloaded(filename)
877 # Attempt to download using rtmpdump
878 if url.startswith('rtmp'):
879 return self._download_with_rtmpdump(filename, url, player_url)
881 tmpfilename = self.temp_name(filename)
884 # Do not include the Accept-Encoding header
885 headers = {'Youtubedl-no-compression': 'True'}
886 basic_request = urllib2.Request(url, None, headers)
887 request = urllib2.Request(url, None, headers)
889 # Establish possible resume length
890 if os.path.isfile(tmpfilename):
891 resume_len = os.path.getsize(tmpfilename)
897 if self.params.get('continuedl', False):
898 self.report_resuming_byte(resume_len)
899 request.add_header('Range','bytes=%d-' % resume_len)
905 retries = self.params.get('retries', 0)
906 while count <= retries:
907 # Establish connection
909 if count == 0 and 'urlhandle' in info_dict:
910 data = info_dict['urlhandle']
911 data = urllib2.urlopen(request)
913 except (urllib2.HTTPError, ), err:
914 if (err.code < 500 or err.code >= 600) and err.code != 416:
915 # Unexpected HTTP error
917 elif err.code == 416:
918 # Unable to resume (requested range not satisfiable)
920 # Open the connection again without the range header
921 data = urllib2.urlopen(basic_request)
922 content_length = data.info()['Content-Length']
923 except (urllib2.HTTPError, ), err:
924 if err.code < 500 or err.code >= 600:
927 # Examine the reported length
928 if (content_length is not None and
929 (resume_len - 100 < long(content_length) < resume_len + 100)):
930 # The file had already been fully downloaded.
931 # Explanation to the above condition: in issue #175 it was revealed that
932 # YouTube sometimes adds or removes a few bytes from the end of the file,
933 # changing the file size slightly and causing problems for some users. So
934 # I decided to implement a suggested change and consider the file
935 # completely downloaded if the file size differs less than 100 bytes from
936 # the one in the hard drive.
937 self.report_file_already_downloaded(filename)
938 self.try_rename(tmpfilename, filename)
941 # The length does not match, we start the download over
942 self.report_unable_to_resume()
948 self.report_retry(count, retries)
951 self.trouble(u'ERROR: giving up after %s retries' % retries)
954 data_len = data.info().get('Content-length', None)
955 if data_len is not None:
956 data_len = long(data_len) + resume_len
957 data_len_str = self.format_bytes(data_len)
958 byte_counter = 0 + resume_len
964 data_block = data.read(block_size)
966 if len(data_block) == 0:
968 byte_counter += len(data_block)
970 # Open file just in time
973 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
974 assert stream is not None
975 filename = self.undo_temp_name(tmpfilename)
976 self.report_destination(filename)
977 except (OSError, IOError), err:
978 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
981 stream.write(data_block)
982 except (IOError, OSError), err:
983 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
985 block_size = self.best_block_size(after - before, len(data_block))
988 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
990 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
992 percent_str = self.calc_percent(byte_counter, data_len)
993 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
994 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
997 self.slow_down(start, byte_counter - resume_len)
1000 self.trouble(u'\nERROR: Did not get any data blocks')
1003 self.report_finish()
1004 if data_len is not None and byte_counter != data_len:
1005 raise ContentTooShortError(byte_counter, long(data_len))
1006 self.try_rename(tmpfilename, filename)
1008 # Update file modification time
1009 if self.params.get('updatetime', True):
1010 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods and define a _VALID_URL regexp.
	Probably, they should also be added to the list of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc). Idempotent."""
		if not self._ready:
			self._real_initialize()
			self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1085 class YoutubeIE(InfoExtractor):
1086 """Information extractor for youtube.com."""
1088 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1089 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1090 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1091 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1092 _NETRC_MACHINE = 'youtube'
1093 # Listed in order of quality
1094 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1095 _video_extensions = {
1101 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1106 _video_dimensions = {
1121 IE_NAME = u'youtube'
def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    # Fixed copy-pasted docstring: this reports an *unavailable format*,
    # not an extracted video URL.
    """Report that the requested format is not available."""
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
1155 def _print_formats(self, formats):
1156 print 'Available formats:'
1158 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Set site language, then log in (explicit options or .netrc) and confirm
# age so restricted videos are reachable.  NOTE(review): this view of the
# file is line-sampled -- 'try:', 'return' and dict-opening lines are
# elided, so only the visible lines are annotated.
1160 def _real_initialize(self):
1161 if self._downloader is None:
1166 downloader_params = self._downloader.params
1168 # Attempt to use provided username and password or .netrc data
1169 if downloader_params.get('username', None) is not None:
1170 username = downloader_params['username']
1171 password = downloader_params['password']
1172 elif downloader_params.get('usenetrc', False):
1174 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1175 if info is not None:
1179 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1180 except (IOError, netrc.NetrcParseError), err:
1181 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the English site so later regex-based scraping is stable.
1185 request = urllib2.Request(self._LANG_URL)
1188 urllib2.urlopen(request).read()
1189 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1190 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1193 # No authentication to be performed
1194 if username is None:
1199 'current_form': 'loginForm',
1201 'action_login': 'Log In',
1202 'username': username,
1203 'password': password,
1205 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1208 login_results = urllib2.urlopen(request).read()
# A login form still present in the response means the credentials
# were rejected; failure is a warning, not fatal.
1209 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1210 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1212 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1213 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1219 'action_confirm': 'Confirm',
1221 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1223 self.report_age_confirmation()
1224 age_results = urllib2.urlopen(request).read()
# Unlike login, a failed age confirmation is treated as an error.
1225 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1226 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Download the watch page and get_video_info, pick the format(s) to fetch,
# and hand one info dict per format to the downloader.  NOTE(review): this
# view is line-sampled -- 'try:', 'return', 'else:' and 'if mobj is None:'
# guard lines are elided; only visible lines are annotated.
1229 def _real_extract(self, url):
1230 # Extract video id from URL
1231 mobj = re.match(self._VALID_URL, url)
1233 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1235 video_id = mobj.group(2)
1238 self.report_video_webpage_download(video_id)
1239 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1241 video_webpage = urllib2.urlopen(request).read()
1242 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1243 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1246 # Attempt to extract SWF player URL
1247 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1248 if mobj is not None:
# Unescape the JSON-style backslash escapes in the SWF URL.
1249 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' variants of get_video_info until one returns a token.
1254 self.report_video_info_webpage_download(video_id)
1255 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1256 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1257 % (video_id, el_type))
1258 request = urllib2.Request(video_info_url)
1260 video_info_webpage = urllib2.urlopen(request).read()
1261 video_info = parse_qs(video_info_webpage)
1262 if 'token' in video_info:
1264 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1265 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1267 if 'token' not in video_info:
1268 if 'reason' in video_info:
1269 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1271 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1274 # Start extracting information
1275 self.report_information_extraction(video_id)
1278 if 'author' not in video_info:
1279 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1281 video_uploader = urllib.unquote_plus(video_info['author'][0])
1284 if 'title' not in video_info:
1285 self._downloader.trouble(u'ERROR: unable to extract video title')
1287 video_title = urllib.unquote_plus(video_info['title'][0])
1288 video_title = video_title.decode('utf-8')
1289 video_title = sanitize_title(video_title)
# Build the 'simple title' (filesystem-safe): collapse any run of
# non-alphanumerics to '_' and trim leading/trailing underscores.
1292 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1293 simple_title = simple_title.strip(ur'_')
1296 if 'thumbnail_url' not in video_info:
1297 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1298 video_thumbnail = ''
1299 else: # don't panic if we can't find it
1300 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page, tried in several formats
# and normalised to YYYYMMDD.
1304 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1305 if mobj is not None:
1306 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1307 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1308 for expression in format_expressions:
1310 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1318 video_description = u'No description available.'
1319 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1320 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1321 if mobj is not None:
1322 video_description = mobj.group(1).decode('utf-8')
# NOTE(review): lxml is a third-party dependency; presumably guarded
# by a try/import elsewhere -- confirm before relying on this branch.
1324 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1325 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1326 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1327 # TODO use another parser
1330 video_token = urllib.unquote_plus(video_info['token'][0])
1332 # Decide which formats to download
1333 req_format = self._downloader.params.get('format', None)
1335 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1336 self.report_rtmp_download()
1337 video_url_list = [(None, video_info['conn'][0])]
1338 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1339 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1340 url_data = [parse_qs(uds) for uds in url_data_strs]
1341 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1342 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1344 format_limit = self._downloader.params.get('format_limit', None)
1345 if format_limit is not None and format_limit in self._available_formats:
# Quality cap: keep only formats at or below the requested limit.
1346 format_list = self._available_formats[self._available_formats.index(format_limit):]
1348 format_list = self._available_formats
1349 existing_formats = [x for x in format_list if x in url_map]
1350 if len(existing_formats) == 0:
1351 self._downloader.trouble(u'ERROR: no known formats available for video')
1353 if self._downloader.params.get('listformats', None):
1354 self._print_formats(existing_formats)
1356 if req_format is None or req_format == 'best':
1357 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1358 elif req_format == 'worst':
1359 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1360 elif req_format in ('-1', 'all'):
1361 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1363 # Specific formats. We pick the first in a slash-delimeted sequence.
1364 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1365 req_formats = req_format.split('/')
1366 video_url_list = None
1367 for rf in req_formats:
1369 video_url_list = [(rf, url_map[rf])]
1371 if video_url_list is None:
1372 self._downloader.trouble(u'ERROR: requested format not available')
1375 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1378 for format_param, video_real_url in video_url_list:
1379 # At this point we have a new video
1380 self._downloader.increment_downloads()
1383 video_extension = self._video_extensions.get(format_param, 'flv')
1386 # Process video information
1387 self._downloader.process_info({
1388 'id': video_id.decode('utf-8'),
1389 'url': video_real_url.decode('utf-8'),
1390 'uploader': video_uploader.decode('utf-8'),
1391 'upload_date': upload_date,
1392 'title': video_title,
1393 'stitle': simple_title,
1394 'ext': video_extension.decode('utf-8'),
1395 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1396 'thumbnail': video_thumbnail.decode('utf-8'),
1397 'description': video_description,
1398 'player_url': player_url,
1400 except UnavailableVideoError, err:
1401 self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter endpoints used during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, youtube_ie, downloader=None):
        # YouTube-hosted Metacafe entries are delegated to this extractor.
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie
def report_disclaimer(self):
    """Report that the family-filter disclaimer page is being retrieved."""
    self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

def report_age_confirmation(self):
    """Report the age-confirmation step."""
    self._downloader.to_screen(u'[metacafe] Confirming age')

def report_download_webpage(self, video_id):
    """Report the video webpage download."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report the information-extraction step."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
# Fetch the disclaimer page and POST the family-filter opt-out so that
# filtered videos become reachable.  NOTE(review): line-sampled view --
# 'try:', 'return' and the disclaimer_form dict opening are elided.
1433 def _real_initialize(self):
1434 # Retrieve disclaimer
1435 request = urllib2.Request(self._DISCLAIMER)
1437 self.report_disclaimer()
1438 disclaimer = urllib2.urlopen(request).read()
1439 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1440 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1446 'submit': "Continue - I'm over 18",
1448 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1450 self.report_age_confirmation()
1451 disclaimer = urllib2.urlopen(request).read()
1452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Extract a Metacafe video; 'yt-'-prefixed ids are YouTube-hosted and are
# delegated to the YouTube extractor.  NOTE(review): line-sampled view --
# 'try:', 'return' and 'if mobj is None:' guard lines are elided.
1456 def _real_extract(self, url):
1457 # Extract id and simplified title from URL
1458 mobj = re.match(self._VALID_URL, url)
1460 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1463 video_id = mobj.group(1)
1465 # Check if video comes from YouTube
1466 mobj2 = re.match(r'^yt-(.*)$', video_id)
1467 if mobj2 is not None:
1468 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1471 # At this point we have a new video
1472 self._downloader.increment_downloads()
1474 simple_title = mobj.group(2).decode('utf-8')
1476 # Retrieve video webpage to extract further information
1477 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1479 self.report_download_webpage(video_id)
1480 webpage = urllib2.urlopen(request).read()
1481 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1482 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1485 # Extract URL, uploader and title from webpage
1486 self.report_extraction(video_id)
1487 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1488 if mobj is not None:
1489 mediaURL = urllib.unquote(mobj.group(1))
1490 video_extension = mediaURL[-3:]
1492 # Extract gdaKey if available
1493 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1495 video_url = mediaURL
1497 gdaKey = mobj.group(1)
1498 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback: parse the flashvars blob when no direct mediaURL is present.
1500 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1502 self._downloader.trouble(u'ERROR: unable to extract media URL')
1504 vardict = parse_qs(mobj.group(1))
1505 if 'mediaData' not in vardict:
1506 self._downloader.trouble(u'ERROR: unable to extract media URL')
1508 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1510 self._downloader.trouble(u'ERROR: unable to extract media URL')
1512 mediaURL = mobj.group(1).replace('\\/', '/')
1513 video_extension = mediaURL[-3:]
1514 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1516 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1518 self._downloader.trouble(u'ERROR: unable to extract title')
1520 video_title = mobj.group(1).decode('utf-8')
1521 video_title = sanitize_title(video_title)
1523 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1525 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1527 video_uploader = mobj.group(1)
1530 # Process video information
1531 self._downloader.process_info({
1532 'id': video_id.decode('utf-8'),
1533 'url': video_url.decode('utf-8'),
1534 'uploader': video_uploader.decode('utf-8'),
1535 'upload_date': u'NA',
1536 'title': video_title,
1537 'stitle': simple_title,
1538 'ext': video_extension.decode('utf-8'),
1542 except UnavailableVideoError:
1543 self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1: video id, group 2: slug used as the simple title.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report the video webpage download."""
    message = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report the information-extraction step."""
    message = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1563 def _real_initialize(self):
# Extract a Dailymotion video via the flash 'sequence' variable embedded
# in the page.  NOTE(review): line-sampled view -- 'try:', 'return' and
# 'if mobj is None:' guard lines are elided.
1566 def _real_extract(self, url):
1567 # Extract id and simplified title from URL
1568 mobj = re.match(self._VALID_URL, url)
1570 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1573 # At this point we have a new video
1574 self._downloader.increment_downloads()
1575 video_id = mobj.group(1)
1577 simple_title = mobj.group(2).decode('utf-8')
1578 video_extension = 'flv'
1580 # Retrieve video webpage to extract further information
1581 request = urllib2.Request(url)
# Disable the family filter so restricted videos load.
1582 request.add_header('Cookie', 'family_filter=off')
1584 self.report_download_webpage(video_id)
1585 webpage = urllib2.urlopen(request).read()
1586 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1587 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1590 # Extract URL, uploader and title from webpage
1591 self.report_extraction(video_id)
1592 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1594 self._downloader.trouble(u'ERROR: unable to extract media URL')
1596 sequence = urllib.unquote(mobj.group(1))
1597 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1599 self._downloader.trouble(u'ERROR: unable to extract media URL')
1601 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1603 # if needed add http://www.dailymotion.com/ if relative URL
1605 video_url = mediaURL
1607 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1609 self._downloader.trouble(u'ERROR: unable to extract title')
1611 video_title = mobj.group(1).decode('utf-8')
1612 video_title = sanitize_title(video_title)
1614 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1616 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1618 video_uploader = mobj.group(1)
1621 # Process video information
1622 self._downloader.process_info({
1623 'id': video_id.decode('utf-8'),
1624 'url': video_url.decode('utf-8'),
1625 'uploader': video_uploader.decode('utf-8'),
1626 'upload_date': u'NA',
1627 'title': video_title,
1628 'stitle': simple_title,
1629 'ext': video_extension.decode('utf-8'),
1633 except UnavailableVideoError:
1634 self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Accepts the national Google Video domains; group 1 is the docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report the video webpage download."""
    message = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report the information-extraction step."""
    message = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1654 def _real_initialize(self):
# Extract a Google Video entry; falls back from mp4 download_url to the
# flash videoUrl.  NOTE(review): line-sampled view -- 'try:', 'return'
# and 'if mobj is None:' guard lines are elided.
1657 def _real_extract(self, url):
1658 # Extract id from URL
1659 mobj = re.match(self._VALID_URL, url)
1661 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1664 # At this point we have a new video
1665 self._downloader.increment_downloads()
1666 video_id = mobj.group(1)
1668 video_extension = 'mp4'
1670 # Retrieve video webpage to extract further information
1671 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1673 self.report_download_webpage(video_id)
1674 webpage = urllib2.urlopen(request).read()
1675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1679 # Extract URL, uploader, and title from webpage
1680 self.report_extraction(video_id)
1681 mobj = re.search(r"download_url:'([^']+)'", webpage)
# No mp4 download link: fall back to the escaped flash videoUrl.
1683 video_extension = 'flv'
1684 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1686 self._downloader.trouble(u'ERROR: unable to extract media URL')
1688 mediaURL = urllib.unquote(mobj.group(1))
# Undo the '\x3d'/'\x26' ('='/'&') escaping in the page source.
1689 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1690 mediaURL = mediaURL.replace('\\x26', '\x26')
1692 video_url = mediaURL
1694 mobj = re.search(r'<title>(.*)</title>', webpage)
1696 self._downloader.trouble(u'ERROR: unable to extract title')
1698 video_title = mobj.group(1).decode('utf-8')
1699 video_title = sanitize_title(video_title)
1700 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1702 # Extract video description
1703 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1705 self._downloader.trouble(u'ERROR: unable to extract video description')
1707 video_description = mobj.group(1).decode('utf-8')
1708 if not video_description:
1709 video_description = 'No description available.'
1711 # Extract video thumbnail
1712 if self._downloader.params.get('forcethumbnail', False):
1713 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1715 webpage = urllib2.urlopen(request).read()
1716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1719 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1721 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1723 video_thumbnail = mobj.group(1)
1724 else: # we need something to pass to process_info
1725 video_thumbnail = ''
1728 # Process video information
1729 self._downloader.process_info({
1730 'id': video_id.decode('utf-8'),
1731 'url': video_url.decode('utf-8'),
1733 'upload_date': u'NA',
1734 'title': video_title,
1735 'stitle': simple_title,
1736 'ext': video_extension.decode('utf-8'),
1740 except UnavailableVideoError:
1741 self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only .flv media pages are supported; group 1 is the media file name.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report the video webpage download."""
    message = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report the information-extraction step."""
    message = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1761 def _real_initialize(self):
# Extract a Photobucket flv via the video_src <link> tag.  NOTE(review):
# line-sampled view -- 'try:', 'return' and 'if mobj is None:' guard
# lines are elided.
1764 def _real_extract(self, url):
1765 # Extract id from URL
1766 mobj = re.match(self._VALID_URL, url)
1768 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1771 # At this point we have a new video
1772 self._downloader.increment_downloads()
1773 video_id = mobj.group(1)
1775 video_extension = 'flv'
1777 # Retrieve video webpage to extract further information
1778 request = urllib2.Request(url)
1780 self.report_download_webpage(video_id)
1781 webpage = urllib2.urlopen(request).read()
1782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1786 # Extract URL, uploader, and title from webpage
1787 self.report_extraction(video_id)
1788 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1790 self._downloader.trouble(u'ERROR: unable to extract media URL')
1792 mediaURL = urllib.unquote(mobj.group(1))
1794 video_url = mediaURL
# A single regex captures both title (group 1) and uploader (group 2).
1796 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1798 self._downloader.trouble(u'ERROR: unable to extract title')
1800 video_title = mobj.group(1).decode('utf-8')
1801 video_title = sanitize_title(video_title)
1802 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1804 video_uploader = mobj.group(2).decode('utf-8')
1807 # Process video information
1808 self._downloader.process_info({
1809 'id': video_id.decode('utf-8'),
1810 'url': video_url.decode('utf-8'),
1811 'uploader': video_uploader,
1812 'upload_date': u'NA',
1813 'title': video_title,
1814 'stitle': simple_title,
1815 'ext': video_extension.decode('utf-8'),
1819 except UnavailableVideoError:
1820 self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches every Yahoo! Video URL; _VPAGE_URL only the
    # directly extractable '/watch/' pages.
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report the video webpage download."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report the information-extraction step."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1843 def _real_initialize(self):
# Extract a Yahoo! Video entry.  Non-/watch/ URLs are first rewritten to
# the extractable English /watch/ form via one recursive call
# (new_video=False).  NOTE(review): line-sampled view -- 'try:', 'return'
# and 'if mobj is None:' guard lines are elided.
1846 def _real_extract(self, url, new_video=True):
1847 # Extract ID from URL
1848 mobj = re.match(self._VALID_URL, url)
1850 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1853 # At this point we have a new video
1854 self._downloader.increment_downloads()
1855 video_id = mobj.group(2)
1856 video_extension = 'flv'
1858 # Rewrite valid but non-extractable URLs as
1859 # extractable English language /watch/ URLs
1860 if re.match(self._VPAGE_URL, url) is None:
1861 request = urllib2.Request(url)
1863 webpage = urllib2.urlopen(request).read()
1864 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1865 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1868 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1870 self._downloader.trouble(u'ERROR: Unable to extract id field')
1872 yahoo_id = mobj.group(1)
1874 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1876 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1878 yahoo_vid = mobj.group(1)
1880 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1881 return self._real_extract(url, new_video=False)
1883 # Retrieve video webpage to extract further information
1884 request = urllib2.Request(url)
1886 self.report_download_webpage(video_id)
1887 webpage = urllib2.urlopen(request).read()
1888 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1889 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1892 # Extract uploader and title from webpage
1893 self.report_extraction(video_id)
1894 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1896 self._downloader.trouble(u'ERROR: unable to extract video title')
1898 video_title = mobj.group(1).decode('utf-8')
1899 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1901 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1903 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) captures the 'people|profile' alternation, not
# the uploader name -- group(2) looks like the intended capture; verify.
1905 video_uploader = mobj.group(1).decode('utf-8')
1907 # Extract video thumbnail
1908 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1910 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1912 video_thumbnail = mobj.group(1).decode('utf-8')
1914 # Extract video description
1915 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1917 self._downloader.trouble(u'ERROR: unable to extract video description')
1919 video_description = mobj.group(1).decode('utf-8')
1920 if not video_description:
1921 video_description = 'No description available.'
1923 # Extract video height and width
1924 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1926 self._downloader.trouble(u'ERROR: unable to extract video height')
1928 yv_video_height = mobj.group(1)
1930 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1932 self._downloader.trouble(u'ERROR: unable to extract video width')
1934 yv_video_width = mobj.group(1)
1936 # Retrieve video playlist to extract media URL
1937 # I'm not completely sure what all these options are, but we
1938 # seem to need most of them, otherwise the server sends a 401.
1939 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1940 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1941 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1942 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1943 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1945 self.report_download_webpage(video_id)
1946 webpage = urllib2.urlopen(request).read()
1947 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1948 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1951 # Extract media URL from playlist XML
1952 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1954 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1956 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1957 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1960 # Process video information
1961 self._downloader.process_info({
1962 'id': video_id.decode('utf-8'),
1964 'uploader': video_uploader,
1965 'upload_date': u'NA',
1966 'title': video_title,
1967 'stitle': simple_title,
1968 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' appears twice in this dict literal; the later
# (non-decoded) value wins -- one of the two lines is likely redundant.
1969 'thumbnail': video_thumbnail.decode('utf-8'),
1970 'description': video_description,
1971 'thumbnail': video_thumbnail,
1974 except UnavailableVideoError:
1975 self._downloader.trouble(u'\nERROR: unable to download video')
1978 class VimeoIE(InfoExtractor):
1979 """Information extractor for vimeo.com."""
1981 # _VALID_URL matches Vimeo URLs
1982 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
# NOTE(review): the IE_NAME assignment is elided in this line-sampled view.
1985 def __init__(self, downloader=None):
1986 InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report the video webpage download."""
    message = u'[vimeo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report the information-extraction step."""
    message = u'[vimeo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1996 def _real_initialize(self):
# Extract a Vimeo clip via the moogaloop XML config, including the signed
# play URL.  NOTE(review): line-sampled view -- 'try:', 'return' and
# 'if mobj is None:' guard lines are elided.
1999 def _real_extract(self, url, new_video=True):
2000 # Extract ID from URL
2001 mobj = re.match(self._VALID_URL, url)
2003 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2006 # At this point we have a new video
2007 self._downloader.increment_downloads()
2008 video_id = mobj.group(1)
2010 # Retrieve video webpage to extract further information
2011 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2013 self.report_download_webpage(video_id)
2014 webpage = urllib2.urlopen(request).read()
2015 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2019 # Now we begin extracting as much information as we can from what we
2020 # retrieved. First we extract the information common to all extractors,
2021 # and latter we extract those that are Vimeo specific.
2022 self.report_extraction(video_id)
2025 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2027 self._downloader.trouble(u'ERROR: unable to extract video title')
2029 video_title = mobj.group(1).decode('utf-8')
2030 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2033 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2035 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2037 video_uploader = mobj.group(1).decode('utf-8')
2039 # Extract video thumbnail
2040 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2042 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2044 video_thumbnail = mobj.group(1).decode('utf-8')
2046 # # Extract video description
2047 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2049 # self._downloader.trouble(u'ERROR: unable to extract video description')
2051 # video_description = mobj.group(1).decode('utf-8')
2052 # if not video_description: video_description = 'No description available.'
# NOTE(review): hard-coded placeholder while the real description
# extraction above stays commented out.
2053 video_description = 'Foo.'
2055 # Vimeo specific: extract request signature
2056 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2058 self._downloader.trouble(u'ERROR: unable to extract request signature')
2060 sig = mobj.group(1).decode('utf-8')
2062 # Vimeo specific: Extract request signature expiration
2063 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2065 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2067 sig_exp = mobj.group(1).decode('utf-8')
2069 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2072 # Process video information
2073 self._downloader.process_info({
2074 'id': video_id.decode('utf-8'),
2076 'uploader': video_uploader,
2077 'upload_date': u'NA',
2078 'title': video_title,
2079 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' each appear twice in this
# dict literal; the later value wins -- likely leftover duplicates.
2081 'thumbnail': video_thumbnail.decode('utf-8'),
2082 'description': video_description,
2083 'thumbnail': video_thumbnail,
2084 'description': video_description,
2087 except UnavailableVideoError:
2088 self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
	"""Generic last-resort information extractor.

	Used when no site-specific extractor claims the URL: downloads the
	page and heuristically searches it for an embedded media URL (a JW
	Player/SWFObject flashvars entry, or a generic file=/source=
	parameter).
	NOTE(review): several guard lines (try:/if mobj is None:/return)
	appear elided from this view of the file.
	"""

	IE_NAME = u'generic'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		# Warn loudly: this extractor is a best-effort fallback.
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No login/session setup required for arbitrary pages.
		# (body elided in this view)

	def _real_extract(self, url):
		# At this point we have a new video
		self._downloader.increment_downloads()

		# Provisional id: last path component of the URL (refined below
		# once the real media URL is known).
		video_id = url.split('/')[-1]
		request = urllib2.Request(url)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
		except ValueError, err:
			# since this is the last-resort InfoExtractor, if
			# this error is thrown, it'll be thrown here
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		self.report_extraction(video_id)
		# Start with something easy: JW Player in SWFObject
		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
		# Broaden the search a little bit
		mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# It's possible that one of the regexes
		# matched, but returned an empty group:
		if mobj.group(1) is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		video_url = urllib.unquote(mobj.group(1))
		# Re-derive the id/extension from the media URL's basename.
		video_id = os.path.basename(video_url)

		# here's a fun little line of code for you:
		video_extension = os.path.splitext(video_id)[1][1:]
		video_id = os.path.splitext(video_id)[0]

		# it's tempting to parse this further, but you would
		# have to take into account all the variations like
		# Video Title - Site Name
		# Site Name | Video Title
		# Video Title - Tagline | Site Name
		# and so on and so forth; it's just not practical
		mobj = re.search(r'<title>(.*)</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')

		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# video uploader is domain name
		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
		self._downloader.trouble(u'ERROR: unable to extract title')

		video_uploader = mobj.group(1).decode('utf-8')

		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader,
			'upload_date': u'NA',
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries.

	Handles pseudo-URLs of the form ytsearch:QUERY, ytsearchN:QUERY or
	ytsearchall:QUERY; scrapes result pages and delegates each found
	video to the wrapped YoutubeIE instance.
	NOTE(review): several guard lines (try:/if/return/while) appear
	elided from this view of the file.
	"""

	_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	# Matches one result link on a search results page.
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_max_youtube_results = 1000
	IE_NAME = u'youtube:search'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Actual extraction is delegated to this YoutubeIE instance.
		self._youtube_ie = youtube_ie

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		mobj = re.match(self._VALID_URL, query)
		self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		# prefix is '', 'all' or a decimal result count.
		prefix, query = query.split(':')
		query = query.encode('utf-8')
		self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
		self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
		elif n > self._max_youtube_results:
			# Cap the request at the service maximum.
			self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
			n = self._max_youtube_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		# Tracks ids already collected so duplicates are skipped.
		already_seen = set()
		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			# Pull the v= value out of the matched href attribute.
			video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
				if len(video_ids) == n:
					# Specified n videos reached
					for id in video_ids:
						self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

		# No "Next" link: this was the last results page.
		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			for id in video_ids:
				self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

		pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries.

	Handles gvsearch:QUERY, gvsearchN:QUERY and gvsearchall:QUERY;
	structured identically to YoutubeSearchIE but delegates to a
	GoogleIE instance.
	NOTE(review): several guard lines (try:/if/return/while) appear
	elided from this view of the file.
	"""

	_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
	_max_google_results = 1000
	IE_NAME = u'video.google:search'

	def __init__(self, google_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Actual extraction is delegated to this GoogleIE instance.
		self._google_ie = google_ie

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._google_ie.initialize()

	def _real_extract(self, query):
		mobj = re.match(self._VALID_URL, query)
		self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		# prefix is '', 'all' or a decimal result count.
		prefix, query = query.split(':')
		query = query.encode('utf-8')
		self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_google_results)
		self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
		elif n > self._max_google_results:
			# Cap the request at the service maximum.
			self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
			n = self._max_google_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		# Tracks ids already collected so duplicates are skipped.
		already_seen = set()
		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			video_id = mobj.group(1)
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
				if len(video_ids) == n:
					# Specified n videos reached
					for id in video_ids:
						self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

		# No "Next" link: this was the last results page.
		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			for id in video_ids:
				self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

		pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries.

	Handles yvsearch:QUERY, yvsearchN:QUERY and yvsearchall:QUERY;
	structured identically to YoutubeSearchIE but delegates to a
	YahooIE instance.
	NOTE(review): several guard lines (try:/if/return/while) appear
	elided from this view of the file.
	"""

	_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	_MORE_PAGES_INDICATOR = r'\s*Next'
	_max_yahoo_results = 1000
	IE_NAME = u'video.yahoo:search'

	def __init__(self, yahoo_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Actual extraction is delegated to this YahooIE instance.
		self._yahoo_ie = yahoo_ie

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._yahoo_ie.initialize()

	def _real_extract(self, query):
		mobj = re.match(self._VALID_URL, query)
		self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		# prefix is '', 'all' or a decimal result count.
		prefix, query = query.split(':')
		query = query.encode('utf-8')
		self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
		self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
		elif n > self._max_yahoo_results:
			# Cap the request at the service maximum.
			self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
			n = self._max_yahoo_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		# Tracks ids already collected so duplicates are skipped.
		already_seen = set()
		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			video_id = mobj.group(1)
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
				if len(video_ids) == n:
					# Specified n videos reached
					for id in video_ids:
						self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

		# No "Next" link: this was the last results page.
		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			for id in video_ids:
				self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

		pagenum = pagenum + 1
2458 class YoutubePlaylistIE(InfoExtractor):
2459 """Information Extractor for YouTube playlists."""
2461 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2462 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2463 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2464 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2466 IE_NAME = u'youtube:playlist'
2468 def __init__(self, youtube_ie, downloader=None):
2469 InfoExtractor.__init__(self, downloader)
2470 self._youtube_ie = youtube_ie
2472 def report_download_page(self, playlist_id, pagenum):
2473 """Report attempt to download playlist page with given number."""
2474 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2476 def _real_initialize(self):
2477 self._youtube_ie.initialize()
2479 def _real_extract(self, url):
2480 # Extract playlist id
2481 mobj = re.match(self._VALID_URL, url)
2483 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2487 if mobj.group(3) is not None:
2488 self._youtube_ie.extract(mobj.group(3))
2491 # Download playlist pages
2492 # prefix is 'p' as default for playlists but there are other types that need extra care
2493 playlist_prefix = mobj.group(1)
2494 if playlist_prefix == 'a':
2495 playlist_access = 'artist'
2497 playlist_prefix = 'p'
2498 playlist_access = 'view_play_list'
2499 playlist_id = mobj.group(2)
2504 self.report_download_page(playlist_id, pagenum)
2505 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2507 page = urllib2.urlopen(request).read()
2508 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2509 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2512 # Extract video identifiers
2514 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2515 if mobj.group(1) not in ids_in_page:
2516 ids_in_page.append(mobj.group(1))
2517 video_ids.extend(ids_in_page)
2519 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2521 pagenum = pagenum + 1
2523 playliststart = self._downloader.params.get('playliststart', 1) - 1
2524 playlistend = self._downloader.params.get('playlistend', -1)
2525 video_ids = video_ids[playliststart:playlistend]
2527 for id in video_ids:
2528 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users.

	Enumerates a user's uploads via the GData API (paged, 50 ids per
	request) and delegates each video to the wrapped YoutubeIE.
	NOTE(review): several guard lines (try:/if/return/while) appear
	elided from this view of the file.
	"""

	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	# GData caps results per request; we page with start-index.
	_GDATA_PAGE_SIZE = 50
	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	IE_NAME = u'youtube:user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Actual per-video extraction is delegated to this YoutubeIE.
		self._youtube_ie = youtube_ie

	def report_download_page(self, username, start_index):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
				(username, start_index, start_index + self._GDATA_PAGE_SIZE))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		username = mobj.group(1)

		# Download video ids using YouTube Data API. Result size per
		# query is limited (currently to 50 videos) so we need to query
		# page by page until there are no video ids - it means we got
		# GData uses 1-based start indices.
		start_index = pagenum * self._GDATA_PAGE_SIZE + 1
		self.report_download_page(username, start_index)

		request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers

		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			if mobj.group(1) not in ids_in_page:
				ids_in_page.append(mobj.group(1))

		video_ids.extend(ids_in_page)

		# A little optimization - if current page is not
		# "full", ie. does not contain PAGE_SIZE video ids then
		# we can assume that this page is the last one - there
		# are no more ids on further pages - no need to query

		if len(ids_in_page) < self._GDATA_PAGE_SIZE:

		all_ids_count = len(video_ids)
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)

		# playlistend of -1 means "no upper bound"; slicing with -1
		# would drop the last id, so it is special-cased here.
		if playlistend == -1:
			video_ids = video_ids[playliststart:]
		else:
			video_ids = video_ids[playliststart:playlistend]

		self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
				(username, all_ids_count, len(video_ids)))

		for video_id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com

	Simulates pressing the 'Free download' button, then scrapes the
	real file URL (and any restriction message) from the result page.
	NOTE(review): several guard lines (try:/if/return) appear elided
	from this view of the file.
	"""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
	IE_NAME = u'DepositFiles'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_initialize(self):
		# No login/session setup required.
		# (body elided in this view)

	def _real_extract(self, url):
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		# (POSTing gateway_result=1 mimics the button click).
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
		self.report_download_webpage(file_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				# Collapse whitespace in the site's restriction text.
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

		file_url = mobj.group(1)
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')

		file_title = mobj.group(1).decode('utf-8')

		# Process file information
		self._downloader.process_info({
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'upload_date': u'NA',
			'title': file_title,
			'stitle': file_title,
			'ext': file_extension.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
	"""Information Extractor for Facebook

	Logs in (credentials from options or .netrc), downloads the video
	page, and parses title/owner/date/thumbnail plus the per-format
	media URLs out of the page's JavaScript.
	NOTE(review): several guard lines (try:/if/return) and some dict
	entries appear elided from this view of the file.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
	_NETRC_MACHINE = 'facebook'
	# Ordered best-first; format selection below relies on this order.
	_available_formats = ['highqual', 'lowqual']
	_video_extensions = {
		# (entries elided in this view)

	IE_NAME = u'facebook'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def _reporter(self, message):
		"""Add header and report message."""
		self._downloader.to_screen(u'[facebook] %s' % message)

	def report_login(self):
		"""Report attempt to log in."""
		self._reporter(u'Logging in')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._reporter(u'%s: Downloading video webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._reporter(u'%s: Extracting video information' % video_id)

	def _parse_page(self, video_webpage):
		"""Extract video information from page"""
		# Map of info-dict key -> regex that captures it in the page.
		data = {'title': r'class="video_title datawrap">(.*?)</',
			'description': r'<div class="datawrap">(.*?)</div>',
			'owner': r'\("video_owner_name", "(.*?)"\)',
			'upload_date': r'data-date="(.*?)"',
			'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',

		# Only keys whose regex matched end up in video_info.
		for piece in data.keys():
			mobj = re.search(data[piece], video_webpage)
			if mobj is not None:
				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

		for fmt in self._available_formats:
			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
			if mobj is not None:
				# URL is in a Javascript segment inside an escaped Unicode format within
				# the generally utf-8 page
				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
		video_info['video_urls'] = video_urls

	def _real_initialize(self):
		"""Log in to Facebook if credentials are available."""
		if self._downloader is None:

		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			useremail = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			if info is not None:
				raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		# No credentials: skip login (public videos may still work).
		if useremail is None:

		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		login_results = urllib2.urlopen(request).read()
		# A login <form> in the response means the login did not stick.
		if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
			self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group('ID')

		self.report_video_webpage_download(video_id)
		request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
		page = urllib2.urlopen(request)
		video_webpage = page.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Start extracting information
		self.report_information_extraction(video_id)

		# Extract information
		video_info = self._parse_page(video_webpage)

		# uploader
		if 'owner' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = video_info['owner']

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = video_info['title']
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail is optional; fall back to an empty string
		if 'thumbnail' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		video_thumbnail = video_info['thumbnail']

		# upload date: parse the page's RFC-2822-style date if present
		if 'upload_date' in video_info:
			upload_time = video_info['upload_date']
			timetuple = email.utils.parsedate_tz(upload_time)
			if timetuple is not None:
				upload_date = time.strftime('%Y%m%d', timetuple[0:9])

		video_description = video_info.get('description', 'No description available.')

		url_map = video_info['video_urls']
		if len(url_map.keys()) > 0:
			# Decide which formats to download
			req_format = self._downloader.params.get('format', None)
			format_limit = self._downloader.params.get('format_limit', None)

			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == 'worst':
				video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			video_extension = self._video_extensions.get(format_param, 'mp4')

			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_real_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': upload_date,
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail': video_thumbnail.decode('utf-8'),
				'description': video_description.decode('utf-8'),
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv

	Requests the JSON API view of the page (skin=json); if the server
	answers with the media itself (Content-Type video/*) treats it as a
	direct download, otherwise parses the JSON 'Post' record.
	NOTE(review): several guard lines (try:/if/return) and dict entries
	appear elided from this view of the file.
	"""

	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Captures the filename extension of a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _simplify_title(self, title):
		# Replace runs of non-alphanumerics with '_' and trim the ends.
		res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
		res = res.strip(ur'_')

	def _real_extract(self, url):
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# Ask the site for its JSON representation of the page.
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))

		urlh = urllib2.urlopen(request)
		if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
			basename = url.split('/')[-1]
			title,ext = os.path.splitext(basename)
			ext = ext.replace('.', '')
			self.report_direct_download(title)
			'stitle': self._simplify_title(title),
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

		if info is None: # Regular URL
			json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))

			json_data = json.loads(json_code)
			if 'Post' in json_data:
				data = json_data['Post']

			# datestamp is e.g. '08-25-11 10:34AM'; normalize to YYYYMMDD.
			upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
			video_url = data['media']['url']
			umobj = re.match(self._URL_EXT, video_url)
			raise ValueError('Can not determine filename extension')
			ext = umobj.group(1)

			'id': data['item_id'],
			'uploader': data['display_name'],
			'upload_date': upload_date,
			'title': data['title'],
			'stitle': self._simplify_title(data['title']),
			'format': data['media']['mimeType'],
			'thumbnail': data['thumbnailUrl'],
			'description': data['description'],
			'player_url': data['embedUrl']
			except (ValueError,KeyError), err:
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

		self._downloader.increment_downloads()

		self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3013 class MyVideoIE(InfoExtractor):
3014 """Information Extractor for myvideo.de."""
3016 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3017 IE_NAME = u'myvideo'
3019 def __init__(self, downloader=None):
3020 InfoExtractor.__init__(self, downloader)
3022 def report_download_webpage(self, video_id):
3023 """Report webpage download."""
3024 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3026 def report_extraction(self, video_id):
3027 """Report information extraction."""
3028 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3030 def _real_initialize(self):
3033 def _real_extract(self,url):
3034 mobj = re.match(self._VALID_URL, url)
3036 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3039 video_id = mobj.group(1)
3040 simple_title = mobj.group(2).decode('utf-8')
3041 # should actually not be necessary
3042 simple_title = sanitize_title(simple_title)
3043 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3046 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3048 self.report_download_webpage(video_id)
3049 webpage = urllib2.urlopen(request).read()
3050 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3051 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3054 self.report_extraction(video_id)
3055 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3058 self._downloader.trouble(u'ERROR: unable to extract media URL')
3060 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3062 mobj = re.search('<title>([^<]+)</title>', webpage)
3064 self._downloader.trouble(u'ERROR: unable to extract title')
3067 video_title = mobj.group(1)
3068 video_title = sanitize_title(video_title)
3071 self._downloader.process_info({
3075 'upload_date': u'NA',
3076 'title': video_title,
3077 'stitle': simple_title,
3082 except UnavailableVideoError:
3083 self._downloader.trouble(u'\nERROR: Unable to download video')
3085 class ComedyCentralIE(InfoExtractor):
3086 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a ":shortname" alias (e.g. ":tds", ":colbert") or a
# full-episodes URL with named groups 'showname' and 'episode'.
3088 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3089 IE_NAME = u'comedycentral'
3091 def report_extraction(self, episode_id):
3092 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3094 def report_config_download(self, episode_id):
3095 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3097 def report_index_download(self, episode_id):
3098 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3100 def report_player_url(self, episode_id):
3101 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Reduce a title to the restricted "simple title" character set.
3103 def _simplify_title(self, title):
3104 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3105 res = res.strip(ur'_')
3108 def _real_extract(self, url):
3109 mobj = re.match(self._VALID_URL, url)
3111 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname aliases are rewritten to the canonical full-episodes URL and
# re-matched so the named groups used below are populated.
3114 if mobj.group('shortname'):
3115 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3116 url = 'http://www.thedailyshow.com/full-episodes/'
3118 url = 'http://www.colbertnation.com/full-episodes/'
3119 mobj = re.match(self._VALID_URL, url)
3120 assert mobj is not None
3122 dlNewest = not mobj.group('episode')
3124 epTitle = mobj.group('showname')
3126 epTitle = mobj.group('episode')
3128 req = urllib2.Request(url)
3129 self.report_extraction(epTitle)
3131 htmlHandle = urllib2.urlopen(req)
3132 html = htmlHandle.read()
3133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3134 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# geturl() reflects any HTTP redirect (e.g. "newest episode" resolving to a
# concrete episode URL), so re-match to pick up the real episode slug.
3137 url = htmlHandle.geturl()
3138 mobj = re.match(self._VALID_URL, url)
3140 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3142 if mobj.group('episode') == '':
3143 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3145 epTitle = mobj.group('episode')
# The Flash <param> embeds both the player URL and the mtvnservices media URI.
3147 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3148 if len(mMovieParams) == 0:
3149 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3152 playerUrl_raw = mMovieParams[0][0]
3153 self.report_player_url(epTitle)
# Resolve the player URL through redirects; needed later for rtmp downloads.
3155 urlHandle = urllib2.urlopen(playerUrl_raw)
3156 playerUrl = urlHandle.geturl()
3157 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3158 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3161 uri = mMovieParams[0][1]
3162 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3163 self.report_index_download(epTitle)
3165 indexXml = urllib2.urlopen(indexUrl).read()
3166 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3167 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One RSS <item> per media segment ("act") of the episode; each is downloaded
# separately via its own mediagen config document.
3170 idoc = xml.etree.ElementTree.fromstring(indexXml)
3171 itemEls = idoc.findall('.//item')
3172 for itemEl in itemEls:
3173 mediaId = itemEl.findall('./guid')[0].text
3174 shortMediaId = mediaId.split(':')[-1]
3175 showId = mediaId.split(':')[-2].replace('.com', '')
3176 officialTitle = itemEl.findall('./title')[0].text
3177 officialDate = itemEl.findall('./pubDate')[0].text
3179 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3180 urllib.urlencode({'uri': mediaId}))
3181 configReq = urllib2.Request(configUrl)
3182 self.report_config_download(epTitle)
3184 configXml = urllib2.urlopen(configReq).read()
3185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3186 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3189 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, url) pairs for every available rendition.
3191 for rendition in cdoc.findall('.//rendition'):
3192 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3196 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3199 # For now, just pick the highest bitrate
3200 format,video_url = turls[-1]
3202 self._downloader.increment_downloads()
3204 effTitle = showId + '-' + epTitle
3209 'upload_date': officialDate,
3211 'stitle': self._simplify_title(effTitle),
3215 'description': officialTitle,
3216 'player_url': playerUrl
3220 self._downloader.process_info(info)
3221 except UnavailableVideoError, err:
3222 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3226 class EscapistIE(InfoExtractor):
3227 """Information extractor for The Escapist """
3229 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3230 IE_NAME = u'escapist'
3232 def report_extraction(self, showName):
3233 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3235 def report_config_download(self, showName):
3236 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Reduce a title to the restricted "simple title" character set.
3238 def _simplify_title(self, title):
3239 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3240 res = res.strip(ur'_')
3243 def _real_extract(self, url):
3244 htmlParser = HTMLParser.HTMLParser()
3246 mobj = re.match(self._VALID_URL, url)
3248 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3250 showName = mobj.group('showname')
3251 videoId = mobj.group('episode')
3253 self.report_extraction(showName)
3255 webPage = urllib2.urlopen(url).read()
3256 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3257 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Metadata is scraped from <meta> tags and entity-unescaped via HTMLParser.
# NOTE(review): these *Match results are used without a None check -- a page
# missing any of these tags would raise AttributeError on .group(1).
3260 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3261 description = htmlParser.unescape(descMatch.group(1))
3262 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3263 imgUrl = htmlParser.unescape(imgMatch.group(1))
3264 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3265 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL's "config=" query parameter points at the JSON-ish config.
3266 configUrlMatch = re.search('config=(.*)$', playerUrl)
3267 configUrl = urllib2.unquote(configUrlMatch.group(1))
3269 self.report_config_download(showName)
3271 configJSON = urllib2.urlopen(configUrl).read()
3272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3273 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3276 # Technically, it's JavaScript, not JSON
# NOTE(review): naive quote swap -- breaks if the config text itself contains
# apostrophes or embedded double quotes.
3277 configJSON = configJSON.replace("'", '"')
3280 config = json.loads(configJSON)
3281 except (ValueError,), err:
3282 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3285 playlist = config['playlist']
# Playlist entry [1] is taken as the actual video; entry [0] is presumably a
# lead-in item -- TODO confirm against a live config document.
3286 videoUrl = playlist[1]['url']
3288 self._downloader.increment_downloads()
3292 'uploader': showName,
3293 'upload_date': None,
3295 'stitle': self._simplify_title(showName),
3298 'thumbnail': imgUrl,
3299 'description': description,
3300 'player_url': playerUrl,
3304 self._downloader.process_info(info)
3305 except UnavailableVideoError, err:
3306 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3309 class CollegeHumorIE(InfoExtractor):
3310 """Information extractor for collegehumor.com"""
3312 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3313 IE_NAME = u'collegehumor'
3315 def report_webpage(self, video_id):
3316 """Report information extraction."""
3317 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3319 def report_extraction(self, video_id):
3320 """Report information extraction."""
3321 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Reduce a title to the restricted "simple title" character set.
3323 def _simplify_title(self, title):
3324 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3325 res = res.strip(ur'_')
3328 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used in the visible code.
3329 htmlParser = HTMLParser.HTMLParser()
3331 mobj = re.match(self._VALID_URL, url)
3333 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3335 video_id = mobj.group('videoid')
3337 self.report_webpage(video_id)
3338 request = urllib2.Request(url)
3340 webpage = urllib2.urlopen(request).read()
3341 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3342 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal id distinct from the public URL id; the
# metadata service below is keyed on the internal one.
3345 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3347 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3349 internal_video_id = m.group('internalvideoid')
3353 'internal_id': internal_video_id,
3356 self.report_extraction(video_id)
# The "moogaloop" endpoint serves per-video metadata as XML.
3357 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3359 metaXml = urllib2.urlopen(xmlUrl).read()
3360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3361 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3364 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3366 videoNode = mdoc.findall('./video')[0]
3367 info['description'] = videoNode.findall('./description')[0].text
3368 info['title'] = videoNode.findall('./caption')[0].text
3369 info['stitle'] = self._simplify_title(info['title'])
3370 info['url'] = videoNode.findall('./file')[0].text
3371 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# The file extension of the media URL doubles as the format label.
3372 info['ext'] = info['url'].rpartition('.')[2]
3373 info['format'] = info['ext']
3375 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3378 self._downloader.increment_downloads()
3381 self._downloader.process_info(info)
3382 except UnavailableVideoError, err:
3383 self._downloader.trouble(u'\nERROR: unable to download video')
3386 class XVideosIE(InfoExtractor):
3387 """Information extractor for xvideos.com"""
3389 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3390 IE_NAME = u'xvideos'
3392 def report_webpage(self, video_id):
3393 """Report information extraction."""
3394 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3396 def report_extraction(self, video_id):
3397 """Report information extraction."""
3398 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Reduce a title to the restricted "simple title" character set.
3400 def _simplify_title(self, title):
3401 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3402 res = res.strip(ur'_')
3405 def _real_extract(self, url):
# NOTE(review): htmlParser is created but not used in the visible code.
3406 htmlParser = HTMLParser.HTMLParser()
3408 mobj = re.match(self._VALID_URL, url)
3410 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3412 video_id = mobj.group(1).decode('utf-8')
3414 self.report_webpage(video_id)
# Re-fetch via the canonical URL built from the extracted id.
3416 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3418 webpage = urllib2.urlopen(request).read()
3419 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3420 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3423 self.report_extraction(video_id)
# The flv URL is percent-encoded inside the player query string.
3427 mobj = re.search(r'flv_url=(.+?)&', webpage)
3429 self._downloader.trouble(u'ERROR: unable to extract video url')
3431 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3435 mobj = re.search(r'<title>(.*?)</title>', webpage)
3437 self._downloader.trouble(u'ERROR: unable to extract video title')
3439 video_title = mobj.group(1).decode('utf-8')
3442 # Extract video thumbnail
# NOTE(review): the dots in "xvideos.com" and before "jpg" are unescaped
# regex metacharacters; harmless in practice but imprecise.
3443 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3445 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3447 video_thumbnail = mobj.group(1).decode('utf-8')
3451 self._downloader.increment_downloads()
3456 'upload_date': None,
3457 'title': video_title,
3458 'stitle': self._simplify_title(video_title),
3461 'thumbnail': video_thumbnail,
3462 'description': None,
3467 self._downloader.process_info(info)
3468 except UnavailableVideoError, err:
3469 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3472 class PostProcessor(object):
3473 """Post Processor class.
3475 PostProcessor objects can be added to downloaders with their
3476 add_post_processor() method. When the downloader has finished a
3477 successful download, it will take its internal chain of PostProcessors
3478 and start calling the run() method on each one of them, first with
3479 an initial argument and then with the returned value of the previous
3482 The chain will be stopped if one of them ever returns None or the end
3483 of the chain is reached.
3485 PostProcessor objects follow a "mutual registration" process similar
3486 to InfoExtractor objects.
# Downloader reference used by subclasses for screen/stderr reporting.
3491 def __init__(self, downloader=None):
3492 self._downloader = downloader
3494 def set_downloader(self, downloader):
3495 """Sets the downloader for this PP."""
3496 self._downloader = downloader
3498 def run(self, information):
3499 """Run the PostProcessor.
3501 The "information" argument is a dictionary like the ones
3502 composed by InfoExtractors. The only difference is that this
3503 one has an extra field called "filepath" that points to the
3506 When this method returns None, the postprocessing chain is
3507 stopped. However, this method may return an information
3508 dictionary that will be passed to the next postprocessing
3509 object in the chain. It can be the one it received after
3510 changing some fields.
3512 In addition, this method may raise a PostProcessingError
3513 exception that will be taken into account by the downloader
# Base-class implementation: pass the info dict through unchanged.
3516 return information # by default, do nothing
3519 class FFmpegExtractAudioPP(PostProcessor):
# Post-processor that converts a downloaded video into an audio-only file
# using the external ffmpeg/ffprobe binaries. preferredcodec is one of
# 'best', 'aac', 'mp3', 'vorbis'; preferredquality is an ffmpeg -ab value.
3521 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3522 PostProcessor.__init__(self, downloader)
3523 if preferredcodec is None:
3524 preferredcodec = 'best'
3525 self._preferredcodec = preferredcodec
3526 self._preferredquality = preferredquality
3527 self._keepvideo = keepvideo
# Probe the source file's audio codec with ffprobe; returns the codec name
# or None on failure. No self parameter -- presumably decorated
# @staticmethod on a line not shown here; TODO confirm.
3530 def get_audio_codec(path):
3532 cmd = ['ffprobe', '-show_streams', '--', path]
3533 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3534 output = handle.communicate()[0]
3535 if handle.wait() != 0:
3537 except (IOError, OSError):
# Scan ffprobe's key=value output; accept codec_name only once the
# audio stream's codec_type line confirms it belongs to audio.
3540 for line in output.split('\n'):
3541 if line.startswith('codec_name='):
3542 audio_codec = line.split('=')[1].strip()
3543 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run the actual ffmpeg transcode/remux; discards ffmpeg's console output.
3548 def run_ffmpeg(path, out_path, codec, more_opts):
3550 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3551 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3553 except (IOError, OSError):
3556 def run(self, information):
3557 path = information['filepath']
3559 filecodec = self.get_audio_codec(path)
3560 if filecodec is None:
3561 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# When the source codec already matches the request (or 'best'), copy the
# stream losslessly instead of re-encoding.
3565 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3566 if filecodec in ['aac', 'mp3', 'vorbis']:
3567 # Lossless if possible
3569 extension = filecodec
3570 if filecodec == 'aac':
3571 more_opts = ['-f', 'adts']
3572 if filecodec == 'vorbis':
3576 acodec = 'libmp3lame'
3579 if self._preferredquality is not None:
3580 more_opts += ['-ab', self._preferredquality]
3582 # We convert the audio (lossy)
3583 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3584 extension = self._preferredcodec
3586 if self._preferredquality is not None:
3587 more_opts += ['-ab', self._preferredquality]
3588 if self._preferredcodec == 'aac':
3589 more_opts += ['-f', 'adts']
3590 if self._preferredcodec == 'vorbis':
# Output path: same basename, new audio extension.
3593 (prefix, ext) = os.path.splitext(path)
3594 new_path = prefix + '.' + extension
3595 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3596 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3599 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3602 # Try to update the date time for extracted audio file.
3603 if information.get('filetime') is not None:
3605 os.utime(new_path, (time.time(), information['filetime']))
3607 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Remove the source video unless the user asked to keep it.
3609 if not self._keepvideo:
3612 except (IOError, OSError):
3613 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Hand the updated path to the next post-processor in the chain.
3616 information['filepath'] = new_path
3620 def updateSelf(downloader, filename):
3621 ''' Update the program file with the latest version from the repository '''
3622 # Note: downloader only used for options
# Refuse to run when the script file itself is not writable.
3623 if not os.access(filename, os.W_OK):
3624 sys.exit('ERROR: no write permissions on %s' % filename)
3626 downloader.to_screen('Updating to latest version...')
3630 urlh = urllib.urlopen(UPDATE_URL)
3631 newcontent = urlh.read()
# Parse the remote __version__ string to detect "already up to date".
3633 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3634 if vmatch is not None and vmatch.group(1) == __version__:
3635 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3639 except (IOError, OSError), err:
3640 sys.exit('ERROR: unable to download latest version')
# Overwrite the current script file in place with the downloaded content.
3643 outf = open(filename, 'wb')
3645 outf.write(newcontent)
3648 except (IOError, OSError), err:
3649 sys.exit('ERROR: unable to overwrite current version')
3651 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
# Interior of parseOpts() (the enclosing def is above this excerpt): builds
# the optparse parser, all option groups, and returns (parser, opts, args).
3658 def _format_option_string(option):
3659 ''' ('-o', '--option') -> -o, --format METAVAR'''
3663 if option._short_opts: opts.append(option._short_opts[0])
3664 if option._long_opts: opts.append(option._long_opts[0])
3665 if len(opts) > 1: opts.insert(1, ', ')
3667 if option.takes_value(): opts.append(' %s' % option.metavar)
3669 return "".join(opts)
# Best-effort terminal width detection: $COLUMNS first, then `stty size`.
3671 def _find_term_columns():
3672 columns = os.environ.get('COLUMNS', None)
3677 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3678 out,err = sp.communicate()
3679 return int(out.split()[1])
3685 max_help_position = 80
3687 # No need to wrap help messages if we're on a wide console
3688 columns = _find_term_columns()
3689 if columns: max_width = columns
3691 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3692 fmt.format_option_strings = _format_option_string
3695 'version' : __version__,
3697 'usage' : '%prog [options] url [url...]',
3698 'conflict_handler' : 'resolve',
3701 parser = optparse.OptionParser(**kw)
# One OptionGroup per functional area; populated below, registered at the end.
3704 general = optparse.OptionGroup(parser, 'General Options')
3705 selection = optparse.OptionGroup(parser, 'Video Selection')
3706 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3707 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3708 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3709 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3710 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3712 general.add_option('-h', '--help',
3713 action='help', help='print this help text and exit')
3714 general.add_option('-v', '--version',
3715 action='version', help='print program version and exit')
3716 general.add_option('-U', '--update',
3717 action='store_true', dest='update_self', help='update this program to latest version')
3718 general.add_option('-i', '--ignore-errors',
3719 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3720 general.add_option('-r', '--rate-limit',
3721 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3722 general.add_option('-R', '--retries',
3723 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3724 general.add_option('--dump-user-agent',
3725 action='store_true', dest='dump_user_agent',
3726 help='display the current browser identification', default=False)
3727 general.add_option('--list-extractors',
3728 action='store_true', dest='list_extractors',
3729 help='List all supported extractors and the URLs they would handle', default=False)
3731 selection.add_option('--playlist-start',
3732 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3733 selection.add_option('--playlist-end',
3734 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3735 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3736 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3738 authentication.add_option('-u', '--username',
3739 dest='username', metavar='USERNAME', help='account username')
3740 authentication.add_option('-p', '--password',
3741 dest='password', metavar='PASSWORD', help='account password')
3742 authentication.add_option('-n', '--netrc',
3743 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3746 video_format.add_option('-f', '--format',
3747 action='store', dest='format', metavar='FORMAT', help='video format code')
3748 video_format.add_option('--all-formats',
3749 action='store_const', dest='format', help='download all available video formats', const='all')
3750 video_format.add_option('--max-quality',
3751 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3752 video_format.add_option('-F', '--list-formats',
3753 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
3756 verbosity.add_option('-q', '--quiet',
3757 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3758 verbosity.add_option('-s', '--simulate',
3759 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3760 verbosity.add_option('--skip-download',
3761 action='store_true', dest='skip_download', help='do not download the video', default=False)
3762 verbosity.add_option('-g', '--get-url',
3763 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3764 verbosity.add_option('-e', '--get-title',
3765 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3766 verbosity.add_option('--get-thumbnail',
3767 action='store_true', dest='getthumbnail',
3768 help='simulate, quiet but print thumbnail URL', default=False)
3769 verbosity.add_option('--get-description',
3770 action='store_true', dest='getdescription',
3771 help='simulate, quiet but print video description', default=False)
3772 verbosity.add_option('--get-filename',
3773 action='store_true', dest='getfilename',
3774 help='simulate, quiet but print output filename', default=False)
3775 verbosity.add_option('--get-format',
3776 action='store_true', dest='getformat',
3777 help='simulate, quiet but print output format', default=False)
3778 verbosity.add_option('--no-progress',
3779 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3780 verbosity.add_option('--console-title',
3781 action='store_true', dest='consoletitle',
3782 help='display progress in console titlebar', default=False)
3785 filesystem.add_option('-t', '--title',
3786 action='store_true', dest='usetitle', help='use title in file name', default=False)
3787 filesystem.add_option('-l', '--literal',
3788 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3789 filesystem.add_option('-A', '--auto-number',
3790 action='store_true', dest='autonumber',
3791 help='number downloaded files starting from 00000', default=False)
3792 filesystem.add_option('-o', '--output',
3793 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
3794 filesystem.add_option('-a', '--batch-file',
3795 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3796 filesystem.add_option('-w', '--no-overwrites',
3797 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3798 filesystem.add_option('-c', '--continue',
3799 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3800 filesystem.add_option('--no-continue',
3801 action='store_false', dest='continue_dl',
3802 help='do not resume partially downloaded files (restart from beginning)')
3803 filesystem.add_option('--cookies',
3804 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
3805 filesystem.add_option('--no-part',
3806 action='store_true', dest='nopart', help='do not use .part files', default=False)
3807 filesystem.add_option('--no-mtime',
3808 action='store_false', dest='updatetime',
3809 help='do not use the Last-modified header to set the file modification time', default=True)
3810 filesystem.add_option('--write-description',
3811 action='store_true', dest='writedescription',
3812 help='write video description to a .description file', default=False)
3813 filesystem.add_option('--write-info-json',
3814 action='store_true', dest='writeinfojson',
3815 help='write video metadata to a .info.json file', default=False)
3818 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3819 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3820 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3821 help='"best", "aac", "vorbis" or "mp3"; best by default')
3822 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3823 help='ffmpeg audio bitrate specification, 128k by default')
3824 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3825 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Group registration order determines --help output order.
3828 parser.add_option_group(general)
3829 parser.add_option_group(selection)
3830 parser.add_option_group(filesystem)
3831 parser.add_option_group(verbosity)
3832 parser.add_option_group(video_format)
3833 parser.add_option_group(authentication)
3834 parser.add_option_group(postproc)
3836 opts, args = parser.parse_args()
3838 return parser, opts, args
3840 def gen_extractors():
3841 """ Return a list of an instance of every supported extractor.
3842 The order does matter; the first extractor matched is the one handling the URL.
# Base extractors shared (via "mutual registration") by the playlist/user/
# search wrappers instantiated below.
3844 youtube_ie = YoutubeIE()
3845 google_ie = GoogleIE()
3846 yahoo_ie = YahooIE()
3848 YoutubePlaylistIE(youtube_ie),
3849 YoutubeUserIE(youtube_ie),
3850 YoutubeSearchIE(youtube_ie),
3852 MetacafeIE(youtube_ie),
3855 GoogleSearchIE(google_ie),
3858 YahooSearchIE(yahoo_ie),
# Interior of the main entry function: parse options, validate them, build
# the FileDownloader, register extractors/post-processors, and download.
3873 parser, opts, args = parseOpts()
3875 # Open appropriate CookieJar
3876 if opts.cookiefile is None:
3877 jar = cookielib.CookieJar()
3880 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3881 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3883 except (IOError, OSError), err:
3884 sys.exit(u'ERROR: unable to open cookie file')
3887 if opts.dump_user_agent:
3888 print std_headers['User-Agent']
3891 # Batch file verification
3893 if opts.batchfile is not None:
3895 if opts.batchfile == '-':
3898 batchfd = open(opts.batchfile, 'r')
3899 batchurls = batchfd.readlines()
3900 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and lines starting with '#', '/' or ';' (comments).
3901 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3903 sys.exit(u'ERROR: batch file could not be read')
3904 all_urls = batchurls + args
3906 # General configuration
3907 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3908 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3909 urllib2.install_opener(opener)
3910 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3912 extractors = gen_extractors()
# --list-extractors: print each extractor and which given URLs it would take.
3914 if opts.list_extractors:
3915 for ie in extractors:
3917 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3918 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3919 for mu in matchedUrls:
3923 # Conflicting, missing and erroneous options
3924 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3925 parser.error(u'using .netrc conflicts with giving username/password')
3926 if opts.password is not None and opts.username is None:
3927 parser.error(u'account username missing')
3928 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3929 parser.error(u'using output template conflicts with using title, literal title or auto number')
3930 if opts.usetitle and opts.useliteral:
3931 parser.error(u'using title conflicts with using literal title')
# Prompt for the password interactively when only a username was given.
3932 if opts.username is not None and opts.password is None:
3933 opts.password = getpass.getpass(u'Type account password and press return:')
3934 if opts.ratelimit is not None:
3935 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3936 if numeric_limit is None:
3937 parser.error(u'invalid rate limit specified')
3938 opts.ratelimit = numeric_limit
3939 if opts.retries is not None:
3941 opts.retries = long(opts.retries)
3942 except (TypeError, ValueError), err:
3943 parser.error(u'invalid retry count specified')
3945 opts.playliststart = int(opts.playliststart)
3946 if opts.playliststart <= 0:
3947 raise ValueError(u'Playlist start must be positive')
3948 except (TypeError, ValueError), err:
3949 parser.error(u'invalid playlist start number specified')
3951 opts.playlistend = int(opts.playlistend)
# -1 means "until the end"; otherwise the end must be >= start and positive.
3952 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3953 raise ValueError(u'Playlist end must be greater than playlist start')
3954 except (TypeError, ValueError), err:
3955 parser.error(u'invalid playlist end number specified')
3956 if opts.extractaudio:
3957 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
3958 parser.error(u'invalid audio format specified')
3961 fd = FileDownloader({
3962 'usenetrc': opts.usenetrc,
3963 'username': opts.username,
3964 'password': opts.password,
# Any of the --get-* options implies quiet mode and skipping the download.
3965 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3966 'forceurl': opts.geturl,
3967 'forcetitle': opts.gettitle,
3968 'forcethumbnail': opts.getthumbnail,
3969 'forcedescription': opts.getdescription,
3970 'forcefilename': opts.getfilename,
3971 'forceformat': opts.getformat,
3972 'simulate': opts.simulate,
3973 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3974 'format': opts.format,
3975 'format_limit': opts.format_limit,
3976 'listformats': opts.listformats,
# Output template: explicit -o wins; otherwise pick a default based on the
# combination of --title/--literal/--auto-number/--all-formats flags.
3977 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3978 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3979 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3980 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3981 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3982 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3983 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3984 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3985 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3986 or u'%(id)s.%(ext)s'),
3987 'ignoreerrors': opts.ignoreerrors,
3988 'ratelimit': opts.ratelimit,
3989 'nooverwrites': opts.nooverwrites,
3990 'retries': opts.retries,
3991 'continuedl': opts.continue_dl,
3992 'noprogress': opts.noprogress,
3993 'playliststart': opts.playliststart,
3994 'playlistend': opts.playlistend,
# Writing the file to stdout ('-o -') moves all logging to stderr.
3995 'logtostderr': opts.outtmpl == '-',
3996 'consoletitle': opts.consoletitle,
3997 'nopart': opts.nopart,
3998 'updatetime': opts.updatetime,
3999 'writedescription': opts.writedescription,
4000 'writeinfojson': opts.writeinfojson,
4001 'matchtitle': opts.matchtitle,
4002 'rejecttitle': opts.rejecttitle,
4004 for extractor in extractors:
4005 fd.add_info_extractor(extractor)
4008 if opts.extractaudio:
4009 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# Self-update replaces the running script file (sys.argv[0]).
4012 if opts.update_self:
4013 updateSelf(fd, sys.argv[0])
4016 if len(all_urls) < 1:
4017 if not opts.update_self:
4018 parser.error(u'you must provide at least one URL')
4021 retcode = fd.download(all_urls)
4023 # Dump cookie jar if requested
4024 if opts.cookiefile is not None:
4027 except (IOError, OSError), err:
4028 sys.exit(u'ERROR: unable to save cookie jar')
4033 if __name__ == '__main__':
# Top-level dispatcher: translate known exceptions into friendly exit messages.
4036 except DownloadError:
4038 except SameFileError:
4039 sys.exit(u'ERROR: fixed output name but more than one file to download')
4040 except KeyboardInterrupt:
4041 sys.exit(u'\nERROR: Interrupted by user')
4043 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: