2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
96 def raiseError(msg, i):
97 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
98 def skipSpace(i, expectMore=True):
99 while i < len(s) and s[i] in ' \t\r\n':
103 raiseError('Premature end', i)
105 def decodeEscape(match):
121 return unichr(int(esc[1:5], 16))
122 if len(esc) == 5+6 and esc[5:7] == '\\u':
123 hi = int(esc[1:5], 16)
124 low = int(esc[7:11], 16)
125 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
126 raise ValueError('Unknown escape ' + str(esc))
133 while s[e-bslashes-1] == '\\':
135 if bslashes % 2 == 1:
139 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
140 stri = rexp.sub(decodeEscape, s[i:e])
146 if s[i] == '}': # Empty dictionary
150 raiseError('Expected a string object key', i)
151 i,key = parseString(i)
153 if i >= len(s) or s[i] != ':':
154 raiseError('Expected a colon', i)
161 raiseError('Expected comma or closing curly brace', i)
166 if s[i] == ']': # Empty array
171 i = skipSpace(i) # Raise exception if premature end
175 raiseError('Expected a comma or closing bracket', i)
177 def parseDiscrete(i):
178 for k,v in {'true': True, 'false': False, 'null': None}.items():
179 if s.startswith(k, i):
181 raiseError('Not a boolean (or null)', i)
183 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
185 raiseError('Not a number', i)
187 if '.' in nums or 'e' in nums or 'E' in nums:
188 return (i+len(nums), float(nums))
189 return (i+len(nums), int(nums))
190 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
193 i,res = CHARMAP.get(s[i], parseNumber)(i)
194 i = skipSpace(i, False)
198 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
201 def preferredencoding():
202 """Get preferred encoding.
204 Returns the best encoding scheme for the system, based on
205 locale.getpreferredencoding() and some further tweaks.
207 def yield_preferredencoding():
209 pref = locale.getpreferredencoding()
215 return yield_preferredencoding().next()
218 def htmlentity_transform(matchobj):
219 """Transforms an HTML entity to a Unicode character.
221 This function receives a match object and is intended to be used with
222 the re.sub() function.
224 entity = matchobj.group(1)
226 # Known non-numeric HTML entity
227 if entity in htmlentitydefs.name2codepoint:
228 return unichr(htmlentitydefs.name2codepoint[entity])
231 mobj = re.match(ur'(?u)#(x?\d+)', entity)
233 numstr = mobj.group(1)
234 if numstr.startswith(u'x'):
236 numstr = u'0%s' % numstr
239 return unichr(long(numstr, base))
241 # Unknown entity in name, return its literal representation
242 return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # First decode HTML entities (&amp;, &#39;, ...) into their characters.
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    # The OS path separator must not appear inside a single file name.
    return decoded.replace(unicode(os.sep), u'%')
251 def sanitize_open(filename, open_mode):
252 """Try to open the given filename, and slightly tweak it if this fails.
254 Attempts to open the given filename. If this fails, it tries to change
255 the filename slightly, step by step, until it's either able to open it
256 or it fails and raises a final exception, like the standard open()
259 It returns the tuple (stream, definitive_file_name).
263 if sys.platform == 'win32':
265 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
266 return (sys.stdout, filename)
267 stream = open(_encodeFilename(filename), open_mode)
268 return (stream, filename)
269 except (IOError, OSError), err:
270 # In case of error, try to remove win32 forbidden chars
271 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
273 # An exception here should be caught in the caller
274 stream = open(_encodeFilename(filename), open_mode)
275 return (stream, filename)
278 def timeconvert(timestr):
279 """Convert RFC 2822 defined time string into system timestamp"""
281 timetuple = email.utils.parsedate_tz(timestr)
282 if timetuple is not None:
283 timestamp = email.utils.mktime_tz(timetuple)
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
290 def _orderedSet(iterable):
291 """ Remove all duplicates from the input iterable """
def _unescapeHTML(s):
    """Decode HTML entities contained in *s*.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')
    parser = HTMLParser.HTMLParser()
    return parser.unescape(s)
307 def _encodeFilename(s):
309 @param s The name of the file (of type unicode)
312 assert type(s) == type(u'')
314 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
315 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
316 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
317 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
320 return s.encode(sys.getfilesystemencoding(), 'ignore')
322 class DownloadError(Exception):
323 """Download Error exception.
325 This exception may be thrown by FileDownloader objects if they are not
326 configured to continue on errors. They will contain the appropriate
332 class SameFileError(Exception):
333 """Same File exception.
335 This exception will be thrown by FileDownloader objects if they detect
336 multiple files would have to be downloaded to the same file on disk.
341 class PostProcessingError(Exception):
342 """Post Processing exception.
344 This exception may be raised by PostProcessor's .run() method to
345 indicate an error in the postprocessing task.
349 class MaxDownloadsReached(Exception):
350 """ --max-downloads limit has been reached. """
354 class UnavailableVideoError(Exception):
355 """Unavailable Format exception.
357 This exception will be thrown when a video is requested
358 in a format that is not available for that video.
363 class ContentTooShortError(Exception):
364 """Content Too Short exception.
366 This exception may be raised by FileDownloader objects when a file they
367 download is too small for what the server announced first, indicating
368 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    """Record the announced (*expected*) and actually received
    (*downloaded*) byte counts for later error reporting."""
    self.expected = expected
    self.downloaded = downloaded
379 class YoutubeDLHandler(urllib2.HTTPHandler):
380 """Handler for HTTP requests and responses.
382 This class, when installed with an OpenerDirector, automatically adds
383 the standard headers to every HTTP request and handles gzipped and
384 deflated responses from web servers. If compression is to be avoided in
385 a particular request, the original request in the program code only has
386 to include the HTTP header "Youtubedl-No-Compression", which will be
387 removed before making the real request.
389 Part of this code was copied from:
391 http://techknack.net/python-urllib2-handlers/
393 Andrew Rowls, the author of that code, agreed to release it to the
400 return zlib.decompress(data, -zlib.MAX_WBITS)
402 return zlib.decompress(data)
405 def addinfourl_wrapper(stream, headers, url, code):
406 if hasattr(urllib2.addinfourl, 'getcode'):
407 return urllib2.addinfourl(stream, headers, url, code)
408 ret = urllib2.addinfourl(stream, headers, url)
412 def http_request(self, req):
413 for h in std_headers:
416 req.add_header(h, std_headers[h])
417 if 'Youtubedl-no-compression' in req.headers:
418 if 'Accept-encoding' in req.headers:
419 del req.headers['Accept-encoding']
420 del req.headers['Youtubedl-no-compression']
423 def http_response(self, req, resp):
426 if resp.headers.get('Content-encoding', '') == 'gzip':
427 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
428 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
429 resp.msg = old_resp.msg
431 if resp.headers.get('Content-encoding', '') == 'deflate':
432 gz = StringIO.StringIO(self.deflate(resp.read()))
433 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
434 resp.msg = old_resp.msg
438 class FileDownloader(object):
439 """File Downloader class.
441 File downloader objects are the ones responsible of downloading the
442 actual video file and writing it to disk if the user has requested
443 it, among some other tasks. In most cases there should be one per
444 program. As, given a video URL, the downloader doesn't know how to
445 extract all the needed information, task that InfoExtractors do, it
446 has to pass the URL to one of them.
448 For this, file downloader objects have a method that allows
449 InfoExtractors to be registered in a given order. When it is passed
450 a URL, the file downloader handles it to the first InfoExtractor it
451 finds that reports being able to handle it. The InfoExtractor extracts
452 all the information about the video or videos the URL refers to, and
453 asks the FileDownloader to process the video information, possibly
454 downloading the video.
456 File downloaders accept a lot of parameters. In order not to saturate
457 the object constructor with arguments, it receives a dictionary of
458 options instead. These options are available through the params
459 attribute for the InfoExtractors to use. The FileDownloader also
460 registers itself as the downloader in charge for the InfoExtractors
461 that are added to it, so this is a "mutual registration".
465 username: Username for authentication purposes.
466 password: Password for authentication purposes.
467 usenetrc: Use netrc for authentication instead.
468 quiet: Do not print messages to stdout.
469 forceurl: Force printing final URL.
470 forcetitle: Force printing title.
471 forcethumbnail: Force printing thumbnail URL.
472 forcedescription: Force printing description.
473 forcefilename: Force printing final filename.
474 simulate: Do not download the video files.
475 format: Video format code.
476 format_limit: Highest quality format to try.
477 outtmpl: Template for output names.
478 ignoreerrors: Do not stop on download errors.
479 ratelimit: Download speed limit, in bytes/sec.
480 nooverwrites: Prevent overwriting files.
481 retries: Number of times to retry for HTTP error 5xx
482 continuedl: Try to continue downloads if possible.
483 noprogress: Do not print the progress bar.
484 playliststart: Playlist item to start at.
485 playlistend: Playlist item to end at.
486 matchtitle: Download only matching titles.
487 rejecttitle: Reject downloads for matching titles.
488 logtostderr: Log messages to stderr instead of stdout.
489 consoletitle: Display progress in console window's titlebar.
490 nopart: Do not use temporary .part files.
491 updatetime: Use the Last-modified header to set output file timestamps.
492 writedescription: Write the video description to a .description file
493 writeinfojson: Write the video description to a .info.json file
494 writesubtitles: Write the video subtitles to a .srt file
495 subtitleslang: Language of the subtitles to download
501 _download_retcode = None
502 _num_downloads = None
505 def __init__(self, params):
506 """Create a FileDownloader object with the given options."""
509 self._download_retcode = 0
510 self._num_downloads = 0
511 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
515 def format_bytes(bytes):
518 if type(bytes) is str:
523 exponent = long(math.log(bytes, 1024.0))
524 suffix = 'bkMGTPEZY'[exponent]
525 converted = float(bytes) / float(1024 ** exponent)
526 return '%.2f%s' % (converted, suffix)
529 def calc_percent(byte_counter, data_len):
532 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
535 def calc_eta(start, now, total, current):
539 if current == 0 or dif < 0.001: # One millisecond
541 rate = float(current) / dif
542 eta = long((float(total) - float(current)) / rate)
543 (eta_mins, eta_secs) = divmod(eta, 60)
546 return '%02d:%02d' % (eta_mins, eta_secs)
549 def calc_speed(start, now, bytes):
551 if bytes == 0 or dif < 0.001: # One millisecond
552 return '%10s' % '---b/s'
553 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
556 def best_block_size(elapsed_time, bytes):
557 new_min = max(bytes / 2.0, 1.0)
558 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
559 if elapsed_time < 0.001:
561 rate = bytes / elapsed_time
569 def parse_bytes(bytestr):
570 """Parse a string indicating a byte quantity into a long integer."""
571 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
574 number = float(matchobj.group(1))
575 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
576 return long(round(number * multiplier))
578 def add_info_extractor(self, ie):
579 """Add an InfoExtractor object to the end of the list."""
581 ie.set_downloader(self)
583 def add_post_processor(self, pp):
584 """Add a PostProcessor object to the end of the chain."""
586 pp.set_downloader(self)
588 def to_screen(self, message, skip_eol=False):
589 """Print message to stdout if not in quiet mode."""
590 assert type(message) == type(u'')
591 if not self.params.get('quiet', False):
592 terminator = [u'\n', u''][skip_eol]
593 output = message + terminator
595 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
596 output = output.encode(preferredencoding(), 'ignore')
597 self._screen_file.write(output)
598 self._screen_file.flush()
def to_stderr(self, message):
    """Print message to stderr, encoded for the active locale."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
604 def to_cons_title(self, message):
605 """Set console/terminal window title to message."""
606 if not self.params.get('consoletitle', False):
608 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
609 # c_wchar_p() might not be necessary if `message` is
610 # already of type unicode()
611 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
612 elif 'TERM' in os.environ:
613 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
def fixed_template(self):
    """Checks if the output template is fixed (has no %(...)s fields)."""
    field = re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl'])
    return field is None
def trouble(self, message=None):
    """Report a download problem and decide whether to abort.

    The message (if any) is printed to stderr first.  Unless the
    downloader was configured with 'ignoreerrors', a DownloadError is
    raised; otherwise the return code is set to 1 and execution goes on.
    """
    if message is not None:
        self.to_stderr(message)
    ignore = self.params.get('ignoreerrors', False)
    if not ignore:
        raise DownloadError(message)
    self._download_retcode = 1
632 def slow_down(self, start_time, byte_counter):
633 """Sleep if the download speed is over the rate limit."""
634 rate_limit = self.params.get('ratelimit', None)
635 if rate_limit is None or byte_counter == 0:
638 elapsed = now - start_time
641 speed = float(byte_counter) / elapsed
642 if speed > rate_limit:
643 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
645 def temp_name(self, filename):
646 """Returns a temporary filename for the given filename."""
647 if self.params.get('nopart', False) or filename == u'-' or \
648 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
650 return filename + u'.part'
652 def undo_temp_name(self, filename):
653 if filename.endswith(u'.part'):
654 return filename[:-len(u'.part')]
657 def try_rename(self, old_filename, new_filename):
659 if old_filename == new_filename:
661 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
662 except (IOError, OSError), err:
663 self.trouble(u'ERROR: unable to rename file')
665 def try_utime(self, filename, last_modified_hdr):
666 """Try to set the last-modified time of the given file."""
667 if last_modified_hdr is None:
669 if not os.path.isfile(_encodeFilename(filename)):
671 timestr = last_modified_hdr
674 filetime = timeconvert(timestr)
678 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """ Report that the description file is being written """
    msg = u'[info] Writing video description to: ' + descfn
    self.to_screen(msg)
def report_writesubtitles(self, srtfn):
    """ Report that the subtitles file is being written """
    msg = u'[info] Writing video subtitles to: ' + srtfn
    self.to_screen(msg)
def report_writeinfojson(self, infofn):
    """ Report that the metadata file has been written """
    msg = u'[info] Video description metadata as JSON to: ' + infofn
    self.to_screen(msg)
def report_destination(self, filename):
    """Report destination filename."""
    msg = u'[download] Destination: ' + filename
    self.to_screen(msg)
699 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
700 """Report download progress."""
701 if self.params.get('noprogress', False):
703 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
704 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
705 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
706 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    text = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(text)
def report_retry(self, count, retries):
    """Report retry in case of HTTP error 5xx"""
    text = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(text)
def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded."""
    try:
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
    except UnicodeEncodeError:
        # The file name itself may not be printable on this terminal;
        # fall back to a message that omits it.
        self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Report it was impossible to resume download."""
    self.to_screen(u'[download] Unable to resume')
727 def report_finish(self):
728 """Report download finished."""
729 if self.params.get('noprogress', False):
730 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    self._num_downloads = self._num_downloads + 1
738 def prepare_filename(self, info_dict):
739 """Generate the output filename."""
741 template_dict = dict(info_dict)
742 template_dict['epoch'] = unicode(long(time.time()))
743 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
744 filename = self.params['outtmpl'] % template_dict
746 except (ValueError, KeyError), err:
747 self.trouble(u'ERROR: invalid system charset or erroneous output template')
750 def _match_entry(self, info_dict):
751 """ Returns None iff the file should be downloaded """
753 title = info_dict['title']
754 matchtitle = self.params.get('matchtitle', False)
755 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
756 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
757 rejecttitle = self.params.get('rejecttitle', False)
758 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
759 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
762 def process_info(self, info_dict):
763 """Process a single dictionary returned by an InfoExtractor."""
765 reason = self._match_entry(info_dict)
766 if reason is not None:
767 self.to_screen(u'[download] ' + reason)
770 max_downloads = self.params.get('max_downloads')
771 if max_downloads is not None:
772 if self._num_downloads > int(max_downloads):
773 raise MaxDownloadsReached()
775 filename = self.prepare_filename(info_dict)
778 if self.params.get('forcetitle', False):
779 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
780 if self.params.get('forceurl', False):
781 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
782 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
783 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
784 if self.params.get('forcedescription', False) and 'description' in info_dict:
785 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
786 if self.params.get('forcefilename', False) and filename is not None:
787 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
788 if self.params.get('forceformat', False):
789 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
791 # Do nothing else if in simulate mode
792 if self.params.get('simulate', False):
799 dn = os.path.dirname(_encodeFilename(filename))
800 if dn != '' and not os.path.exists(dn): # dn is already encoded
802 except (OSError, IOError), err:
803 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
806 if self.params.get('writedescription', False):
808 descfn = filename + u'.description'
809 self.report_writedescription(descfn)
810 descfile = open(_encodeFilename(descfn), 'wb')
812 descfile.write(info_dict['description'].encode('utf-8'))
815 except (OSError, IOError):
816 self.trouble(u'ERROR: Cannot write description file ' + descfn)
819 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
820 # subtitles download errors are already managed as troubles in relevant IE
821 # that way it will silently go on when used with unsupporting IE
823 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
824 self.report_writesubtitles(srtfn)
825 srtfile = open(_encodeFilename(srtfn), 'wb')
827 srtfile.write(info_dict['subtitles'].encode('utf-8'))
830 except (OSError, IOError):
831 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
834 if self.params.get('writeinfojson', False):
835 infofn = filename + u'.info.json'
836 self.report_writeinfojson(infofn)
839 except (NameError,AttributeError):
840 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
843 infof = open(_encodeFilename(infofn), 'wb')
845 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
846 json.dump(json_info_dict, infof)
849 except (OSError, IOError):
850 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
853 if not self.params.get('skip_download', False):
854 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
858 success = self._do_download(filename, info_dict)
859 except (OSError, IOError), err:
860 raise UnavailableVideoError
861 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
864 except (ContentTooShortError, ), err:
865 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
870 self.post_process(filename, info_dict)
871 except (PostProcessingError), err:
872 self.trouble(u'ERROR: postprocessing: %s' % str(err))
875 def download(self, url_list):
876 """Download a given list of URLs."""
877 if len(url_list) > 1 and self.fixed_template():
878 raise SameFileError(self.params['outtmpl'])
881 suitable_found = False
883 # Go to next InfoExtractor if not suitable
884 if not ie.suitable(url):
887 # Suitable InfoExtractor found
888 suitable_found = True
890 # Extract information from URL and process it
893 # Suitable InfoExtractor had been found; go to next URL
896 if not suitable_found:
897 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
899 return self._download_retcode
901 def post_process(self, filename, ie_info):
902 """Run the postprocessing chain on the given file."""
904 info['filepath'] = filename
910 def _download_with_rtmpdump(self, filename, url, player_url):
911 self.report_destination(filename)
912 tmpfilename = self.temp_name(filename)
914 # Check for rtmpdump first
916 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
917 except (OSError, IOError):
918 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
921 # Download using rtmpdump. rtmpdump returns exit code 2 when
922 # the connection was interrumpted and resuming appears to be
923 # possible. This is part of rtmpdump's normal usage, AFAIK.
924 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
925 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
926 if self.params.get('verbose', False):
929 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
932 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
933 retval = subprocess.call(args)
934 while retval == 2 or retval == 1:
935 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
936 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
937 time.sleep(5.0) # This seems to be needed
938 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
939 cursize = os.path.getsize(_encodeFilename(tmpfilename))
940 if prevsize == cursize and retval == 1:
942 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
943 if prevsize == cursize and retval == 2 and cursize > 1024:
944 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
948 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
949 self.try_rename(tmpfilename, filename)
952 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
955 def _do_download(self, filename, info_dict):
956 url = info_dict['url']
957 player_url = info_dict.get('player_url', None)
959 # Check file already present
960 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
961 self.report_file_already_downloaded(filename)
964 # Attempt to download using rtmpdump
965 if url.startswith('rtmp'):
966 return self._download_with_rtmpdump(filename, url, player_url)
968 tmpfilename = self.temp_name(filename)
971 # Do not include the Accept-Encoding header
972 headers = {'Youtubedl-no-compression': 'True'}
973 basic_request = urllib2.Request(url, None, headers)
974 request = urllib2.Request(url, None, headers)
976 # Establish possible resume length
977 if os.path.isfile(_encodeFilename(tmpfilename)):
978 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
984 if self.params.get('continuedl', False):
985 self.report_resuming_byte(resume_len)
986 request.add_header('Range','bytes=%d-' % resume_len)
992 retries = self.params.get('retries', 0)
993 while count <= retries:
994 # Establish connection
996 if count == 0 and 'urlhandle' in info_dict:
997 data = info_dict['urlhandle']
998 data = urllib2.urlopen(request)
1000 except (urllib2.HTTPError, ), err:
1001 if (err.code < 500 or err.code >= 600) and err.code != 416:
1002 # Unexpected HTTP error
1004 elif err.code == 416:
1005 # Unable to resume (requested range not satisfiable)
1007 # Open the connection again without the range header
1008 data = urllib2.urlopen(basic_request)
1009 content_length = data.info()['Content-Length']
1010 except (urllib2.HTTPError, ), err:
1011 if err.code < 500 or err.code >= 600:
1014 # Examine the reported length
1015 if (content_length is not None and
1016 (resume_len - 100 < long(content_length) < resume_len + 100)):
1017 # The file had already been fully downloaded.
1018 # Explanation to the above condition: in issue #175 it was revealed that
1019 # YouTube sometimes adds or removes a few bytes from the end of the file,
1020 # changing the file size slightly and causing problems for some users. So
1021 # I decided to implement a suggested change and consider the file
1022 # completely downloaded if the file size differs less than 100 bytes from
1023 # the one in the hard drive.
1024 self.report_file_already_downloaded(filename)
1025 self.try_rename(tmpfilename, filename)
1028 # The length does not match, we start the download over
1029 self.report_unable_to_resume()
1034 if count <= retries:
1035 self.report_retry(count, retries)
1038 self.trouble(u'ERROR: giving up after %s retries' % retries)
1041 data_len = data.info().get('Content-length', None)
1042 if data_len is not None:
1043 data_len = long(data_len) + resume_len
1044 data_len_str = self.format_bytes(data_len)
1045 byte_counter = 0 + resume_len
1049 # Download and write
1050 before = time.time()
1051 data_block = data.read(block_size)
1053 if len(data_block) == 0:
1055 byte_counter += len(data_block)
1057 # Open file just in time
1060 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1061 assert stream is not None
1062 filename = self.undo_temp_name(tmpfilename)
1063 self.report_destination(filename)
1064 except (OSError, IOError), err:
1065 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1068 stream.write(data_block)
1069 except (IOError, OSError), err:
1070 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1072 block_size = self.best_block_size(after - before, len(data_block))
1075 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1076 if data_len is None:
1077 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1079 percent_str = self.calc_percent(byte_counter, data_len)
1080 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1081 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1084 self.slow_down(start, byte_counter - resume_len)
1087 self.trouble(u'\nERROR: Did not get any data blocks')
1090 self.report_finish()
1091 if data_len is not None and byte_counter != data_len:
1092 raise ContentTooShortError(byte_counter, long(data_len))
1093 self.try_rename(tmpfilename, filename)
1095 # Update file modification time
1096 if self.params.get('updatetime', True):
1097 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
# Abstract base class for all site-specific information extractors.
# NOTE(review): this region is an extraction artifact -- each line carries its
# original file line number and several lines are missing (gaps in that
# numbering). Code is kept byte-identical; only comments are added.
1102 class InfoExtractor(object):
1103 """Information Extractor class.
1105 Information extractors are the classes that, given a URL, extract
1106 information from the video (or videos) the URL refers to. This
1107 information includes the real video URL, the video title and simplified
1108 title, author and others. The information is stored in a dictionary
1109 which is then passed to the FileDownloader. The FileDownloader
1110 processes this information possibly downloading the video to the file
1111 system, among other possible outcomes. The dictionaries must include
1112 the following fields:
1114 id: Video identifier.
1115 url: Final video URL.
1116 uploader: Nickname of the video uploader.
1117 title: Literal title.
1118 stitle: Simplified title.
1119 ext: Video filename extension.
1120 format: Video format.
1121 player_url: SWF Player URL (may be None).
1123 The following fields are optional. Their primary purpose is to allow
1124 youtube-dl to serve as the backend for a video search function, such
1125 as the one in youtube2mp3. They are only used when their respective
1126 forced printing functions are called:
1128 thumbnail: Full URL to a video thumbnail image.
1129 description: One-line video description.
1131 Subclasses of this one should re-define the _real_initialize() and
1132 _real_extract() methods and define a _VALID_URL regexp.
1133 Probably, they should also be added to the list of extractors.
# Constructor: stores the (optional) FileDownloader via set_downloader().
1139 def __init__(self, downloader=None):
1140 """Constructor. Receives an optional downloader."""
1142 self.set_downloader(downloader)
1144 def suitable(self, url):
1145 """Receives a URL and returns True if suitable for this IE."""
1146 return re.match(self._VALID_URL, url) is not None
# NOTE(review): line 1150 is missing from this dump -- presumably a guard so
# that _real_initialize() runs only once per instance. TODO confirm upstream.
1148 def initialize(self):
1149 """Initializes an instance (authentication, etc)."""
1151 self._real_initialize()
1154 def extract(self, url):
1155 """Extracts URL information and returns it in list of dicts."""
1157 return self._real_extract(url)
1159 def set_downloader(self, downloader):
1160 """Sets the downloader for this IE."""
1161 self._downloader = downloader
# Template methods: subclasses override these two (bodies missing in dump).
1163 def _real_initialize(self):
1164 """Real initialization process. Redefine in subclasses."""
1167 def _real_extract(self, url):
1168 """Real extraction process. Redefine in subclasses."""
# Class header and constants for the YouTube extractor.
# NOTE(review): numbered dump; nearly all entries of _video_extensions
# (lines 1184-1193) and all of _video_dimensions (lines 1195-1208) are
# missing here. Code kept byte-identical.
1172 class YoutubeIE(InfoExtractor):
1173 """Information extractor for youtube.com."""
# Accepts watch/embed/short-link URLs; group 2 of the match is the video id.
1175 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1176 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1177 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1178 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1179 _NETRC_MACHINE = 'youtube'
1180 # Listed in order of quality
1181 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1182 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# Maps itag -> filename extension (only one visible entry survives the dump).
1183 _video_extensions = {
1189 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# Maps itag -> human-readable resolution (entries missing from this dump).
1194 _video_dimensions = {
1209 IE_NAME = u'youtube'
def report_lang(self):
    """Announce on screen that the language preference is being set."""
    announce = self._downloader.to_screen
    announce(u'[youtube] Setting language')
def report_login(self):
    """Announce on screen that a login attempt is starting."""
    announce = self._downloader.to_screen
    announce(u'[youtube] Logging in')
def report_age_confirmation(self):
    """Announce on screen that the age gate is being confirmed."""
    announce = self._downloader.to_screen
    announce(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
    """Announce that the watch page for *video_id* is being fetched."""
    msg = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(msg)
def report_video_info_webpage_download(self, video_id):
    """Announce that the get_video_info page for *video_id* is being fetched."""
    msg = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(msg)
def report_video_subtitles_download(self, video_id):
    """Announce that closed captions for *video_id* are being fetched.

    (The previous docstring was a copy-paste of the info-webpage one.)
    """
    msg = u'[youtube] %s: Downloading video subtitles' % video_id
    self._downloader.to_screen(msg)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    msg = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(msg)
def report_unavailable_format(self, video_id, format):
    """Announce that *format* is not offered for *video_id*.

    (The previous docstring, "Report extracted video URL.", described a
    different method.)
    """
    msg = u'[youtube] %s: Format %s not available' % (video_id, format)
    self._downloader.to_screen(msg)
def report_rtmp_download(self):
    """Announce that the stream will be fetched over RTMP."""
    announce = self._downloader.to_screen
    announce(u'[youtube] RTMP download detected')
# Converts YouTube's timed-text XML into SubRip (SRT) text.
# NOTE(review): numbered dump; line 1248 (presumably initialising the
# accumulator, e.g. srt = '') and the trailing return are missing, so only
# the visible lines are annotated; code kept byte-identical.
1247 def _closed_captions_xml_to_srt(self, xml_string):
1249 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1250 # TODO parse xml instead of regex
# NOTE(review): SRT cue numbers conventionally start at 1; enumerate starts
# at 0 here -- players are usually tolerant, but worth confirming.
1251 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# Captions without an explicit duration default to 4 seconds.
1252 if not dur: dur = '4'
1253 start = float(start)
1254 end = start + float(dur)
# Format both endpoints as SRT timestamps: HH:MM:SS,mmm
1255 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1256 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1257 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional: unescapes doubly-escaped entities
1259 srt += str(n) + '\n'
1260 srt += start + ' --> ' + end + '\n'
1261 srt += caption + '\n\n'
# Prints each available itag with its extension and dimensions.
# NOTE(review): the loop header (line 1266, presumably "for x in formats:")
# is missing from this dump; code kept byte-identical.
1264 def _print_formats(self, formats):
1265 print 'Available formats:'
1267 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Sets the interface language, logs in (credentials from params or .netrc),
# and confirms the age gate -- all best-effort: failures only warn, except
# the final age confirmation which is a hard error.
# NOTE(review): numbered dump with many lines missing (variable defaults,
# early returns, try: lines, parts of the form dicts). Code kept
# byte-identical; do not assume the visible lines are contiguous.
1269 def _real_initialize(self):
1270 if self._downloader is None:
1275 downloader_params = self._downloader.params
1277 # Attempt to use provided username and password or .netrc data
1278 if downloader_params.get('username', None) is not None:
1279 username = downloader_params['username']
1280 password = downloader_params['password']
1281 elif downloader_params.get('usenetrc', False):
1283 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1284 if info is not None:
1288 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1289 except (IOError, netrc.NetrcParseError), err:
1290 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the English UI so later regex scraping sees stable markup.
1294 request = urllib2.Request(self._LANG_URL)
1297 urllib2.urlopen(request).read()
1298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1299 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1302 # No authentication to be performed
1303 if username is None:
# Login form fields (dict literal partially missing from dump).
1308 'current_form': 'loginForm',
1310 'action_login': 'Log In',
1311 'username': username,
1312 'password': password,
1314 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1317 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
1318 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1319 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1321 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1322 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Age-gate confirmation form (dict literal partially missing from dump).
1328 'action_confirm': 'Confirm',
1330 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1332 self.report_age_confirmation()
1333 age_results = urllib2.urlopen(request).read()
1334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Full YouTube extraction: watch page -> get_video_info -> metadata ->
# format selection -> process_info() per selected format.
# NOTE(review): numbered dump with many lines missing (returns after
# trouble(), try: lines, else branches). Code kept byte-identical; do not
# assume the visible lines are contiguous.
1338 def _real_extract(self, url):
1339 # Extract video id from URL
1340 mobj = re.match(self._VALID_URL, url)
1342 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 2 of _VALID_URL is the 11-char-style video id.
1344 video_id = mobj.group(2)
1347 self.report_video_webpage_download(video_id)
1348 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1350 video_webpage = urllib2.urlopen(request).read()
1351 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1352 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1355 # Attempt to extract SWF player URL
1356 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1357 if mobj is not None:
# Un-escape the JSON-escaped URL (\\/ -> /).
1358 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' variants until one returns a 'token' field.
1363 self.report_video_info_webpage_download(video_id)
1364 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1365 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1366 % (video_id, el_type))
1367 request = urllib2.Request(video_info_url)
1369 video_info_webpage = urllib2.urlopen(request).read()
1370 video_info = parse_qs(video_info_webpage)
1371 if 'token' in video_info:
1373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1374 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1376 if 'token' not in video_info:
1377 if 'reason' in video_info:
1378 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1380 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1383 # Start extracting information
1384 self.report_information_extraction(video_id)
1387 if 'author' not in video_info:
1388 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1390 video_uploader = urllib.unquote_plus(video_info['author'][0])
1393 if 'title' not in video_info:
1394 self._downloader.trouble(u'ERROR: unable to extract video title')
1396 video_title = urllib.unquote_plus(video_info['title'][0])
1397 video_title = video_title.decode('utf-8')
1398 video_title = sanitize_title(video_title)
1401 simple_title = _simplify_title(video_title)
1404 if 'thumbnail_url' not in video_info:
1405 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1406 video_thumbnail = ''
1407 else: # don't panic if we can't find it
1408 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page, normalised to YYYYMMDD.
1412 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1413 if mobj is not None:
1414 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1415 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1416 for expression in format_expressions:
1418 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1426 video_description = u'No description available.'
1427 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1428 if mobj is not None:
1429 video_description = mobj.group(1).decode('utf-8')
# Richer description via lxml when available (guard lines missing in dump).
1431 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1432 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1433 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1434 # TODO use another parser
# Closed captions: list available languages, pick requested/en/first, fetch.
1437 video_subtitles = None
1438 if self._downloader.params.get('writesubtitles', False):
1439 self.report_video_subtitles_download(video_id)
1440 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1442 srt_list = urllib2.urlopen(request).read()
1443 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1444 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1446 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1448 if self._downloader.params.get('subtitleslang', False):
1449 srt_lang = self._downloader.params.get('subtitleslang')
1450 elif 'en' in srt_lang_list:
1453 srt_lang = srt_lang_list[0]
1454 if not srt_lang in srt_lang_list:
1455 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1457 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1459 srt_xml = urllib2.urlopen(request).read()
1460 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1461 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1463 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1465 self._downloader.trouble(u'WARNING: video has no closed captions')
1468 video_token = urllib.unquote_plus(video_info['token'][0])
1470 # Decide which formats to download
1471 req_format = self._downloader.params.get('format', None)
1473 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1474 self.report_rtmp_download()
1475 video_url_list = [(None, video_info['conn'][0])]
1476 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1477 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1478 url_data = [parse_qs(uds) for uds in url_data_strs]
1479 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1480 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1482 format_limit = self._downloader.params.get('format_limit', None)
1483 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1484 if format_limit is not None and format_limit in available_formats:
1485 format_list = available_formats[available_formats.index(format_limit):]
1487 format_list = available_formats
# Intersect the preference-ordered list with what the server offers.
1488 existing_formats = [x for x in format_list if x in url_map]
1489 if len(existing_formats) == 0:
1490 self._downloader.trouble(u'ERROR: no known formats available for video')
1492 if self._downloader.params.get('listformats', None):
1493 self._print_formats(existing_formats)
1495 if req_format is None or req_format == 'best':
1496 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1497 elif req_format == 'worst':
1498 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1499 elif req_format in ('-1', 'all'):
1500 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1502 # Specific formats. We pick the first in a slash-delimited sequence.
1503 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1504 req_formats = req_format.split('/')
1505 video_url_list = None
1506 for rf in req_formats:
1508 video_url_list = [(rf, url_map[rf])]
1510 if video_url_list is None:
1511 self._downloader.trouble(u'ERROR: requested format not available')
1514 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1517 for format_param, video_real_url in video_url_list:
1518 # At this point we have a new video
1519 self._downloader.increment_downloads()
1522 video_extension = self._video_extensions.get(format_param, 'flv')
1525 # Process video information
1526 self._downloader.process_info({
1527 'id': video_id.decode('utf-8'),
1528 'url': video_real_url.decode('utf-8'),
1529 'uploader': video_uploader.decode('utf-8'),
1530 'upload_date': upload_date,
1531 'title': video_title,
1532 'stitle': simple_title,
1533 'ext': video_extension.decode('utf-8'),
1534 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1535 'thumbnail': video_thumbnail.decode('utf-8'),
1536 'description': video_description,
1537 'player_url': player_url,
1538 'subtitles': video_subtitles
1540 except UnavailableVideoError, err:
1541 self._downloader.trouble(u'\nERROR: unable to download video')
# Class header and constants for the Metacafe extractor.
# NOTE(review): numbered dump; code kept byte-identical.
1544 class MetacafeIE(InfoExtractor):
1545 """Information Extractor for metacafe.com."""
# Group 1 is the numeric video id, group 2 the URL slug used as the title.
1547 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1548 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1549 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1551 IE_NAME = u'metacafe'
def __init__(self, youtube_ie, downloader=None):
    """Keep a YoutubeIE around: Metacafe 'yt-*' ids are delegated to it."""
    self._youtube_ie = youtube_ie
    InfoExtractor.__init__(self, downloader)
def report_disclaimer(self):
    """Announce that the family-filter disclaimer page is being fetched."""
    announce = self._downloader.to_screen
    announce(u'[metacafe] Retrieving disclaimer')
def report_age_confirmation(self):
    """Announce that the age gate is being confirmed."""
    announce = self._downloader.to_screen
    announce(u'[metacafe] Confirming age')
def report_download_webpage(self, video_id):
    """Announce that the watch page for *video_id* is being fetched."""
    msg = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    msg = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
# Fetches the family-filter disclaimer and POSTs the "over 18" confirmation
# so age-restricted pages become visible.
# NOTE(review): numbered dump; try: lines, returns, and most of the
# disclaimer_form dict literal are missing. Code kept byte-identical.
1573 def _real_initialize(self):
1574 # Retrieve disclaimer
1575 request = urllib2.Request(self._DISCLAIMER)
1577 self.report_disclaimer()
1578 disclaimer = urllib2.urlopen(request).read()
1579 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1580 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirmation form (dict literal partially missing from dump).
1586 'submit': "Continue - I'm over 18",
1588 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1590 self.report_age_confirmation()
1591 disclaimer = urllib2.urlopen(request).read()
1592 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1593 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Metacafe extraction: delegates yt-* ids to YoutubeIE, otherwise scrapes
# the watch page for the media URL (legacy &mediaURL= or flashvars JSON),
# title and uploader, then hands the dict to process_info().
# NOTE(review): numbered dump; returns after trouble(), try: lines and
# else branches are missing. Code kept byte-identical.
1596 def _real_extract(self, url):
1597 # Extract id and simplified title from URL
1598 mobj = re.match(self._VALID_URL, url)
1600 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1603 video_id = mobj.group(1)
1605 # Check if video comes from YouTube
1606 mobj2 = re.match(r'^yt-(.*)$', video_id)
1607 if mobj2 is not None:
1608 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1611 # At this point we have a new video
1612 self._downloader.increment_downloads()
# The URL slug doubles as the simplified title.
1614 simple_title = mobj.group(2).decode('utf-8')
1616 # Retrieve video webpage to extract further information
1617 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1619 self.report_download_webpage(video_id)
1620 webpage = urllib2.urlopen(request).read()
1621 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1622 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1625 # Extract URL, uploader and title from webpage
1626 self.report_extraction(video_id)
# Legacy page layout: direct &mediaURL= parameter.
1627 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1628 if mobj is not None:
1629 mediaURL = urllib.unquote(mobj.group(1))
1630 video_extension = mediaURL[-3:]
1632 # Extract gdaKey if available
1633 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1635 video_url = mediaURL
1637 gdaKey = mobj.group(1)
1638 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Newer page layout: media URL inside the flashvars query string.
1640 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1642 self._downloader.trouble(u'ERROR: unable to extract media URL')
1644 vardict = parse_qs(mobj.group(1))
1645 if 'mediaData' not in vardict:
1646 self._downloader.trouble(u'ERROR: unable to extract media URL')
1648 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1650 self._downloader.trouble(u'ERROR: unable to extract media URL')
1652 mediaURL = mobj.group(1).replace('\\/', '/')
1653 video_extension = mediaURL[-3:]
1654 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1656 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1658 self._downloader.trouble(u'ERROR: unable to extract title')
1660 video_title = mobj.group(1).decode('utf-8')
1661 video_title = sanitize_title(video_title)
1663 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1665 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1667 video_uploader = mobj.group(1)
1670 # Process video information
1671 self._downloader.process_info({
1672 'id': video_id.decode('utf-8'),
1673 'url': video_url.decode('utf-8'),
1674 'uploader': video_uploader.decode('utf-8'),
1675 'upload_date': u'NA',
1676 'title': video_title,
1677 'stitle': simple_title,
1678 'ext': video_extension.decode('utf-8'),
1682 except UnavailableVideoError:
1683 self._downloader.trouble(u'\nERROR: unable to download video')
# Class header for the Dailymotion extractor.
# NOTE(review): numbered dump; code kept byte-identical.
1686 class DailymotionIE(InfoExtractor):
1687 """Information Extractor for Dailymotion"""
# Group 1 is the video id (before the first underscore of the slug).
1689 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1690 IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    """Delegate construction to InfoExtractor."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce that the page for *video_id* is being fetched."""
    msg = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    msg = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
# Dailymotion extraction: fetches the page with the family filter disabled,
# pulls the SD media URL out of the 'sequence' flash variable, then title
# (og:title) and uploader, and hands the dict to process_info().
# NOTE(review): numbered dump; returns after trouble(), try: lines and a
# relative-URL branch (around line 1740) are missing. Code kept
# byte-identical.
1703 def _real_extract(self, url):
1704 # Extract id and simplified title from URL
1705 mobj = re.match(self._VALID_URL, url)
1707 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1710 # At this point we have a new video
1711 self._downloader.increment_downloads()
1712 video_id = mobj.group(1)
1714 video_extension = 'flv'
1716 # Retrieve video webpage to extract further information
1717 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages are served.
1718 request.add_header('Cookie', 'family_filter=off')
1720 self.report_download_webpage(video_id)
1721 webpage = urllib2.urlopen(request).read()
1722 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1723 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1726 # Extract URL, uploader and title from webpage
1727 self.report_extraction(video_id)
1728 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1730 self._downloader.trouble(u'ERROR: unable to extract media URL')
1732 sequence = urllib.unquote(mobj.group(1))
1733 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1735 self._downloader.trouble(u'ERROR: unable to extract media URL')
1737 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1739 # if needed add http://www.dailymotion.com/ if relative URL
1741 video_url = mediaURL
1743 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1745 self._downloader.trouble(u'ERROR: unable to extract title')
1747 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1748 video_title = sanitize_title(video_title)
1749 simple_title = _simplify_title(video_title)
1751 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1753 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1755 video_uploader = mobj.group(1)
1758 # Process video information
1759 self._downloader.process_info({
1760 'id': video_id.decode('utf-8'),
1761 'url': video_url.decode('utf-8'),
1762 'uploader': video_uploader.decode('utf-8'),
1763 'upload_date': u'NA',
1764 'title': video_title,
1765 'stitle': simple_title,
1766 'ext': video_extension.decode('utf-8'),
1770 except UnavailableVideoError:
1771 self._downloader.trouble(u'\nERROR: unable to download video')
# Class header for the Google Video extractor.
# NOTE(review): numbered dump; code kept byte-identical.
1774 class GoogleIE(InfoExtractor):
1775 """Information extractor for video.google.com."""
# Group 1 is the (possibly negative) numeric docid.
1777 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1778 IE_NAME = u'video.google'
def __init__(self, downloader=None):
    """Delegate construction to InfoExtractor."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce that the page for *video_id* is being fetched."""
    msg = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    msg = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
# Google Video extraction: tries the mp4 download_url first, falls back to
# the hex-escaped flv videoUrl, scrapes title/description, optionally the
# thumbnail, and hands the dict to process_info().
# NOTE(review): numbered dump; returns after trouble(), try: lines, else
# branches, and the 'uploader' entry of the final dict (line 1866) are
# missing. Code kept byte-identical.
1791 def _real_extract(self, url):
1792 # Extract id from URL
1793 mobj = re.match(self._VALID_URL, url)
1795 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1798 # At this point we have a new video
1799 self._downloader.increment_downloads()
1800 video_id = mobj.group(1)
1802 video_extension = 'mp4'
1804 # Retrieve video webpage to extract further information
1805 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1807 self.report_download_webpage(video_id)
1808 webpage = urllib2.urlopen(request).read()
1809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1813 # Extract URL, uploader, and title from webpage
1814 self.report_extraction(video_id)
1815 mobj = re.search(r"download_url:'([^']+)'", webpage)
# Fallback path: no mp4 download_url, use the escaped flv videoUrl instead.
1817 video_extension = 'flv'
1818 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1820 self._downloader.trouble(u'ERROR: unable to extract media URL')
1822 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JS hex escaping (\x3d -> '=', \x26 -> '&').
1823 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1824 mediaURL = mediaURL.replace('\\x26', '\x26')
1826 video_url = mediaURL
1828 mobj = re.search(r'<title>(.*)</title>', webpage)
1830 self._downloader.trouble(u'ERROR: unable to extract title')
1832 video_title = mobj.group(1).decode('utf-8')
1833 video_title = sanitize_title(video_title)
1834 simple_title = _simplify_title(video_title)
1836 # Extract video description
1837 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1839 self._downloader.trouble(u'ERROR: unable to extract video description')
1841 video_description = mobj.group(1).decode('utf-8')
1842 if not video_description:
1843 video_description = 'No description available.'
1845 # Extract video thumbnail
1846 if self._downloader.params.get('forcethumbnail', False):
# Thumbnail only exists on the search page; abs() strips the docid's sign.
1847 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1849 webpage = urllib2.urlopen(request).read()
1850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1853 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1855 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1857 video_thumbnail = mobj.group(1)
1858 else: # we need something to pass to process_info
1859 video_thumbnail = ''
1862 # Process video information
1863 self._downloader.process_info({
1864 'id': video_id.decode('utf-8'),
1865 'url': video_url.decode('utf-8'),
1867 'upload_date': u'NA',
1868 'title': video_title,
1869 'stitle': simple_title,
1870 'ext': video_extension.decode('utf-8'),
1874 except UnavailableVideoError:
1875 self._downloader.trouble(u'\nERROR: unable to download video')
# Class header for the Photobucket extractor.
# NOTE(review): numbered dump; code kept byte-identical.
1878 class PhotobucketIE(InfoExtractor):
1879 """Information extractor for photobucket.com."""
# Group 1 is the .flv filename carried in the 'current' query parameter.
1881 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1882 IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    """Delegate construction to InfoExtractor."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce that the page for *video_id* is being fetched."""
    msg = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    msg = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
1895 def _real_extract(self, url):
1896 # Extract id from URL
1897 mobj = re.match(self._VALID_URL, url)
1899 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1902 # At this point we have a new video
1903 self._downloader.increment_downloads()
1904 video_id = mobj.group(1)
1906 video_extension = 'flv'
1908 # Retrieve video webpage to extract further information
1909 request = urllib2.Request(url)
1911 self.report_download_webpage(video_id)
1912 webpage = urllib2.urlopen(request).read()
1913 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1914 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1917 # Extract URL, uploader, and title from webpage
1918 self.report_extraction(video_id)
1919 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1921 self._downloader.trouble(u'ERROR: unable to extract media URL')
1923 mediaURL = urllib.unquote(mobj.group(1))
1925 video_url = mediaURL
1927 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1929 self._downloader.trouble(u'ERROR: unable to extract title')
1931 video_title = mobj.group(1).decode('utf-8')
1932 video_title = sanitize_title(video_title)
1933 simple_title = _simplify_title(vide_title)
1935 video_uploader = mobj.group(2).decode('utf-8')
1938 # Process video information
1939 self._downloader.process_info({
1940 'id': video_id.decode('utf-8'),
1941 'url': video_url.decode('utf-8'),
1942 'uploader': video_uploader,
1943 'upload_date': u'NA',
1944 'title': video_title,
1945 'stitle': simple_title,
1946 'ext': video_extension.decode('utf-8'),
1950 except UnavailableVideoError:
1951 self._downloader.trouble(u'\nERROR: unable to download video')
# Class header for the Yahoo! Video extractor.
# NOTE(review): numbered dump; code kept byte-identical.
1954 class YahooIE(InfoExtractor):
1955 """Information extractor for video.yahoo.com."""
1957 # _VALID_URL matches all Yahoo! Video URLs
1958 # _VPAGE_URL matches only the extractable '/watch/' URLs
1959 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1960 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1961 IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    """Delegate construction to InfoExtractor."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce that the page for *video_id* is being fetched."""
    msg = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    msg = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
1974 def _real_extract(self, url, new_video=True):
1975 # Extract ID from URL
1976 mobj = re.match(self._VALID_URL, url)
1978 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1981 # At this point we have a new video
1982 self._downloader.increment_downloads()
1983 video_id = mobj.group(2)
1984 video_extension = 'flv'
1986 # Rewrite valid but non-extractable URLs as
1987 # extractable English language /watch/ URLs
1988 if re.match(self._VPAGE_URL, url) is None:
1989 request = urllib2.Request(url)
1991 webpage = urllib2.urlopen(request).read()
1992 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1996 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1998 self._downloader.trouble(u'ERROR: Unable to extract id field')
2000 yahoo_id = mobj.group(1)
2002 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2004 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2006 yahoo_vid = mobj.group(1)
2008 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2009 return self._real_extract(url, new_video=False)
2011 # Retrieve video webpage to extract further information
2012 request = urllib2.Request(url)
2014 self.report_download_webpage(video_id)
2015 webpage = urllib2.urlopen(request).read()
2016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2017 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2020 # Extract uploader and title from webpage
2021 self.report_extraction(video_id)
2022 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2024 self._downloader.trouble(u'ERROR: unable to extract video title')
2026 video_title = mobj.group(1).decode('utf-8')
2027 simple_title = _simplify_title(video_title)
2029 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2031 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2033 video_uploader = mobj.group(1).decode('utf-8')
2035 # Extract video thumbnail
2036 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2038 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2040 video_thumbnail = mobj.group(1).decode('utf-8')
2042 # Extract video description
2043 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2045 self._downloader.trouble(u'ERROR: unable to extract video description')
2047 video_description = mobj.group(1).decode('utf-8')
2048 if not video_description:
2049 video_description = 'No description available.'
2051 # Extract video height and width
2052 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2054 self._downloader.trouble(u'ERROR: unable to extract video height')
2056 yv_video_height = mobj.group(1)
2058 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2060 self._downloader.trouble(u'ERROR: unable to extract video width')
2062 yv_video_width = mobj.group(1)
2064 # Retrieve video playlist to extract media URL
2065 # I'm not completely sure what all these options are, but we
2066 # seem to need most of them, otherwise the server sends a 401.
2067 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2068 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2069 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2070 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2071 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2073 self.report_download_webpage(video_id)
2074 webpage = urllib2.urlopen(request).read()
2075 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2076 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2079 # Extract media URL from playlist XML
2080 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2082 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2084 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2085 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2088 # Process video information
2089 self._downloader.process_info({
2090 'id': video_id.decode('utf-8'),
2092 'uploader': video_uploader,
2093 'upload_date': u'NA',
2094 'title': video_title,
2095 'stitle': simple_title,
2096 'ext': video_extension.decode('utf-8'),
2097 'thumbnail': video_thumbnail.decode('utf-8'),
2098 'description': video_description,
2099 'thumbnail': video_thumbnail,
2102 except UnavailableVideoError:
2103 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this excerpt is line-sampled — each line embeds its original file
# line number and many intermediate lines (try:, guards, returns) are elided.
# Comments only are added below; no code tokens are altered.
# VimeoIE: extracts a video from a vimeo.com page by parsing the config JSON
# embedded in the page, then builds a play_redirect URL for the chosen codec.
2106 class VimeoIE(InfoExtractor):
2107 """Information extractor for vimeo.com."""
2109 # _VALID_URL matches Vimeo URLs
2110 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2113 def __init__(self, downloader=None):
2114 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers: write status lines tagged [vimeo] to the screen.
2116 def report_download_webpage(self, video_id):
2117 """Report webpage download."""
2118 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2120 def report_extraction(self, video_id):
2121 """Report information extraction."""
2122 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Main extraction: match ID, fetch page with std_headers, parse embedded config.
2124 def _real_extract(self, url, new_video=True):
2125 # Extract ID from URL
2126 mobj = re.match(self._VALID_URL, url)
2128 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2131 # At this point we have a new video
2132 self._downloader.increment_downloads()
2133 video_id = mobj.group(1)
2135 # Retrieve video webpage to extract further information
2136 request = urllib2.Request(url, None, std_headers)
2138 self.report_download_webpage(video_id)
2139 webpage = urllib2.urlopen(request).read()
2140 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2141 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2144 # Now we begin extracting as much information as we can from what we
2145 # retrieved. First we extract the information common to all extractors,
2146 # and latter we extract those that are Vimeo specific.
2147 self.report_extraction(video_id)
2149 # Extract the config JSON
# Brittle string-splitting on ' = {config:' / ',assets:' to isolate the JSON blob.
2150 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2152 config = json.loads(config)
2154 self._downloader.trouble(u'ERROR: unable to extract info section')
# Title, uploader and thumbnail come straight from the parsed config dict.
2158 video_title = config["video"]["title"]
2159 simple_title = _simplify_title(video_title)
2162 video_uploader = config["video"]["owner"]["name"]
2164 # Extract video thumbnail
2165 video_thumbnail = config["video"]["thumbnail"]
2167 # Extract video description
2171 video_description = u'No description available.'
2172 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2173 if mobj is not None:
2174 video_description = mobj.group(1)
# Alternative description path via lxml; presumably behind a conditional/try
# elided from this excerpt — TODO confirm against the full file.
2176 html_parser = lxml.etree.HTMLParser()
2177 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2178 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2179 # TODO use another parser
2181 # Extract upload date
2182 video_upload_date = u'NA'
2183 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2184 if mobj is not None:
2185 video_upload_date = mobj.group(1)
2187 # Vimeo specific: extract request signature and timestamp
2188 sig = config['request']['signature']
2189 timestamp = config['request']['timestamp']
2191 # Vimeo specific: extract video codec and quality information
2192 # TODO bind to format param
# Codec preference order: first match in config["video"]["files"] wins.
2193 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2194 for codec in codecs:
2195 if codec[0] in config["video"]["files"]:
2196 video_codec = codec[0]
2197 video_extension = codec[1]
2198 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2199 else: quality = 'sd'
2202 self._downloader.trouble(u'ERROR: no known codec found')
# Final media URL is the signed play_redirect endpoint.
2205 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2206 %(video_id, sig, timestamp, quality, video_codec.upper())
2209 # Process video information
2210 self._downloader.process_info({
2213 'uploader': video_uploader,
2214 'upload_date': video_upload_date,
2215 'title': video_title,
2216 'stitle': simple_title,
2217 'ext': video_extension,
2218 'thumbnail': video_thumbnail,
2219 'description': video_description,
2222 except UnavailableVideoError:
2223 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): line-sampled excerpt (embedded original line numbers, elided
# guard/try lines). Comments only are added; no code tokens are altered.
# GenericIE: last-resort extractor. First follows URL-shortener redirects via
# HEAD requests, then scrapes the page for a JW-Player-style file= media URL.
2226 class GenericIE(InfoExtractor):
2227 """Generic last-resort information extractor."""
2230 IE_NAME = u'generic'
2232 def __init__(self, downloader=None):
2233 InfoExtractor.__init__(self, downloader)
2235 def report_download_webpage(self, video_id):
2236 """Report webpage download."""
# Always warns: reaching this extractor means no specific IE matched the URL.
2237 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2238 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2240 def report_extraction(self, video_id):
2241 """Report information extraction."""
2242 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2244 def report_following_redirect(self, new_url):
2245 """Report information extraction."""
2246 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
2248 def _test_redirect(self, url):
2249 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Local Request subclass forcing the HEAD method (body of get_method elided here).
2250 class HeadRequest(urllib2.Request):
2251 def get_method(self):
2254 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
2256 Subclass the HTTPRedirectHandler to make it use our
2257 HeadRequest also on the redirected URL
2259 def redirect_request(self, req, fp, code, msg, headers, newurl):
2260 if code in (301, 302, 303, 307):
2261 newurl = newurl.replace(' ', '%20')
# Drop body-related headers when re-issuing the request as HEAD.
2262 newheaders = dict((k,v) for k,v in req.headers.items()
2263 if k.lower() not in ("content-length", "content-type"))
2264 return HeadRequest(newurl,
2266 origin_req_host=req.get_origin_req_host(),
2269 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
2271 class HTTPMethodFallback(urllib2.BaseHandler):
2273 Fallback to GET if HEAD is not allowed (405 HTTP error)
2275 def http_error_405(self, req, fp, code, msg, headers):
2279 newheaders = dict((k,v) for k,v in req.headers.items()
2280 if k.lower() not in ("content-length", "content-type"))
2281 return self.parent.open(urllib2.Request(req.get_full_url(),
2283 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with just the handlers needed for the HEAD probe.
2287 opener = urllib2.OpenerDirector()
2288 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
2289 HTTPMethodFallback, HEADRedirectHandler,
2290 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
2291 opener.add_handler(handler())
2293 response = opener.open(HeadRequest(url))
2294 new_url = response.geturl()
2296 if url == new_url: return False
# Redirect detected: restart the whole extraction chain on the final URL.
2298 self.report_following_redirect(new_url)
2299 self._downloader.download([new_url])
2302 def _real_extract(self, url):
2303 if self._test_redirect(url): return
2305 # At this point we have a new video
2306 self._downloader.increment_downloads()
2308 video_id = url.split('/')[-1]
2309 request = urllib2.Request(url)
2311 self.report_download_webpage(video_id)
2312 webpage = urllib2.urlopen(request).read()
2313 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2314 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2316 except ValueError, err:
2317 # since this is the last-resort InfoExtractor, if
2318 # this error is thrown, it'll be thrown here
2319 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2322 self.report_extraction(video_id)
2323 # Start with something easy: JW Player in SWFObject
2324 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2326 # Broaden the search a little bit
2327 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2329 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2332 # It's possible that one of the regexes
2333 # matched, but returned an empty group:
2334 if mobj.group(1) is None:
2335 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2338 video_url = urllib.unquote(mobj.group(1))
2339 video_id = os.path.basename(video_url)
2341 # here's a fun little line of code for you:
# Derive extension and id from the media file's basename.
2342 video_extension = os.path.splitext(video_id)[1][1:]
2343 video_id = os.path.splitext(video_id)[0]
2345 # it's tempting to parse this further, but you would
2346 # have to take into account all the variations like
2347 # Video Title - Site Name
2348 # Site Name | Video Title
2349 # Video Title - Tagline | Site Name
2350 # and so on and so forth; it's just not practical
2351 mobj = re.search(r'<title>(.*)</title>', webpage)
2353 self._downloader.trouble(u'ERROR: unable to extract title')
2355 video_title = mobj.group(1).decode('utf-8')
2356 video_title = sanitize_title(video_title)
2357 simple_title = _simplify_title(video_title)
2359 # video uploader is domain name
2360 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2362 self._downloader.trouble(u'ERROR: unable to extract title')
2364 video_uploader = mobj.group(1).decode('utf-8')
2367 # Process video information
2368 self._downloader.process_info({
2369 'id': video_id.decode('utf-8'),
2370 'url': video_url.decode('utf-8'),
2371 'uploader': video_uploader,
2372 'upload_date': u'NA',
2373 'title': video_title,
2374 'stitle': simple_title,
2375 'ext': video_extension.decode('utf-8'),
2379 except UnavailableVideoError, err:
2380 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): line-sampled excerpt (embedded original line numbers, elided
# guard/try lines). Comments only are added; no code tokens are altered.
# YoutubeSearchIE: handles ytsearch[N|all]: queries via the GData JSON API,
# then delegates each found video id to the wrapped YoutubeIE.
2383 class YoutubeSearchIE(InfoExtractor):
2384 """Information Extractor for YouTube search queries."""
2385 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2386 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2388 _max_youtube_results = 1000
2389 IE_NAME = u'youtube:search'
2391 def __init__(self, youtube_ie, downloader=None):
2392 InfoExtractor.__init__(self, downloader)
# The YoutubeIE instance that actually downloads each result.
2393 self._youtube_ie = youtube_ie
2395 def report_download_page(self, query, pagenum):
2396 """Report attempt to download playlist page with given number."""
2397 query = query.decode(preferredencoding())
2398 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2400 def _real_initialize(self):
2401 self._youtube_ie.initialize()
# Parse the "ytsearchN:terms" prefix: empty -> 1 result, 'all' -> capped max,
# otherwise the integer count (ValueError fallback elided here returns 1).
2403 def _real_extract(self, query):
2404 mobj = re.match(self._VALID_URL, query)
2406 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2409 prefix, query = query.split(':')
2411 query = query.encode('utf-8')
2413 self._download_n_results(query, 1)
2415 elif prefix == 'all':
2416 self._download_n_results(query, self._max_youtube_results)
2422 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2424 elif n > self._max_youtube_results:
2425 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2426 n = self._max_youtube_results
2427 self._download_n_results(query, n)
2429 except ValueError: # parsing prefix as integer fails
2430 self._download_n_results(query, 1)
2433 def _download_n_results(self, query, n):
2434 """Downloads a specified number of results for a query"""
# Page through the API 50 results at a time until the requested count (or the
# feed's totalItems) is reached.
2440 while (50 * pagenum) < limit:
2441 self.report_download_page(query, pagenum+1)
2442 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2443 request = urllib2.Request(result_url)
2445 data = urllib2.urlopen(request).read()
2446 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2447 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2449 api_response = json.loads(data)['data']
2451 new_ids = list(video['id'] for video in api_response['items'])
2452 video_ids += new_ids
2454 limit = min(n, api_response['totalItems'])
2457 if len(video_ids) > n:
2458 video_ids = video_ids[:n]
2459 for id in video_ids:
2460 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): line-sampled excerpt (embedded original line numbers, elided
# guard/try lines). Comments only are added; no code tokens are altered.
# GoogleSearchIE: handles gvsearch[N|all]: queries by scraping Google Video
# search result pages and delegating each docid to the wrapped GoogleIE.
2464 class GoogleSearchIE(InfoExtractor):
2465 """Information Extractor for Google Video search queries."""
2466 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2467 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2468 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2469 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2471 _max_google_results = 1000
2472 IE_NAME = u'video.google:search'
2474 def __init__(self, google_ie, downloader=None):
2475 InfoExtractor.__init__(self, downloader)
# The GoogleIE instance that actually downloads each result.
2476 self._google_ie = google_ie
2478 def report_download_page(self, query, pagenum):
2479 """Report attempt to download playlist page with given number."""
2480 query = query.decode(preferredencoding())
2481 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2483 def _real_initialize(self):
2484 self._google_ie.initialize()
# Same prefix-parsing scheme as YoutubeSearchIE: empty -> 1, 'all' -> cap,
# integer -> clamped count (ValueError fallback elided here returns 1).
2486 def _real_extract(self, query):
2487 mobj = re.match(self._VALID_URL, query)
2489 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2492 prefix, query = query.split(':')
2494 query = query.encode('utf-8')
2496 self._download_n_results(query, 1)
2498 elif prefix == 'all':
2499 self._download_n_results(query, self._max_google_results)
2505 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2507 elif n > self._max_google_results:
2508 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2509 n = self._max_google_results
2510 self._download_n_results(query, n)
2512 except ValueError: # parsing prefix as integer fails
2513 self._download_n_results(query, 1)
2516 def _download_n_results(self, query, n):
2517 """Downloads a specified number of results for a query"""
# Scrape result pages (10 per page via start=pagenum*10) until n ids are
# collected or no "next" link remains, then dispatch all collected ids.
2523 self.report_download_page(query, pagenum)
2524 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2525 request = urllib2.Request(result_url)
2527 page = urllib2.urlopen(request).read()
2528 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2529 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2532 # Extract video identifiers
2533 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2534 video_id = mobj.group(1)
2535 if video_id not in video_ids:
2536 video_ids.append(video_id)
2537 if len(video_ids) == n:
2538 # Specified n videos reached
2539 for id in video_ids:
2540 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2543 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2544 for id in video_ids:
2545 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2548 pagenum = pagenum + 1
# NOTE(review): line-sampled excerpt (embedded original line numbers, elided
# guard/try lines). Comments only are added; no code tokens are altered.
# YahooSearchIE: handles yvsearch[N|all]: queries by scraping Yahoo! Video
# search result pages and delegating each watch id to the wrapped YahooIE.
2551 class YahooSearchIE(InfoExtractor):
2552 """Information Extractor for Yahoo! Video search queries."""
2553 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2554 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2555 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2556 _MORE_PAGES_INDICATOR = r'\s*Next'
2558 _max_yahoo_results = 1000
2559 IE_NAME = u'video.yahoo:search'
2561 def __init__(self, yahoo_ie, downloader=None):
2562 InfoExtractor.__init__(self, downloader)
# The YahooIE instance that actually downloads each result.
2563 self._yahoo_ie = yahoo_ie
2565 def report_download_page(self, query, pagenum):
2566 """Report attempt to download playlist page with given number."""
2567 query = query.decode(preferredencoding())
2568 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2570 def _real_initialize(self):
2571 self._yahoo_ie.initialize()
# Same prefix-parsing scheme as the other search IEs: empty -> 1, 'all' ->
# cap, integer -> clamped count (ValueError fallback elided here returns 1).
2573 def _real_extract(self, query):
2574 mobj = re.match(self._VALID_URL, query)
2576 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2579 prefix, query = query.split(':')
2581 query = query.encode('utf-8')
2583 self._download_n_results(query, 1)
2585 elif prefix == 'all':
2586 self._download_n_results(query, self._max_yahoo_results)
2592 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2594 elif n > self._max_yahoo_results:
2595 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2596 n = self._max_yahoo_results
2597 self._download_n_results(query, n)
2599 except ValueError: # parsing prefix as integer fails
2600 self._download_n_results(query, 1)
2603 def _download_n_results(self, query, n):
2604 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedupes via a separate `already_seen` set rather
# than scanning the result list.
2607 already_seen = set()
2611 self.report_download_page(query, pagenum)
2612 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2613 request = urllib2.Request(result_url)
2615 page = urllib2.urlopen(request).read()
2616 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2617 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2620 # Extract video identifiers
2621 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2622 video_id = mobj.group(1)
2623 if video_id not in already_seen:
2624 video_ids.append(video_id)
2625 already_seen.add(video_id)
2626 if len(video_ids) == n:
2627 # Specified n videos reached
2628 for id in video_ids:
2629 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2632 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2633 for id in video_ids:
2634 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2637 pagenum = pagenum + 1
# NOTE(review): line-sampled excerpt (embedded original line numbers, elided
# guard/try lines). Comments only are added; no code tokens are altered.
# YoutubePlaylistIE: walks YouTube playlist/artist/course pages, collects the
# video ids, applies --playlist-start/--playlist-end, and hands each id to the
# wrapped YoutubeIE.
2640 class YoutubePlaylistIE(InfoExtractor):
2641 """Information Extractor for YouTube playlists."""
2643 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2644 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2645 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2646 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2648 IE_NAME = u'youtube:playlist'
2650 def __init__(self, youtube_ie, downloader=None):
2651 InfoExtractor.__init__(self, downloader)
# The YoutubeIE instance that actually downloads each playlist entry.
2652 self._youtube_ie = youtube_ie
2654 def report_download_page(self, playlist_id, pagenum):
2655 """Report attempt to download playlist page with given number."""
2656 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2658 def _real_initialize(self):
2659 self._youtube_ie.initialize()
2661 def _real_extract(self, url):
2662 # Extract playlist id
2663 mobj = re.match(self._VALID_URL, url)
2665 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 (a single video id within the playlist URL) short-circuits to a
# single-video extraction.
2669 if mobj.group(3) is not None:
2670 self._youtube_ie.extract(mobj.group(3))
2673 # Download playlist pages
2674 # prefix is 'p' as default for playlists but there are other types that need extra care
2675 playlist_prefix = mobj.group(1)
2676 if playlist_prefix == 'a':
2677 playlist_access = 'artist'
2679 playlist_prefix = 'p'
2680 playlist_access = 'view_play_list'
2681 playlist_id = mobj.group(2)
# Page through the playlist HTML until no "Next" link is found.
2686 self.report_download_page(playlist_id, pagenum)
2687 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2688 request = urllib2.Request(url)
2690 page = urllib2.urlopen(request).read()
2691 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2692 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2695 # Extract video identifiers
2697 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2698 if mobj.group(1) not in ids_in_page:
2699 ids_in_page.append(mobj.group(1))
2700 video_ids.extend(ids_in_page)
2702 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2704 pagenum = pagenum + 1
# Apply the user's playlist slice; playlistend == -1 means "to the end".
2706 playliststart = self._downloader.params.get('playliststart', 1) - 1
2707 playlistend = self._downloader.params.get('playlistend', -1)
2708 if playlistend == -1:
2709 video_ids = video_ids[playliststart:]
2711 video_ids = video_ids[playliststart:playlistend]
2713 for id in video_ids:
2714 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): line-sampled excerpt (embedded original line numbers, elided
# guard/try lines). Comments only are added; no code tokens are altered.
# YoutubeUserIE: lists all uploads of a YouTube user via the paged GData
# uploads feed, applies --playlist-start/--playlist-end, and delegates each
# video id to the wrapped YoutubeIE.
2718 class YoutubeUserIE(InfoExtractor):
2719 """Information Extractor for YouTube users."""
2721 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2722 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2723 _GDATA_PAGE_SIZE = 50
2724 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2725 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2727 IE_NAME = u'youtube:user'
2729 def __init__(self, youtube_ie, downloader=None):
2730 InfoExtractor.__init__(self, downloader)
# The YoutubeIE instance that actually downloads each upload.
2731 self._youtube_ie = youtube_ie
2733 def report_download_page(self, username, start_index):
2734 """Report attempt to download user page."""
2735 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2736 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2738 def _real_initialize(self):
2739 self._youtube_ie.initialize()
2741 def _real_extract(self, url):
2743 mobj = re.match(self._VALID_URL, url)
2745 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2748 username = mobj.group(1)
2750 # Download video ids using YouTube Data API. Result size per
2751 # query is limited (currently to 50 videos) so we need to query
2752 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2759 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2760 self.report_download_page(username, start_index)
2762 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2765 page = urllib2.urlopen(request).read()
2766 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2767 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2770 # Extract video identifiers
2773 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2774 if mobj.group(1) not in ids_in_page:
2775 ids_in_page.append(mobj.group(1))
2777 video_ids.extend(ids_in_page)
2779 # A little optimization - if current page is not
2780 # "full", ie. does not contain PAGE_SIZE video ids then
2781 # we can assume that this page is the last one - there
2782 # are no more ids on further pages - no need to query
2785 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's playlist slice; playlistend == -1 means "to the end".
2790 all_ids_count = len(video_ids)
2791 playliststart = self._downloader.params.get('playliststart', 1) - 1
2792 playlistend = self._downloader.params.get('playlistend', -1)
2794 if playlistend == -1:
2795 video_ids = video_ids[playliststart:]
2797 video_ids = video_ids[playliststart:playlistend]
2799 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2800 (username, all_ids_count, len(video_ids)))
2802 for video_id in video_ids:
2803 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): line-sampled excerpt (embedded original line numbers, elided
# guard/try lines). Comments only are added; no code tokens are altered.
# DepositFilesIE: file-host extractor. POSTs the "Free download" form to the
# English-locale page, then scrapes the real fileshare URL and title.
2806 class DepositFilesIE(InfoExtractor):
2807 """Information extractor for depositfiles.com"""
2809 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2810 IE_NAME = u'DepositFiles'
2812 def __init__(self, downloader=None):
2813 InfoExtractor.__init__(self, downloader)
2815 def report_download_webpage(self, file_id):
2816 """Report webpage download."""
2817 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2819 def report_extraction(self, file_id):
2820 """Report information extraction."""
2821 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2823 def _real_extract(self, url):
2824 # At this point we have a new file
2825 self._downloader.increment_downloads()
2827 file_id = url.split('/')[-1]
2828 # Rebuild url in english locale
2829 url = 'http://depositfiles.com/en/files/' + file_id
2831 # Retrieve file webpage with 'Free download' button pressed
# POST data simulates pressing the free-download button.
2832 free_download_indication = { 'gateway_result' : '1' }
2833 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2835 self.report_download_webpage(file_id)
2836 webpage = urllib2.urlopen(request).read()
2837 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2838 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2841 # Search for the real file URL
2842 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2843 if (mobj is None) or (mobj.group(1) is None):
2844 # Try to figure out reason of the error.
# On failure, surface the site's own restriction message when present.
2845 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2846 if (mobj is not None) and (mobj.group(1) is not None):
2847 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2848 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2850 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2853 file_url = mobj.group(1)
2854 file_extension = os.path.splitext(file_url)[1][1:]
2856 # Search for file title
2857 mobj = re.search(r'<b title="(.*?)">', webpage)
2859 self._downloader.trouble(u'ERROR: unable to extract title')
2861 file_title = mobj.group(1).decode('utf-8')
2864 # Process file information
2865 self._downloader.process_info({
2866 'id': file_id.decode('utf-8'),
2867 'url': file_url.decode('utf-8'),
2869 'upload_date': u'NA',
2870 'title': file_title,
2871 'stitle': file_title,
2872 'ext': file_extension.decode('utf-8'),
2876 except UnavailableVideoError, err:
2877 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): line-sampled excerpt (embedded original line numbers, elided
# guard/try lines). Comments only are added; no code tokens are altered.
# FacebookIE: logs in (credentials from CLI params or .netrc), fetches the
# video page, scrapes metadata and per-format source URLs from inline
# JavaScript, then selects format(s) per the user's --format settings.
2880 class FacebookIE(InfoExtractor):
2881 """Information Extractor for Facebook"""
2883 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2884 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2885 _NETRC_MACHINE = 'facebook'
# Quality order from best to worst; also the key names scraped from the page.
2886 _available_formats = ['video', 'highqual', 'lowqual']
2887 _video_extensions = {
2892 IE_NAME = u'facebook'
2894 def __init__(self, downloader=None):
2895 InfoExtractor.__init__(self, downloader)
2897 def _reporter(self, message):
2898 """Add header and report message."""
2899 self._downloader.to_screen(u'[facebook] %s' % message)
2901 def report_login(self):
2902 """Report attempt to log in."""
2903 self._reporter(u'Logging in')
2905 def report_video_webpage_download(self, video_id):
2906 """Report attempt to download video webpage."""
2907 self._reporter(u'%s: Downloading video webpage' % video_id)
2909 def report_information_extraction(self, video_id):
2910 """Report attempt to extract video information."""
2911 self._reporter(u'%s: Extracting video information' % video_id)
2913 def _parse_page(self, video_webpage):
2914 """Extract video information from page"""
# Map of metadata field -> regex against the page's inline JavaScript.
2916 data = {'title': r'\("video_title", "(.*?)"\)',
2917 'description': r'<div class="datawrap">(.*?)</div>',
2918 'owner': r'\("video_owner_name", "(.*?)"\)',
2919 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2922 for piece in data.keys():
2923 mobj = re.search(data[piece], video_webpage)
2924 if mobj is not None:
2925 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one source URL per known format name ("<fmt>_src" in the JS).
2929 for fmt in self._available_formats:
2930 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2931 if mobj is not None:
2932 # URL is in a Javascript segment inside an escaped Unicode format within
2933 # the generally utf-8 page
2934 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2935 video_info['video_urls'] = video_urls
# Login happens once, at initialization; failures only warn, never abort.
2939 def _real_initialize(self):
2940 if self._downloader is None:
2945 downloader_params = self._downloader.params
2947 # Attempt to use provided username and password or .netrc data
2948 if downloader_params.get('username', None) is not None:
2949 useremail = downloader_params['username']
2950 password = downloader_params['password']
2951 elif downloader_params.get('usenetrc', False):
2953 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2954 if info is not None:
2958 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2959 except (IOError, netrc.NetrcParseError), err:
2960 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2963 if useremail is None:
2972 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2975 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication did not succeed.
2976 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2977 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2979 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2980 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2983 def _real_extract(self, url):
2984 mobj = re.match(self._VALID_URL, url)
2986 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2988 video_id = mobj.group('ID')
2991 self.report_video_webpage_download(video_id)
2992 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2994 page = urllib2.urlopen(request)
2995 video_webpage = page.read()
2996 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2997 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3000 # Start extracting information
3001 self.report_information_extraction(video_id)
3003 # Extract information
3004 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory; thumbnail only warns when missing.
3007 if 'owner' not in video_info:
3008 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
3010 video_uploader = video_info['owner']
3013 if 'title' not in video_info:
3014 self._downloader.trouble(u'ERROR: unable to extract video title')
3016 video_title = video_info['title']
3017 video_title = video_title.decode('utf-8')
3018 video_title = sanitize_title(video_title)
3020 simple_title = _simplify_title(video_title)
3023 if 'thumbnail' not in video_info:
3024 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
3025 video_thumbnail = ''
3027 video_thumbnail = video_info['thumbnail']
# Upload date is parsed from an RFC-2822-style string into YYYYMMDD.
3031 if 'upload_date' in video_info:
3032 upload_time = video_info['upload_date']
3033 timetuple = email.utils.parsedate_tz(upload_time)
3034 if timetuple is not None:
3036 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
3041 video_description = video_info.get('description', 'No description available.')
3043 url_map = video_info['video_urls']
3044 if len(url_map.keys()) > 0:
3045 # Decide which formats to download
3046 req_format = self._downloader.params.get('format', None)
3047 format_limit = self._downloader.params.get('format_limit', None)
# format_limit truncates the candidate list to qualities at or below it.
3049 if format_limit is not None and format_limit in self._available_formats:
3050 format_list = self._available_formats[self._available_formats.index(format_limit):]
3052 format_list = self._available_formats
3053 existing_formats = [x for x in format_list if x in url_map]
3054 if len(existing_formats) == 0:
3055 self._downloader.trouble(u'ERROR: no known formats available for video')
3057 if req_format is None:
3058 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3059 elif req_format == 'worst':
3060 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3061 elif req_format == '-1':
3062 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3065 if req_format not in url_map:
3066 self._downloader.trouble(u'ERROR: requested format not available')
3068 video_url_list = [(req_format, url_map[req_format])] # Specific format
3070 for format_param, video_real_url in video_url_list:
3072 # At this point we have a new video
3073 self._downloader.increment_downloads()
3076 video_extension = self._video_extensions.get(format_param, 'mp4')
3079 # Process video information
3080 self._downloader.process_info({
3081 'id': video_id.decode('utf-8'),
3082 'url': video_real_url.decode('utf-8'),
3083 'uploader': video_uploader.decode('utf-8'),
3084 'upload_date': upload_date,
3085 'title': video_title,
3086 'stitle': simple_title,
3087 'ext': video_extension.decode('utf-8'),
3088 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3089 'thumbnail': video_thumbnail.decode('utf-8'),
3090 'description': video_description.decode('utf-8'),
3093 except UnavailableVideoError, err:
3094 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered/elided listing — the leading "30xx" numbers are paste
# artifacts and gaps in them mark lines missing from this view (guards, returns,
# dict entries). Code left byte-identical; comments annotate in place.
#
# BlipTVIE: extracts media info from blip.tv pages. Two paths: (1) the URL
# resolves directly to a media file (Content-Type video/*), in which case the
# filename supplies id/title/ext; (2) a JSON API response ("skin=json") is
# parsed for the 'Post' record.
3096 class BlipTVIE(InfoExtractor):
3097 """Information extractor for blip.tv"""
3099 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# _URL_EXT pulls the trailing file extension off the media URL.
3100 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3101 IE_NAME = u'blip.tv'
3103 def report_extraction(self, file_id):
3104 """Report information extraction."""
3105 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3107 def report_direct_download(self, title):
3108 """Report information extraction."""
3109 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3111 def _real_extract(self, url):
3112 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return around the trouble call)
3114 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&', chosen on lines elided here) joins the JSON-API query
# string onto the original page URL.
3121 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3122 request = urllib2.Request(json_url)
3123 self.report_extraction(mobj.group(1))
3126 urlh = urllib2.urlopen(request)
# Path 1: server answered with the media itself, not a JSON document.
3127 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3128 basename = url.split('/')[-1]
3129 title,ext = os.path.splitext(basename)
3130 title = title.decode('UTF-8')
# splitext keeps the dot; strip it so 'ext' is bare ("flv" not ".flv").
3131 ext = ext.replace('.', '')
3132 self.report_direct_download(title)
# (elided: the rest of the direct-download info dict built around this key)
3137 'stitle': _simplify_title(title),
3141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3142 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# Path 2: no direct download detected; parse the JSON API response.
3144 if info is None: # Regular URL
3146 json_code = urlh.read()
3147 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3148 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3152 json_data = json.loads(json_code)
# The API wraps the record in a 'Post' key for single-item responses.
3153 if 'Post' in json_data:
3154 data = json_data['Post']
# NOTE(review): format string mixes 24-hour %H with %p (AM/PM) — %I is the
# directive meant to pair with %p; verify against actual datestamp values.
3158 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3159 video_url = data['media']['url']
3160 umobj = re.match(self._URL_EXT, video_url)
# (elided: "if umobj is None:" guard before the raise)
3162 raise ValueError('Can not determine filename extension')
3163 ext = umobj.group(1)
# info dict assembled from the JSON record (some entries elided in listing).
3166 'id': data['item_id'],
3168 'uploader': data['display_name'],
3169 'upload_date': upload_date,
3170 'title': data['title'],
3171 'stitle': _simplify_title(data['title']),
3173 'format': data['media']['mimeType'],
3174 'thumbnail': data['thumbnailUrl'],
3175 'description': data['description'],
3176 'player_url': data['embedUrl']
# KeyError here means the JSON lacked an expected field; ValueError covers the
# extension/strptime failures above.
3178 except (ValueError,KeyError), err:
3179 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3182 self._downloader.increment_downloads()
3185 self._downloader.process_info(info)
3186 except UnavailableVideoError, err:
3187 self._downloader.trouble(u'\nERROR: unable to download video')
3190 class MyVideoIE(InfoExtractor):
3191 """Information Extractor for myvideo.de."""
3193 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3194 IE_NAME = u'myvideo'
3196 def __init__(self, downloader=None):
3197 InfoExtractor.__init__(self, downloader)
3199 def report_download_webpage(self, video_id):
3200 """Report webpage download."""
3201 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3203 def report_extraction(self, video_id):
3204 """Report information extraction."""
3205 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3207 def _real_extract(self,url):
3208 mobj = re.match(self._VALID_URL, url)
3210 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3213 video_id = mobj.group(1)
3216 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3218 self.report_download_webpage(video_id)
3219 webpage = urllib2.urlopen(request).read()
3220 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3221 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3224 self.report_extraction(video_id)
3225 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3228 self._downloader.trouble(u'ERROR: unable to extract media URL')
3230 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3232 mobj = re.search('<title>([^<]+)</title>', webpage)
3234 self._downloader.trouble(u'ERROR: unable to extract title')
3237 video_title = mobj.group(1)
3238 video_title = sanitize_title(video_title)
3240 simple_title = _simplify_title(video_title)
3243 self._downloader.process_info({
3247 'upload_date': u'NA',
3248 'title': video_title,
3249 'stitle': simple_title,
3254 except UnavailableVideoError:
3255 self._downloader.trouble(u'\nERROR: Unable to download video')
# NOTE(review): numbered/elided listing — leading "3xxx" numbers are paste
# artifacts; gaps mark missing lines (guards, returns, some assignments).
# Code left byte-identical; comments annotate in place.
#
# ComedyCentralIE: Daily Show / Colbert Report full episodes. Accepts short
# aliases (":tds", ":colbert", ...) which are rewritten to the show's
# full-episodes URL, follows the redirect to a specific episode, then walks
# the MRSS index feed and downloads each item via a mediaGen config XML.
3257 class ComedyCentralIE(InfoExtractor):
3258 """Information extractor for The Daily Show and Colbert Report """
3260 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3261 IE_NAME = u'comedycentral'
3263 def report_extraction(self, episode_id):
3264 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3266 def report_config_download(self, episode_id):
3267 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3269 def report_index_download(self, episode_id):
3270 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3272 def report_player_url(self, episode_id):
3273 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3275 def _real_extract(self, url):
3276 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return)
3278 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname aliases map onto the show's full-episodes landing page, then the
# URL is re-matched so the named groups are populated consistently.
3281 if mobj.group('shortname'):
3282 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3283 url = u'http://www.thedailyshow.com/full-episodes/'
# (elided: else branch header for the Colbert aliases)
3285 url = u'http://www.colbertnation.com/full-episodes/'
3286 mobj = re.match(self._VALID_URL, url)
3287 assert mobj is not None
# Landing-page URL (no episode component) means "download newest episode".
3289 dlNewest = not mobj.group('episode')
3291 epTitle = mobj.group('showname')
# (elided: else branch; episode-specific title)
3293 epTitle = mobj.group('episode')
3295 req = urllib2.Request(url)
3296 self.report_extraction(epTitle)
3298 htmlHandle = urllib2.urlopen(req)
3299 html = htmlHandle.read()
3300 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3301 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# After following the server redirect, geturl() gives the concrete episode
# URL; re-match to recover the episode group.
3304 url = htmlHandle.geturl()
3305 mobj = re.match(self._VALID_URL, url)
3307 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3309 if mobj.group('episode') == '':
3310 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3312 epTitle = mobj.group('episode')
# Flash player URL + mediaGen URI are embedded either as a <param> or a
# JavaScript "var url =" assignment.
3314 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3315 if len(mMovieParams) == 0:
3316 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3319 playerUrl_raw = mMovieParams[0][0]
3320 self.report_player_url(epTitle)
# Resolve redirects so the stored player_url is the final location.
3322 urlHandle = urllib2.urlopen(playerUrl_raw)
3323 playerUrl = urlHandle.geturl()
3324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3325 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3328 uri = mMovieParams[0][1]
3329 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3330 self.report_index_download(epTitle)
3332 indexXml = urllib2.urlopen(indexUrl).read()
3333 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3334 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One MRSS <item> per video segment of the episode.
3337 idoc = xml.etree.ElementTree.fromstring(indexXml)
3338 itemEls = idoc.findall('.//item')
3339 for itemEl in itemEls:
3340 mediaId = itemEl.findall('./guid')[0].text
# guid format: colon-separated; last component is the short id, the one
# before it the show domain (".com" stripped).
3341 shortMediaId = mediaId.split(':')[-1]
3342 showId = mediaId.split(':')[-2].replace('.com', '')
3343 officialTitle = itemEl.findall('./title')[0].text
3344 officialDate = itemEl.findall('./pubDate')[0].text
3346 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3347 urllib.urlencode({'uri': mediaId}))
3348 configReq = urllib2.Request(configUrl)
3349 self.report_config_download(epTitle)
3351 configXml = urllib2.urlopen(configReq).read()
3352 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3353 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Each <rendition> is one (bitrate, src) choice; collection into 'turls'
# happens on lines elided here.
3356 cdoc = xml.etree.ElementTree.fromstring(configXml)
3358 for rendition in cdoc.findall('.//rendition'):
3359 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3363 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3366 # For now, just pick the highest bitrate
3367 format,video_url = turls[-1]
3369 self._downloader.increment_downloads()
3371 effTitle = showId + u'-' + epTitle
# (elided: remainder of the info dict around these entries)
3376 'upload_date': officialDate,
3378 'stitle': _simplify_title(effTitle),
3382 'description': officialTitle,
3383 'player_url': playerUrl
3387 self._downloader.process_info(info)
3388 except UnavailableVideoError, err:
3389 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# NOTE(review): numbered/elided listing — leading "3xxx" numbers are paste
# artifacts; numbering gaps mark lines missing from this view. Code left
# byte-identical; comments annotate in place.
#
# EscapistIE: reads og:* meta tags from the page, extracts the player's
# "config=" URL, fetches that JS/JSON config and takes playlist[1] as the
# actual media entry.
3393 class EscapistIE(InfoExtractor):
3394 """Information extractor for The Escapist """
3396 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3397 IE_NAME = u'escapist'
3399 def report_extraction(self, showName):
3400 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3402 def report_config_download(self, showName):
3403 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3405 def _real_extract(self, url):
# HTMLParser instance only used for .unescape() of meta-tag contents.
3406 htmlParser = HTMLParser.HTMLParser()
3408 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return)
3410 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3412 showName = mobj.group('showname')
3413 videoId = mobj.group('episode')
3415 self.report_extraction(showName)
3417 webPage = urllib2.urlopen(url).read()
3418 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3419 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# NOTE(review): the four .group(1) calls below are unguarded — any missing
# meta tag raises AttributeError on the None match object rather than going
# through trouble(); worth hardening.
3422 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3423 description = htmlParser.unescape(descMatch.group(1))
3424 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3425 imgUrl = htmlParser.unescape(imgMatch.group(1))
3426 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3427 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL embeds a percent-encoded config URL after "config=".
3428 configUrlMatch = re.search('config=(.*)$', playerUrl)
3429 configUrl = urllib2.unquote(configUrlMatch.group(1))
3431 self.report_config_download(showName)
3433 configJSON = urllib2.urlopen(configUrl).read()
3434 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3435 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3438 # Technically, it's JavaScript, not JSON
# Single→double quote swap makes the JS object literal json-parseable (fragile
# if string values themselves contain quotes).
3439 configJSON = configJSON.replace("'", '"')
3442 config = json.loads(configJSON)
3443 except (ValueError,), err:
3444 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] (not [0]) holds the media entry here.
3447 playlist = config['playlist']
3448 videoUrl = playlist[1]['url']
3450 self._downloader.increment_downloads()
# (elided: remainder of the info dict around these entries)
3454 'uploader': showName,
3455 'upload_date': None,
3457 'stitle': _simplify_title(showName),
3460 'thumbnail': imgUrl,
3461 'description': description,
3462 'player_url': playerUrl,
3466 self._downloader.process_info(info)
3467 except UnavailableVideoError, err:
3468 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# NOTE(review): numbered/elided listing — leading "3xxx" numbers are paste
# artifacts; numbering gaps mark missing lines. Code left byte-identical;
# comments annotate in place.
#
# CollegeHumorIE: scrapes the page for an internal video id ("video:NNNN"),
# then fetches a moogaloop metadata XML for title/description/file URL.
3471 class CollegeHumorIE(InfoExtractor):
3472 """Information extractor for collegehumor.com"""
3474 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3475 IE_NAME = u'collegehumor'
3477 def report_webpage(self, video_id):
3478 """Report information extraction."""
3479 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3481 def report_extraction(self, video_id):
3482 """Report information extraction."""
3483 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3485 def _real_extract(self, url):
# NOTE(review): htmlParser is created but never used in the visible lines.
3486 htmlParser = HTMLParser.HTMLParser()
3488 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return)
3490 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3492 video_id = mobj.group('videoid')
3494 self.report_webpage(video_id)
3495 request = urllib2.Request(url)
3497 webpage = urllib2.urlopen(request).read()
3498 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3499 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds a second, internal id used by the metadata endpoint.
3502 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3504 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3506 internal_video_id = m.group('internalvideoid')
# (elided: start of the info dict this entry belongs to)
3510 'internal_id': internal_video_id,
3513 self.report_extraction(video_id)
3514 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3516 metaXml = urllib2.urlopen(xmlUrl).read()
3517 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3518 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3521 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError when a node is absent; caught by the
# handler reporting "Invalid metadata XML file" below.
3523 videoNode = mdoc.findall('./video')[0]
3524 info['description'] = videoNode.findall('./description')[0].text
3525 info['title'] = videoNode.findall('./caption')[0].text
3526 info['stitle'] = _simplify_title(info['title'])
3527 info['url'] = videoNode.findall('./file')[0].text
3528 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension taken from the tail of the file URL; format mirrors it.
3529 info['ext'] = info['url'].rpartition('.')[2]
3530 info['format'] = info['ext']
3532 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3535 self._downloader.increment_downloads()
3538 self._downloader.process_info(info)
3539 except UnavailableVideoError, err:
3540 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered/elided listing — leading "3xxx" numbers are paste
# artifacts; numbering gaps mark missing lines. Code left byte-identical;
# comments annotate in place.
#
# XVideosIE: pulls the percent-encoded flv URL, page title, and thumbnail
# straight out of the watch page HTML.
3543 class XVideosIE(InfoExtractor):
3544 """Information extractor for xvideos.com"""
3546 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3547 IE_NAME = u'xvideos'
3549 def report_webpage(self, video_id):
3550 """Report information extraction."""
3551 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3553 def report_extraction(self, video_id):
3554 """Report information extraction."""
3555 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3557 def _real_extract(self, url):
# NOTE(review): htmlParser is created but never used in the visible lines.
3558 htmlParser = HTMLParser.HTMLParser()
3560 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return)
3562 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3564 video_id = mobj.group(1).decode('utf-8')
3566 self.report_webpage(video_id)
# Canonicalize: always refetch via the bare /videoNNN URL.
3568 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3570 webpage = urllib2.urlopen(request).read()
3571 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3572 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3575 self.report_extraction(video_id)
# Media URL is percent-encoded in a "flv_url=" query parameter.
3579 mobj = re.search(r'flv_url=(.+?)&', webpage)
3581 self._downloader.trouble(u'ERROR: unable to extract video url')
3583 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title: page <title> up to the " - XVID" suffix.
3587 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3589 self._downloader.trouble(u'ERROR: unable to extract video title')
3591 video_title = mobj.group(1).decode('utf-8')
3594 # Extract video thumbnail
3595 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3597 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3599 video_thumbnail = mobj.group(1).decode('utf-8')
3603 self._downloader.increment_downloads()
# (elided: start of the info dict around these entries)
3608 'upload_date': None,
3609 'title': video_title,
3610 'stitle': _simplify_title(video_title),
3613 'thumbnail': video_thumbnail,
3614 'description': None,
3619 self._downloader.process_info(info)
3620 except UnavailableVideoError, err:
3621 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# NOTE(review): numbered/elided listing — leading "3xxx" numbers are paste
# artifacts; numbering gaps mark missing lines. Code left byte-identical;
# comments annotate in place.
3624 class SoundcloudIE(InfoExtractor):
3625 """Information extractor for soundcloud.com
3626 To access the media, the uid of the song and a stream token
3627 must be extracted from the page source and the script must make
3628 a request to media.soundcloud.com/crossdomain.xml. Then
3629 the media can be grabbed by requesting from an url composed
3630 of the stream token and uid
# (elided: closing of the class docstring)
# URL groups: (1) uploader slug, (2) track slug.
3633 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3634 IE_NAME = u'soundcloud'
3636 def __init__(self, downloader=None):
3637 InfoExtractor.__init__(self, downloader)
3639 def report_webpage(self, video_id):
3640 """Report information extraction."""
3641 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3643 def report_extraction(self, video_id):
3644 """Report information extraction."""
3645 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3647 def _real_extract(self, url):
# NOTE(review): htmlParser is created but never used in the visible lines.
3648 htmlParser = HTMLParser.HTMLParser()
3650 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return)
3652 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3655 # extract uploader (which is in the url)
3656 uploader = mobj.group(1).decode('utf-8')
3657 # extract simple title (uploader + slug of song title)
3658 slug_title = mobj.group(2).decode('utf-8')
3659 simple_title = uploader + '-' + slug_title
3661 self.report_webpage('%s/%s' % (uploader, slug_title))
3663 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3665 webpage = urllib2.urlopen(request).read()
3666 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3667 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3670 self.report_extraction('%s/%s' % (uploader, slug_title))
3672 # extract uid and stream token that soundcloud hands out for access
# NOTE(review): unguarded .group() calls below — a page-layout change would
# raise AttributeError on the None match rather than reporting via trouble().
3673 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3675 video_id = mobj.group(1)
3676 stream_token = mobj.group(2)
3678 # extract unsimplified title
3679 mobj = re.search('"title":"(.*?)",', webpage)
3681 title = mobj.group(1)
3683 # construct media url (with uid/token)
3684 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3685 mediaURL = mediaURL % (video_id, stream_token)
3688 description = u'No description available'
3689 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
# (elided: "if mobj is not None:" guard around this assignment)
3691 description = mobj.group(1)
# Upload date parsed from the "pretty-date" label; failure is tolerated on
# elided lines (except Exception below).
3695 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3698 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3699 except Exception, e:
3702 # for soundcloud, a request to a cross domain is required for cookies
3703 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3706 self._downloader.process_info({
3707 'id': video_id.decode('utf-8'),
# NOTE(review): 'title' uses simple_title (the slug), not the extracted
# "title" value — looks intentional-but-lossy; confirm.
3709 'uploader': uploader.decode('utf-8'),
3710 'upload_date': upload_date,
3711 'title': simple_title.decode('utf-8'),
3712 'stitle': simple_title.decode('utf-8'),
3716 'description': description.decode('utf-8')
3718 except UnavailableVideoError:
3719 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): numbered/elided listing — leading "3xxx" numbers are paste
# artifacts; numbering gaps mark missing lines. Code left byte-identical;
# comments annotate in place.
#
# InfoQIE: decodes a base64 "jsclassref" attribute into an rtmpe stream path.
3722 class InfoQIE(InfoExtractor):
3723 """Information extractor for infoq.com"""
3725 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
# (elided: IE_NAME assignment)
3728 def report_webpage(self, video_id):
3729 """Report information extraction."""
3730 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3732 def report_extraction(self, video_id):
3733 """Report information extraction."""
3734 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3736 def _real_extract(self, url):
# NOTE(review): htmlParser is created but never used in the visible lines.
3737 htmlParser = HTMLParser.HTMLParser()
3739 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return)
3741 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3744 self.report_webpage(url)
3746 request = urllib2.Request(url)
3748 webpage = urllib2.urlopen(request).read()
3749 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3750 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3753 self.report_extraction(url)
# jsclassref holds a base64-encoded, percent-encoded stream path.
3757 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3759 self._downloader.trouble(u'ERROR: unable to extract video url')
3761 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3765 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3767 self._downloader.trouble(u'ERROR: unable to extract video title')
3769 video_title = mobj.group(1).decode('utf-8')
3771 # Extract description
3772 video_description = u'No description available.'
3773 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3774 if mobj is not None:
3775 video_description = mobj.group(1).decode('utf-8')
# Video id and extension come from the last path component of the stream URL.
3777 video_filename = video_url.split('/')[-1]
3778 video_id, extension = video_filename.split('.')
3780 self._downloader.increment_downloads()
# (elided: start of the info dict around these entries)
3785 'upload_date': None,
3786 'title': video_title,
3787 'stitle': _simplify_title(video_title),
3789 'format': extension, # Extension is always(?) mp4, but seems to be flv
3791 'description': video_description,
3796 self._downloader.process_info(info)
3797 except UnavailableVideoError, err:
3798 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# NOTE(review): numbered/elided listing — leading "3xxx" numbers are paste
# artifacts; numbering gaps mark missing lines. Code left byte-identical;
# comments annotate in place.
#
# MixcloudIE: queries the cloudcast JSON API; formats arrive as a dict of
# format -> (optionally bitrate ->) url list, hence the TypeError fallbacks.
3800 class MixcloudIE(InfoExtractor):
3801 """Information extractor for www.mixcloud.com"""
3802 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3803 IE_NAME = u'mixcloud'
3805 def __init__(self, downloader=None):
3806 InfoExtractor.__init__(self, downloader)
3808 def report_download_json(self, file_id):
3809 """Report JSON download."""
3810 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3812 def report_extraction(self, file_id):
3813 """Report information extraction."""
3814 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3816 def get_urls(self, jsonData, fmt, bitrate='best'):
3817 """Get urls from 'audio_formats' section in json"""
# (elided: "try:" opening the TypeError probe below)
3820 bitrate_list = jsonData[fmt]
# 'best' or unknown bitrate falls back to the highest available key.
3821 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3822 bitrate = max(bitrate_list) # select highest
3824 url_list = jsonData[fmt][bitrate]
# Flat variant: jsonData[fmt] is already the url list, not a bitrate dict.
3825 except TypeError: # we have no bitrate info.
3826 url_list = jsonData[fmt]
# (elided: return of url_list)
3830 def check_urls(self, url_list):
3831 """Returns 1st active url from list"""
3832 for url in url_list:
# (elided: try/return around the probe; URL errors fall through to next url)
3834 urllib2.urlopen(url)
3836 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3841 def _print_formats(self, formats):
3842 print 'Available formats:'
3843 for fmt in formats.keys():
3844 for b in formats[fmt]:
# (elided: "try:" opening the TypeError probe below)
3846 ext = formats[fmt][b][0]
3847 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3848 except TypeError: # we have no bitrate info
3849 ext = formats[fmt][0]
3850 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
# (elided: break out of the inner loop for the flat variant)
3853 def _real_extract(self, url):
3854 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return)
3856 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3858 # extract uploader & filename from url
3859 uploader = mobj.group(1).decode('utf-8')
3860 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3862 # construct API request
# Rebuilds "<uploader>/<cloudcast>" from the URL path for the API endpoint.
3863 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3864 # retrieve .json file with links to files
3865 request = urllib2.Request(file_url)
3867 self.report_download_json(file_url)
3868 jsonData = urllib2.urlopen(request).read()
3869 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3870 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3874 json_data = json.loads(jsonData)
3875 player_url = json_data['player_swf_url']
3876 formats = dict(json_data['audio_formats'])
3878 req_format = self._downloader.params.get('format', None)
3881 if self._downloader.params.get('listformats', None):
3882 self._print_formats(formats)
# (elided: return after listing formats)
# Default/'best': first format whose url list contains a live URL wins.
3885 if req_format is None or req_format == 'best':
3886 for format_param in formats.keys():
3887 url_list = self.get_urls(formats, format_param)
3889 file_url = self.check_urls(url_list)
3890 if file_url is not None:
# (elided: break; else-branch header for an explicit requested format)
3893 if req_format not in formats.keys():
3894 self._downloader.trouble(u'ERROR: format is not available')
3897 url_list = self.get_urls(formats, req_format)
3898 file_url = self.check_urls(url_list)
3899 format_param = req_format
3902 self._downloader.increment_downloads()
3904 # Process file information
3905 self._downloader.process_info({
3906 'id': file_id.decode('utf-8'),
3907 'url': file_url.decode('utf-8'),
3908 'uploader': uploader.decode('utf-8'),
3909 'upload_date': u'NA',
3910 'title': json_data['name'],
3911 'stitle': _simplify_title(json_data['name']),
3912 'ext': file_url.split('.')[-1].decode('utf-8'),
3913 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3914 'thumbnail': json_data['thumbnail_url'],
3915 'description': json_data['description'],
3916 'player_url': player_url.decode('utf-8'),
3918 except UnavailableVideoError, err:
3919 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): numbered/elided listing — leading "3xxx/4xxx" numbers are paste
# artifacts; numbering gaps mark missing lines. Code left byte-identical;
# comments annotate in place.
#
# StanfordOpenClassroomIE: three URL shapes — a specific video (course+video),
# a course page (course only), or the site root. Course/root pages expand into
# playlists of 'reference' entries that are re-fed to self.extract().
3921 class StanfordOpenClassroomIE(InfoExtractor):
3922 """Information extractor for Stanford's Open ClassRoom"""
3924 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3925 IE_NAME = u'stanfordoc'
3927 def report_download_webpage(self, objid):
3928 """Report information extraction."""
3929 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3931 def report_extraction(self, video_id):
3932 """Report information extraction."""
3933 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3935 def _real_extract(self, url):
3936 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return)
3938 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# --- Case 1: a single video ---
3941 if mobj.group('course') and mobj.group('video'): # A specific video
3942 course = mobj.group('course')
3943 video = mobj.group('video')
# (elided: start of the info dict this entry belongs to)
3945 'id': _simplify_title(course + '_' + video),
3948 self.report_extraction(info['id'])
3949 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3950 xmlUrl = baseUrl + video + '.xml'
3952 metaXml = urllib2.urlopen(xmlUrl).read()
3953 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3954 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3956 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError for missing nodes; reported as invalid
# metadata XML below.
3958 info['title'] = mdoc.findall('./title')[0].text
3959 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3961 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3963 info['stitle'] = _simplify_title(info['title'])
3964 info['ext'] = info['url'].rpartition('.')[2]
3965 info['format'] = info['ext']
3966 self._downloader.increment_downloads()
3968 self._downloader.process_info(info)
3969 except UnavailableVideoError, err:
3970 self._downloader.trouble(u'\nERROR: unable to download video')
# --- Case 2: a course page -> playlist of VideoPage references ---
3971 elif mobj.group('course'): # A course page
3972 unescapeHTML = HTMLParser.HTMLParser().unescape
3974 course = mobj.group('course')
# (elided: start of the playlist info dict)
3976 'id': _simplify_title(course),
3980 self.report_download_webpage(info['id'])
3982 coursepage = urllib2.urlopen(url).read()
3983 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3984 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3987 m = re.search('<h1>([^<]+)</h1>', coursepage)
# (elided: "if m:" / "else:" around these two title assignments)
3989 info['title'] = unescapeHTML(m.group(1))
3991 info['title'] = info['id']
3992 info['stitle'] = _simplify_title(info['title'])
3994 m = re.search('<description>([^<]+)</description>', coursepage)
3996 info['description'] = unescapeHTML(m.group(1))
# Deduplicated, order-preserving list of VideoPage links on the course page.
3998 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
# (elided: comprehension header building info['list'] from these entries)
4001 'type': 'reference',
4002 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse: each referenced VideoPage goes back through Case 1.
4006 for entry in info['list']:
4007 assert entry['type'] == 'reference'
4008 self.extract(entry['url'])
# --- Case 3: site root -> playlist of CoursePage references ---
4010 unescapeHTML = HTMLParser.HTMLParser().unescape
4013 'id': 'Stanford OpenClassroom',
4017 self.report_download_webpage(info['id'])
4018 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
4020 rootpage = urllib2.urlopen(rootURL).read()
4021 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4022 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
4025 info['title'] = info['id']
4026 info['stitle'] = _simplify_title(info['title'])
4028 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
4031 'type': 'reference',
4032 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
4036 for entry in info['list']:
4037 assert entry['type'] == 'reference'
4038 self.extract(entry['url'])
# NOTE(review): numbered/elided listing — leading "4xxx" numbers are paste
# artifacts; numbering gaps mark missing lines. Code left byte-identical;
# comments annotate in place.
#
# MTVIE: reads mtv_vt/mtv_an/mtvn_uri meta tags plus the player's playlist id,
# then fetches a mediaGen XML and picks the last <rendition>.
4040 class MTVIE(InfoExtractor):
4041 """Information extractor for MTV.com"""
4043 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# (elided: IE_NAME assignment)
4046 def report_webpage(self, video_id):
4047 """Report information extraction."""
4048 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
4050 def report_extraction(self, video_id):
4051 """Report information extraction."""
4052 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4054 def _real_extract(self, url):
4055 mobj = re.match(self._VALID_URL, url)
# (elided: "if mobj is None:" guard and return)
4057 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme-less URLs are accepted by the pattern; normalize before fetching.
4059 if not mobj.group('proto'):
4060 url = 'http://' + url
4061 video_id = mobj.group('videoid')
4062 self.report_webpage(video_id)
4064 request = urllib2.Request(url)
4066 webpage = urllib2.urlopen(request).read()
4067 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4068 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# Meta tags are declared iso-8859-1; decode then unescape HTML entities.
4071 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4073 self._downloader.trouble(u'ERROR: unable to extract song name')
4075 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4076 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4078 self._downloader.trouble(u'ERROR: unable to extract performer')
4080 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4081 video_title = performer + ' - ' + song_name
4083 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads "unable to mtvn_uri" — missing the word
# "extract"; cosmetic, but inconsistent with sibling messages.
4085 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4087 mtvn_uri = mobj.group(1)
4089 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4091 self._downloader.trouble(u'ERROR: unable to extract content id')
4093 content_id = mobj.group(1)
4095 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4096 self.report_extraction(video_id)
4097 request = urllib2.Request(videogen_url)
4099 metadataXml = urllib2.urlopen(request).read()
4100 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4101 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4104 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4105 renditions = mdoc.findall('.//rendition')
4107 # For now, always pick the highest quality.
4108 rendition = renditions[-1]
# Format label assembled as "<ext>-<width>x<height>_<bitrate>".
4111 _,_,ext = rendition.attrib['type'].partition('/')
4112 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4113 video_url = rendition.find('./src').text
# NOTE(review): only trouble() call in this method without a u'' prefix or
# 'ERROR:' tag — inconsistent with the rest of the file.
4115 self._downloader.trouble('Invalid rendition field.')
4118 self._downloader.increment_downloads()
# (elided: start of the info dict around these entries)
4122 'uploader': performer,
4123 'title': video_title,
4124 'stitle': _simplify_title(video_title),
4130 self._downloader.process_info(info)
4131 except UnavailableVideoError, err:
4132 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    PostProcessor.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        # The downloader is optional at construction time; it can be
        # attached later via set_downloader() during registration.
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader
        it was called from.
        """
        return information # by default, do nothing
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe fails while extracting or converting audio.

    BUGFIX: previously derived from BaseException, which (like
    SystemExit/KeyboardInterrupt) escapes generic `except Exception`
    handlers; user-defined errors should derive from Exception.
    Catching `AudioConversionError` directly still works, so callers
    are unaffected.
    """

    def __init__(self, message):
        # Initialize the base class too so str(err) carries the message.
        Exception.__init__(self, message)
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that extracts the audio track of a downloaded video
    into a standalone audio file using ffmpeg/ffprobe.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec      # target codec name or 'best'
        self._preferredquality = preferredquality  # ffmpeg -ab bitrate spec
        self._keepvideo = keepvideo                # keep source video after extraction

    @staticmethod
    def get_audio_codec(path):
        """Return the codec name of the audio stream in *path*, or None
        if ffprobe is unavailable or no audio stream is found."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        # ffprobe prints key=value lines per stream; remember the last
        # codec_name seen and report it once the audio stream is reached.
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to write the audio of *path* into *out_path*.

        Raises AudioConversionError when ffmpeg is missing or exits
        with a non-zero status (the last stderr line is the message).
        """
        if codec is None:
            acodec_opts = []
        else:
            acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            else:
                raise e
        if p.returncode != 0:
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                acodec = 'copy'
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'
            if self._preferredcodec == 'wav':
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        try:
            self.run_ffmpeg(path, new_path, acodec, more_opts)
        except:
            etype,e,tb = sys.exc_info()
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            else:
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
            except:
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
4311 def updateSelf(downloader, filename):
4312 ''' Update the program file with the latest version from the repository '''
4313 # Note: downloader only used for options
4314 if not os.access(filename, os.W_OK):
4315 sys.exit('ERROR: no write permissions on %s' % filename)
4317 downloader.to_screen(u'Updating to latest version...')
4321 urlh = urllib.urlopen(UPDATE_URL)
4322 newcontent = urlh.read()
4324 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4325 if vmatch is not None and vmatch.group(1) == __version__:
4326 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4330 except (IOError, OSError), err:
4331 sys.exit('ERROR: unable to download latest version')
4334 outf = open(filename, 'wb')
4336 outf.write(newcontent)
4339 except (IOError, OSError), err:
4340 sys.exit('ERROR: unable to overwrite current version')
4342 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4345 def _readOptions(filename_bytes):
4347 optionf = open(filename_bytes)
4349 return [] # silently skip if file is not present
4353 res += shlex.split(l, comments=True)
4358 def _format_option_string(option):
4359 ''' ('-o', '--option') -> -o, --format METAVAR'''
4363 if option._short_opts: opts.append(option._short_opts[0])
4364 if option._long_opts: opts.append(option._long_opts[0])
4365 if len(opts) > 1: opts.insert(1, ', ')
4367 if option.takes_value(): opts.append(' %s' % option.metavar)
4369 return "".join(opts)
4371 def _find_term_columns():
4372 columns = os.environ.get('COLUMNS', None)
4377 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4378 out,err = sp.communicate()
4379 return int(out.split()[1])
def parseOpts():
    """Build the option parser, prepend options from the system and user
    config files to argv, and parse. Returns (parser, opts, args)."""
    max_width = 80
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns: max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    kw = {
        'version'          : __version__,
        'formatter'        : fmt,
        'usage'            : '%prog [options] url [url...]',
        'conflict_handler' : 'resolve',
    }

    parser = optparse.OptionParser(**kw)

    # option groups
    general        = optparse.OptionGroup(parser, 'General Options')
    selection      = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format   = optparse.OptionGroup(parser, 'Video Format Options')
    postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
    selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--prefer-free-formats',
            action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
    video_format.add_option('--write-srt',
            action='store_true', dest='writesubtitles',
            help='write video closed captions to a .srt file (currently youtube only)', default=False)
    video_format.add_option('--srt-lang',
            action='store', dest='subtitleslang', metavar='LANG',
            help='language of the closed captions to download (optional) use IETF language tags like \'en\'')

    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)
    verbosity.add_option('-v', '--verbose',
            action='store_true', dest='verbose', help='print various debugging information', default=False)

    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)

    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')

    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    # Config-file options come first so the command line can override them.
    xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
    if xdg_config_home:
        userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
    else:
        userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
    argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
    opts, args = parser.parse_args(argv)

    return parser, opts, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # These three are shared with their dependent playlist/search extractors.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # NOTE(review): entries between the explicitly listed ones were
    # reconstructed from the extractor classes defined in this file —
    # confirm the exact set and order against the class definitions above.
    return [
        YoutubePlaylistIE(youtube_ie),
        YoutubeUserIE(youtube_ie),
        YoutubeSearchIE(youtube_ie),
        youtube_ie,
        MetacafeIE(youtube_ie),
        DailymotionIE(),
        google_ie,
        GoogleSearchIE(google_ie),
        PhotobucketIE(),
        yahoo_ie,
        YahooSearchIE(yahoo_ie),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),

        GenericIE()
    ]
4595 parser, opts, args = parseOpts()
4597 # Open appropriate CookieJar
4598 if opts.cookiefile is None:
4599 jar = cookielib.CookieJar()
4602 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4603 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4605 except (IOError, OSError), err:
4606 sys.exit(u'ERROR: unable to open cookie file')
4609 if opts.dump_user_agent:
4610 print std_headers['User-Agent']
4613 # Batch file verification
4615 if opts.batchfile is not None:
4617 if opts.batchfile == '-':
4620 batchfd = open(opts.batchfile, 'r')
4621 batchurls = batchfd.readlines()
4622 batchurls = [x.strip() for x in batchurls]
4623 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4625 sys.exit(u'ERROR: batch file could not be read')
4626 all_urls = batchurls + args
4628 # General configuration
4629 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4630 proxy_handler = urllib2.ProxyHandler()
4631 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4632 urllib2.install_opener(opener)
4633 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4636 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4638 extractors = gen_extractors()
4640 if opts.list_extractors:
4641 for ie in extractors:
4643 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4644 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4645 for mu in matchedUrls:
4649 # Conflicting, missing and erroneous options
4650 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4651 parser.error(u'using .netrc conflicts with giving username/password')
4652 if opts.password is not None and opts.username is None:
4653 parser.error(u'account username missing')
4654 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4655 parser.error(u'using output template conflicts with using title, literal title or auto number')
4656 if opts.usetitle and opts.useliteral:
4657 parser.error(u'using title conflicts with using literal title')
4658 if opts.username is not None and opts.password is None:
4659 opts.password = getpass.getpass(u'Type account password and press return:')
4660 if opts.ratelimit is not None:
4661 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4662 if numeric_limit is None:
4663 parser.error(u'invalid rate limit specified')
4664 opts.ratelimit = numeric_limit
4665 if opts.retries is not None:
4667 opts.retries = long(opts.retries)
4668 except (TypeError, ValueError), err:
4669 parser.error(u'invalid retry count specified')
4671 opts.playliststart = int(opts.playliststart)
4672 if opts.playliststart <= 0:
4673 raise ValueError(u'Playlist start must be positive')
4674 except (TypeError, ValueError), err:
4675 parser.error(u'invalid playlist start number specified')
4677 opts.playlistend = int(opts.playlistend)
4678 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4679 raise ValueError(u'Playlist end must be greater than playlist start')
4680 except (TypeError, ValueError), err:
4681 parser.error(u'invalid playlist end number specified')
4682 if opts.extractaudio:
4683 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4684 parser.error(u'invalid audio format specified')
4687 fd = FileDownloader({
4688 'usenetrc': opts.usenetrc,
4689 'username': opts.username,
4690 'password': opts.password,
4691 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4692 'forceurl': opts.geturl,
4693 'forcetitle': opts.gettitle,
4694 'forcethumbnail': opts.getthumbnail,
4695 'forcedescription': opts.getdescription,
4696 'forcefilename': opts.getfilename,
4697 'forceformat': opts.getformat,
4698 'simulate': opts.simulate,
4699 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4700 'format': opts.format,
4701 'format_limit': opts.format_limit,
4702 'listformats': opts.listformats,
4703 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4704 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4705 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4706 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4707 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4708 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4709 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4710 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4711 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4712 or u'%(id)s.%(ext)s'),
4713 'ignoreerrors': opts.ignoreerrors,
4714 'ratelimit': opts.ratelimit,
4715 'nooverwrites': opts.nooverwrites,
4716 'retries': opts.retries,
4717 'continuedl': opts.continue_dl,
4718 'noprogress': opts.noprogress,
4719 'playliststart': opts.playliststart,
4720 'playlistend': opts.playlistend,
4721 'logtostderr': opts.outtmpl == '-',
4722 'consoletitle': opts.consoletitle,
4723 'nopart': opts.nopart,
4724 'updatetime': opts.updatetime,
4725 'writedescription': opts.writedescription,
4726 'writeinfojson': opts.writeinfojson,
4727 'writesubtitles': opts.writesubtitles,
4728 'subtitleslang': opts.subtitleslang,
4729 'matchtitle': opts.matchtitle,
4730 'rejecttitle': opts.rejecttitle,
4731 'max_downloads': opts.max_downloads,
4732 'prefer_free_formats': opts.prefer_free_formats,
4733 'verbose': opts.verbose,
4735 for extractor in extractors:
4736 fd.add_info_extractor(extractor)
4739 if opts.extractaudio:
4740 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4743 if opts.update_self:
4744 updateSelf(fd, sys.argv[0])
4747 if len(all_urls) < 1:
4748 if not opts.update_self:
4749 parser.error(u'you must provide at least one URL')
4754 retcode = fd.download(all_urls)
4755 except MaxDownloadsReached:
4756 fd.to_screen(u'--max-download limit reached, aborting.')
4759 # Dump cookie jar if requested
4760 if opts.cookiefile is not None:
4763 except (IOError, OSError), err:
4764 sys.exit(u'ERROR: unable to save cookie jar')
def main():
    """Entry point: run _real_main and map fatal errors to exit codes."""
    try:
        _real_main()
    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')
if __name__ == '__main__':
    main()
4781 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: