2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.01.08b'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
95 def raiseError(msg, i):
96 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
97 def skipSpace(i, expectMore=True):
98 while i < len(s) and s[i] in ' \t\r\n':
102 raiseError('Premature end', i)
104 def decodeEscape(match):
120 return unichr(int(esc[1:5], 16))
121 if len(esc) == 5+6 and esc[5:7] == '\\u':
122 hi = int(esc[1:5], 16)
123 low = int(esc[7:11], 16)
124 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
125 raise ValueError('Unknown escape ' + str(esc))
132 while s[e-bslashes-1] == '\\':
134 if bslashes % 2 == 1:
138 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
139 stri = rexp.sub(decodeEscape, s[i:e])
145 if s[i] == '}': # Empty dictionary
149 raiseError('Expected a string object key', i)
150 i,key = parseString(i)
152 if i >= len(s) or s[i] != ':':
153 raiseError('Expected a colon', i)
160 raiseError('Expected comma or closing curly brace', i)
165 if s[i] == ']': # Empty array
170 i = skipSpace(i) # Raise exception if premature end
174 raiseError('Expected a comma or closing bracket', i)
176 def parseDiscrete(i):
177 for k,v in {'true': True, 'false': False, 'null': None}.items():
178 if s.startswith(k, i):
180 raiseError('Not a boolean (or null)', i)
182 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
184 raiseError('Not a number', i)
186 if '.' in nums or 'e' in nums or 'E' in nums:
187 return (i+len(nums), float(nums))
188 return (i+len(nums), int(nums))
189 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
192 i,res = CHARMAP.get(s[i], parseNumber)(i)
193 i = skipSpace(i, False)
197 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# NOTE(review): this listing is an excerpt; the generator's error
	# fallback and yield statements are elided here.
	def yield_preferredencoding():
		pref = locale.getpreferredencoding()
	# Pull the first value out of the (elided) generator body.
	return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	# NOTE(review): excerpt -- several guard/branch lines (the mobj check
	# and the base-10/base-16 selection) are elided in this listing.
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# NOTE(review): '#(x?\d+)' cannot match hexadecimal entities that use
	# the digits a-f (e.g. &#xAF;) since \d only covers 0-9 -- worth
	# confirming whether [0-9a-fA-F]+ is needed here.
	mobj = re.match(ur'(?u)#(x?\d+)', entity)
	numstr = mobj.group(1)
	if numstr.startswith(u'x'):
		# Prefix with '0' so long(numstr, 16) accepts the '0x...' form.
		numstr = u'0%s' % numstr
	return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# First decode HTML entities, then neutralise the path separator so
	# the title cannot point into another directory.
	decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
	return decoded.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	# NOTE(review): excerpt -- the enclosing try: and the special-casing
	# of filename == u'-' (stdout) are elided in this listing.
	if sys.platform == 'win32':
		# Switch stdout to binary mode so video bytes are not mangled.
		msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
	return (sys.stdout, filename)
	stream = open(_encodeFilename(filename), open_mode)
	return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(_encodeFilename(filename), open_mode)
		return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# NOTE(review): excerpt -- the initialisation and final return of
	# `timestamp` are elided in this listing.
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
def _orderedSet(iterable):
	""" Remove all duplicates from the input iterable """
	# NOTE(review): excerpt -- the implementation body is elided in this
	# listing; only the signature and docstring are visible.
def _unescapeHTML(s):
	"""Replace HTML entities in a string with their characters.

	@param s a string (of type unicode)
	"""
	assert type(s) == type(u'')
	return HTMLParser.HTMLParser().unescape(s)
def _encodeFilename(s):
	"""Encode a unicode filename for use with the local filesystem.

	@param s The name of the file (of type unicode)
	"""
	assert type(s) == type(u'')

	# NOTE(review): excerpt -- the `return s` for the win32 branch is
	# elided in this listing.
	if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
		# Pass u'' directly to use Unicode APIs on Windows 2000 and up
		# (Detecting Windows NT 4 is tricky because 'major >= 4' would
		# match Windows 9x series as well. Besides, NT 4 is obsolete.)
	return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
class MaxDownloadsReached(Exception):
	""" --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""

	def __init__(self, downloaded, expected):
		# Keep both byte counts so the caller can report the mismatch.
		self.downloaded = downloaded
		self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	# NOTE(review): excerpt -- the def/try scaffolding of the deflate()
	# helper, decorators, and `old_resp` bookkeeping are elided here.

	# Raw deflate first; fall back to a zlib-wrapped stream.
		return zlib.decompress(data, -zlib.MAX_WBITS)
		return zlib.decompress(data)

	def addinfourl_wrapper(stream, headers, url, code):
		# Older urllib2 addinfourl objects lack getcode(); emulate it.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)

	def http_request(self, req):
		# Add any standard header the request does not already carry.
		for h in std_headers:
			req.add_header(h, std_headers[h])
		# Honour the internal no-compression marker, then strip it so it
		# is never sent over the wire.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']

	def http_response(self, req, resp):
		# Transparently decompress gzip responses.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# And deflate responses as well.
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username: Username for authentication purposes.
	password: Password for authentication purposes.
	usenetrc: Use netrc for authentication instead.
	quiet: Do not print messages to stdout.
	forceurl: Force printing final URL.
	forcetitle: Force printing title.
	forcethumbnail: Force printing thumbnail URL.
	forcedescription: Force printing description.
	forcefilename: Force printing final filename.
	simulate: Do not download the video files.
	format: Video format code.
	format_limit: Highest quality format to try.
	outtmpl: Template for output names.
	ignoreerrors: Do not stop on download errors.
	ratelimit: Download speed limit, in bytes/sec.
	nooverwrites: Prevent overwriting files.
	retries: Number of times to retry for HTTP error 5xx
	continuedl: Try to continue downloads if possible.
	noprogress: Do not print the progress bar.
	playliststart: Playlist item to start at.
	playlistend: Playlist item to end at.
	matchtitle: Download only matching titles.
	rejecttitle: Reject downloads for matching titles.
	logtostderr: Log messages to stderr instead of stdout.
	consoletitle: Display progress in console window's titlebar.
	nopart: Do not use temporary .part files.
	updatetime: Use the Last-modified header to set output file timestamps.
	writedescription: Write the video description to a .description file
	writeinfojson: Write the video description to a .info.json file
	"""

	# Exit code of the last download run (set per instance in __init__).
	_download_retcode = None
	# Ordinal counter used for the %(autonumber)s template field.
	_num_downloads = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		# NOTE(review): excerpt -- initialisation of the extractor and
		# post-processor lists (and storing `params`) is elided here.
		self._download_retcode = 0
		self._num_downloads = 0
		# Progress/messages go to stderr when 'logtostderr' is set.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
	# NOTE(review): excerpt -- these helpers take no `self`; their
	# (presumably @staticmethod) decorators and several guard lines are
	# elided in this listing. calc_speed below calls format_bytes through
	# the class name, consistent with that assumption -- TODO confirm.

	def format_bytes(bytes):
		# Human-readable size: 'b' then k/M/G/... by powers of 1024.
		if type(bytes) is str:
		exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024 ** exponent)
		return '%.2f%s' % (converted, suffix)

	def calc_percent(byte_counter, data_len):
		# Fixed-width percentage string for the progress line.
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	def calc_eta(start, now, total, current):
		# Estimated remaining time from bytes done vs. elapsed time.
		if current == 0 or dif < 0.001: # One millisecond
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		return '%02d:%02d' % (eta_mins, eta_secs)

	def calc_speed(start, now, bytes):
		# Fixed-width transfer-rate string.
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	def best_block_size(elapsed_time, bytes):
		# Adapt the read size to the observed rate, within [min, 4 MB].
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
		rate = bytes / elapsed_time

	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		number = float(matchobj.group(1))
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		# NOTE(review): excerpt -- the append to the internal extractor
		# list is elided here; mutual registration follows.
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		# NOTE(review): excerpt -- the append to the internal chain is
		# elided here.
		pp.set_downloader(self)
	def to_screen(self, message, skip_eol=False):
		"""Print message to stdout if not in quiet mode."""
		assert type(message) == type(u'')
		if not self.params.get('quiet', False):
			terminator = [u'\n', u''][skip_eol]
			output = message + terminator
			# NOTE(review): excerpt -- one line is elided before this check.
			if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
				output = output.encode(preferredencoding(), 'ignore')
			self._screen_file.write(output)
			self._screen_file.flush()

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		# NOTE(review): excerpt -- the early `return` for the disabled
		# case is elided after the next line.
		if not self.params.get('consoletitle', False):
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# Xterm-style escape sequence to set the window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
612 def fixed_template(self):
613 """Checks if the output template is fixed."""
614 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Errors were ignored: record failure in the final exit code.
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		# NOTE(review): excerpt -- the early return, the `now` timestamp
		# and an elapsed-time guard are elided in this listing.
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
		elapsed = now - start_time
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough to fall back under the limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
	def temp_name(self, filename):
		"""Returns a temporary filename for the given filename."""
		# NOTE(review): excerpt -- the `return filename` for the
		# no-part-file cases is elided after this condition.
		if self.params.get('nopart', False) or filename == u'-' or \
				(os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
		return filename + u'.part'

	def undo_temp_name(self, filename):
		# Strip the .part suffix again; the passthrough return for other
		# names is elided in this listing.
		if filename.endswith(u'.part'):
			return filename[:-len(u'.part')]

	def try_rename(self, old_filename, new_filename):
		# Best-effort rename of the .part file to its final name; the
		# try: wrapper is elided in this listing.
		if old_filename == new_filename:
		os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
	def try_utime(self, filename, last_modified_hdr):
		"""Try to set the last-modified time of the given file."""
		# NOTE(review): excerpt -- the early returns, header parsing
		# guards and the final `return filetime` are elided here.
		if last_modified_hdr is None:
		if not os.path.isfile(_encodeFilename(filename)):
		timestr = last_modified_hdr
		filetime = timeconvert(timestr)
		# Keep atime as "now", set mtime from the server header.
		os.utime(filename, (time.time(), filetime))
680 def report_writedescription(self, descfn):
681 """ Report that the description file is being written """
682 self.to_screen(u'[info] Writing video description to: ' + descfn)
684 def report_writeinfojson(self, infofn):
685 """ Report that the metadata file has been written """
686 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
688 def report_destination(self, filename):
689 """Report destination filename."""
690 self.to_screen(u'[download] Destination: ' + filename)
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		# NOTE(review): excerpt -- the early `return` for the noprogress
		# case is elided after this check.
		if self.params.get('noprogress', False):
		# \r rewrites the same console line on each update.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
701 def report_resuming_byte(self, resume_len):
702 """Report attempt to resume at given byte."""
703 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
705 def report_retry(self, count, retries):
706 """Report retry in case of HTTP error 5xx"""
707 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		# NOTE(review): excerpt -- the try: wrapper around the first
		# message is elided; the except falls back to a name-free notice.
		self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			self.to_screen(u'[download] The file has already been downloaded')
716 def report_unable_to_resume(self):
717 """Report it was impossible to resume download."""
718 self.to_screen(u'[download] Unable to resume')
	def report_finish(self):
		"""Report download finished."""
		# NOTE(review): excerpt -- the else-branch (terminating the
		# progress line) is elided in this listing.
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
727 def increment_downloads(self):
728 """Increment the ordinal that assigns a number to each file."""
729 self._num_downloads += 1
	def prepare_filename(self, info_dict):
		"""Generate the output filename."""
		# NOTE(review): excerpt -- the try: wrapper and the successful
		# `return filename` are elided in this listing.
		template_dict = dict(info_dict)
		template_dict['epoch'] = unicode(long(time.time()))
		template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
		filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')

	def _match_entry(self, info_dict):
		""" Returns None iff the file should be downloaded """
		# NOTE(review): excerpt -- the final `return None` is elided.
		title = info_dict['title']
		matchtitle = self.params.get('matchtitle', False)
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
		rejecttitle = self.params.get('rejecttitle', False)
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# NOTE(review): excerpt -- numerous lines (early returns, try:
		# wrappers, file close() calls) are elided throughout this method.

		# Skip files the title filters reject.
		reason = self._match_entry(info_dict)
		if reason is not None:
			self.to_screen(u'[download] ' + reason)

		# Enforce --max-downloads.
		max_downloads = self.params.get('max_downloads')
		if max_downloads is not None:
			if self._num_downloads > int(max_downloads):
				raise MaxDownloadsReached()

		filename = self.prepare_filename(info_dict)

		# Forced printing of metadata fields (for --get-title etc.).
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):

		# Create the target directory if needed.
		dn = os.path.dirname(_encodeFilename(filename))
		if dn != '' and not os.path.exists(dn): # dn is already encoded
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))

		if self.params.get('writedescription', False):
			descfn = filename + u'.description'
			self.report_writedescription(descfn)
			descfile = open(_encodeFilename(descfn), 'wb')
			descfile.write(info_dict['description'].encode('utf-8'))
		except (OSError, IOError):
			self.trouble(u'ERROR: Cannot write description file ' + descfn)

		if self.params.get('writeinfojson', False):
			infofn = filename + u'.info.json'
			self.report_writeinfojson(infofn)
		except (NameError,AttributeError):
			# `json` may be the trivialjson fallback or missing entirely.
			self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
			infof = open(_encodeFilename(infofn), 'wb')
			# 'urlhandle' holds a live socket object and is not serializable.
			json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
			json.dump(json_info_dict, infof)
		except (OSError, IOError):
			self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)

		if not self.params.get('skip_download', False):
			if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
			success = self._do_download(filename, info_dict)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

			self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
	def download(self, url_list):
		"""Download a given list of URLs."""
		# NOTE(review): excerpt -- the loops over url_list and the
		# registered InfoExtractors (and their continue/break lines) are
		# elided in this listing.
		if len(url_list) > 1 and self.fixed_template():
			# A fixed template would write every URL to the same file.
			raise SameFileError(self.params['outtmpl'])

			suitable_found = False
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it

				# Suitable InfoExtractor had been found; go to next URL

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		# NOTE(review): excerpt -- copying ie_info into `info` and the
		# loop over the post-processor chain are elided here.
		info['filepath'] = filename
	def _download_with_rtmpdump(self, filename, url, player_url):
		# Delegate rtmp:// URLs to the external rtmpdump binary.
		# NOTE(review): excerpt -- try: wrappers, returns and some break
		# lines are elided throughout this method.
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(_encodeFilename(tmpfilename))
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(_encodeFilename(tmpfilename))
			if prevsize == cursize and retval == 1:
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
		self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
		self.try_rename(tmpfilename, filename)
		self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
	def _do_download(self, filename, info_dict):
		# Core HTTP download loop with resume/retry support.
		# NOTE(review): excerpt -- many lines (try: wrappers, returns,
		# counters such as `count`, `start`, `open_mode`, `after`) are
		# elided throughout this method.
		url = info_dict['url']
		player_url = info_dict.get('player_url', None)

		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without a Range header as a fallback.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(_encodeFilename(tmpfilename)):
			resume_len = os.path.getsize(_encodeFilename(tmpfilename))

		if self.params.get('continuedl', False):
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)

		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			if count == 0 and 'urlhandle' in info_dict:
				# An extractor may hand us an already-open response.
				data = info_dict['urlhandle']
			data = urllib2.urlopen(request)
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					# Open the connection again without the range header
					data = urllib2.urlopen(basic_request)
					content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
					# Examine the reported length
					if (content_length is not None and
							(resume_len - 100 < long(content_length) < resume_len + 100)):
						# The file had already been fully downloaded.
						# Explanation to the above condition: in issue #175 it was revealed that
						# YouTube sometimes adds or removes a few bytes from the end of the file,
						# changing the file size slightly and causing problems for some users. So
						# I decided to implement a suggested change and consider the file
						# completely downloaded if the file size differs less than 100 bytes from
						# the one in the hard drive.
						self.report_file_already_downloaded(filename)
						self.try_rename(tmpfilename, filename)
						# The length does not match, we start the download over
						self.report_unable_to_resume()
			if count <= retries:
				self.report_retry(count, retries)
			self.trouble(u'ERROR: giving up after %s retries' % retries)

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len

			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			if len(data_block) == 0:
			byte_counter += len(data_block)

			# Open file just in time
			(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
			assert stream is not None
			filename = self.undo_temp_name(tmpfilename)
			self.report_destination(filename)
			except (OSError, IOError), err:
				self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
			stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
			block_size = self.best_block_size(after - before, len(data_block))

			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			if data_len is None:
				self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
				percent_str = self.calc_percent(byte_counter, data_len)
				eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
				self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			self.slow_down(start, byte_counter - resume_len)

			self.trouble(u'\nERROR: Did not get any data blocks')
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1072 class InfoExtractor(object):
1073 """Information Extractor class.
1075 Information extractors are the classes that, given a URL, extract
1076 information from the video (or videos) the URL refers to. This
1077 information includes the real video URL, the video title and simplified
1078 title, author and others. The information is stored in a dictionary
1079 which is then passed to the FileDownloader. The FileDownloader
1080 processes this information possibly downloading the video to the file
1081 system, among other possible outcomes. The dictionaries must include
1082 the following fields:
1084 id: Video identifier.
1085 url: Final video URL.
1086 uploader: Nickname of the video uploader.
1087 title: Literal title.
1088 stitle: Simplified title.
1089 ext: Video filename extension.
1090 format: Video format.
1091 player_url: SWF Player URL (may be None).
1093 The following fields are optional. Their primary purpose is to allow
1094 youtube-dl to serve as the backend for a video search function, such
1095 as the one in youtube2mp3. They are only used when their respective
1096 forced printing functions are called:
1098 thumbnail: Full URL to a video thumbnail image.
1099 description: One-line video description.
1101 Subclasses of this one should re-define the _real_initialize() and
1102 _real_extract() methods and define a _VALID_URL regexp.
1103 Probably, they should also be added to the list of extractors.
1109 def __init__(self, downloader=None):
1110 """Constructor. Receives an optional downloader."""
1112 self.set_downloader(downloader)
1114 def suitable(self, url):
1115 """Receives a URL and returns True if suitable for this IE."""
1116 return re.match(self._VALID_URL, url) is not None
1118 def initialize(self):
1119 """Initializes an instance (authentication, etc)."""
1121 self._real_initialize()
1124 def extract(self, url):
1125 """Extracts URL information and returns it in list of dicts."""
1127 return self._real_extract(url)
1129 def set_downloader(self, downloader):
1130 """Sets the downloader for this IE."""
1131 self._downloader = downloader
1133 def _real_initialize(self):
1134 """Real initialization process. Redefine in subclasses."""
# Base implementation intentionally does nothing; subclasses that need
# setup override it (e.g. YoutubeIE logs in, MetacafeIE passes the
# family filter).
1137 def _real_extract(self, url):
1138 """Real extraction process. Redefine in subclasses."""
# Base implementation intentionally does nothing; each concrete IE below
# overrides this with its site-specific extraction.
1142 class YoutubeIE(InfoExtractor):
1143 """Information extractor for youtube.com."""
# NOTE(review): this listing elides some original lines (try:/return/else
# bodies); code tokens below are preserved verbatim from the listing.
# Matches watch/embed/short URLs; the video id is captured as group 2
# (consumed in _real_extract via mobj.group(2)).
1145 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1146 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1147 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1148 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1149 _NETRC_MACHINE = 'youtube'
1150 # Listed in order of quality
1151 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1152 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# Maps itag -> container extension; used for the 'ext' field and -F output.
1153 _video_extensions = {
1159 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1164 _video_dimensions = {
1179 IE_NAME = u'youtube'
1181 def report_lang(self):
1182 """Report attempt to set language."""
1183 self._downloader.to_screen(u'[youtube] Setting language')
1185 def report_login(self):
1186 """Report attempt to log in."""
1187 self._downloader.to_screen(u'[youtube] Logging in')
1189 def report_age_confirmation(self):
1190 """Report attempt to confirm age."""
1191 self._downloader.to_screen(u'[youtube] Confirming age')
1193 def report_video_webpage_download(self, video_id):
1194 """Report attempt to download video webpage."""
1195 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1197 def report_video_info_webpage_download(self, video_id):
1198 """Report attempt to download video info webpage."""
1199 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1201 def report_information_extraction(self, video_id):
1202 """Report attempt to extract video information."""
1203 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1205 def report_unavailable_format(self, video_id, format):
1206 """Report extracted video URL."""
1207 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1209 def report_rtmp_download(self):
1210 """Indicate the download will use the RTMP protocol."""
1211 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Prints the itag / extension / resolution table for --list-formats
# (Python 2 print statement).
1213 def _print_formats(self, formats):
1214 print 'Available formats:'
1216 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
# Optional login (from --username/--password or .netrc) and forcing the
# interface language to English so the page regexps below match.
1218 def _real_initialize(self):
1219 if self._downloader is None:
1224 downloader_params = self._downloader.params
1226 # Attempt to use provided username and password or .netrc data
1227 if downloader_params.get('username', None) is not None:
1228 username = downloader_params['username']
1229 password = downloader_params['password']
1230 elif downloader_params.get('usenetrc', False):
1232 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1233 if info is not None:
1237 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1238 except (IOError, netrc.NetrcParseError), err:
1239 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language: best-effort, failure only warns.
1243 request = urllib2.Request(self._LANG_URL)
1246 urllib2.urlopen(request).read()
1247 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1248 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1251 # No authentication to be performed
1252 if username is None:
1257 'current_form': 'loginForm',
1259 'action_login': 'Log In',
1260 'username': username,
1261 'password': password,
1263 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1266 login_results = urllib2.urlopen(request).read()
# A loginForm still present in the response means the POST did not log us in.
1267 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1268 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1270 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1271 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age for age-gated videos.
1277 'action_confirm': 'Confirm',
1279 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1281 self.report_age_confirmation()
1282 age_results = urllib2.urlopen(request).read()
1283 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1284 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Download the watch page and get_video_info, pick the format(s) to fetch,
# and hand each one to the FileDownloader via process_info().
1287 def _real_extract(self, url):
1288 # Extract video id from URL
1289 mobj = re.match(self._VALID_URL, url)
1291 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1293 video_id = mobj.group(2)
1296 self.report_video_webpage_download(video_id)
1297 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1299 video_webpage = urllib2.urlopen(request).read()
1300 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1301 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1304 # Attempt to extract SWF player URL
1305 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1306 if mobj is not None:
# Un-escape the JS-escaped URL (\\/ -> /).
1307 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Query get_video_info with several 'el' values until one of the responses
# contains a 'token' parameter.
1312 self.report_video_info_webpage_download(video_id)
1313 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1314 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1315 % (video_id, el_type))
1316 request = urllib2.Request(video_info_url)
1318 video_info_webpage = urllib2.urlopen(request).read()
1319 video_info = parse_qs(video_info_webpage)
1320 if 'token' in video_info:
1322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1323 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1325 if 'token' not in video_info:
1326 if 'reason' in video_info:
1327 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1329 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1332 # Start extracting information
1333 self.report_information_extraction(video_id)
1336 if 'author' not in video_info:
1337 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1339 video_uploader = urllib.unquote_plus(video_info['author'][0])
1342 if 'title' not in video_info:
1343 self._downloader.trouble(u'ERROR: unable to extract video title')
1345 video_title = urllib.unquote_plus(video_info['title'][0])
1346 video_title = video_title.decode('utf-8')
1347 video_title = sanitize_title(video_title)
1350 simple_title = _simplify_title(video_title)
1353 if 'thumbnail_url' not in video_info:
1354 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1355 video_thumbnail = ''
1356 else: # don't panic if we can't find it
1357 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the watch page and normalized to YYYYMMDD,
# trying several textual date formats.
1361 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1362 if mobj is not None:
1363 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1364 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1365 for expression in format_expressions:
1367 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# Description is only extracted when the user asked for it (forced print
# or --write-description): it needs an lxml HTML parse of the page.
1375 video_description = u'No description available.'
1376 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1377 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1378 if mobj is not None:
1379 video_description = mobj.group(1).decode('utf-8')
1381 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1382 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1383 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1384 # TODO use another parser
1387 video_token = urllib.unquote_plus(video_info['token'][0])
1389 # Decide which formats to download
1390 req_format = self._downloader.params.get('format', None)
# RTMP streams ('conn') take precedence; otherwise build an itag -> url map
# from url_encoded_fmt_stream_map and filter it against the known itag list.
1392 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1393 self.report_rtmp_download()
1394 video_url_list = [(None, video_info['conn'][0])]
1395 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1396 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1397 url_data = [parse_qs(uds) for uds in url_data_strs]
1398 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1399 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1401 format_limit = self._downloader.params.get('format_limit', None)
1402 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1403 if format_limit is not None and format_limit in available_formats:
1404 format_list = available_formats[available_formats.index(format_limit):]
1406 format_list = available_formats
1407 existing_formats = [x for x in format_list if x in url_map]
1408 if len(existing_formats) == 0:
1409 self._downloader.trouble(u'ERROR: no known formats available for video')
1411 if self._downloader.params.get('listformats', None):
1412 self._print_formats(existing_formats)
1414 if req_format is None or req_format == 'best':
1415 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1416 elif req_format == 'worst':
1417 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1418 elif req_format in ('-1', 'all'):
1419 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1421 # Specific formats. We pick the first in a slash-delimited sequence.
1422 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1423 req_formats = req_format.split('/')
1424 video_url_list = None
1425 for rf in req_formats:
1427 video_url_list = [(rf, url_map[rf])]
1429 if video_url_list is None:
1430 self._downloader.trouble(u'ERROR: requested format not available')
1433 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1436 for format_param, video_real_url in video_url_list:
1437 # At this point we have a new video
1438 self._downloader.increment_downloads()
# Extension determined by itag; 'flv' is the historical default.
1441 video_extension = self._video_extensions.get(format_param, 'flv')
1444 # Process video information
1445 self._downloader.process_info({
1446 'id': video_id.decode('utf-8'),
1447 'url': video_real_url.decode('utf-8'),
1448 'uploader': video_uploader.decode('utf-8'),
1449 'upload_date': upload_date,
1450 'title': video_title,
1451 'stitle': simple_title,
1452 'ext': video_extension.decode('utf-8'),
# Py2 and/or idiom: u'NA' when format_param is None (RTMP case).
1453 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1454 'thumbnail': video_thumbnail.decode('utf-8'),
1455 'description': video_description,
1456 'player_url': player_url,
1458 except UnavailableVideoError, err:
1459 self._downloader.trouble(u'\nERROR: unable to download video')
1462 class MetacafeIE(InfoExtractor):
1463 """Information Extractor for metacafe.com."""
# NOTE(review): this listing elides some original lines (try:/return/else
# bodies); code tokens below are preserved verbatim from the listing.
1465 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1466 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1467 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1469 IE_NAME = u'metacafe'
# Keeps a YoutubeIE around: ids of the form 'yt-<id>' are delegated to it
# (see _real_extract below).
1471 def __init__(self, youtube_ie, downloader=None):
1472 InfoExtractor.__init__(self, downloader)
1473 self._youtube_ie = youtube_ie
1475 def report_disclaimer(self):
1476 """Report disclaimer retrieval."""
1477 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1479 def report_age_confirmation(self):
1480 """Report attempt to confirm age."""
1481 self._downloader.to_screen(u'[metacafe] Confirming age')
1483 def report_download_webpage(self, video_id):
1484 """Report webpage download."""
1485 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1487 def report_extraction(self, video_id):
1488 """Report information extraction."""
1489 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Pass the site's family filter: fetch the disclaimer page, then POST the
# over-18 confirmation form.
1491 def _real_initialize(self):
1492 # Retrieve disclaimer
1493 request = urllib2.Request(self._DISCLAIMER)
1495 self.report_disclaimer()
1496 disclaimer = urllib2.urlopen(request).read()
1497 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1498 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1504 'submit': "Continue - I'm over 18",
1506 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1508 self.report_age_confirmation()
1509 disclaimer = urllib2.urlopen(request).read()
1510 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1511 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1514 def _real_extract(self, url):
1515 # Extract id and simplified title from URL
1516 mobj = re.match(self._VALID_URL, url)
1518 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1521 video_id = mobj.group(1)
1523 # Check if video comes from YouTube
1524 mobj2 = re.match(r'^yt-(.*)$', video_id)
1525 if mobj2 is not None:
1526 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1529 # At this point we have a new video
1530 self._downloader.increment_downloads()
1532 simple_title = mobj.group(2).decode('utf-8')
1534 # Retrieve video webpage to extract further information
1535 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1537 self.report_download_webpage(video_id)
1538 webpage = urllib2.urlopen(request).read()
1539 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1540 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1543 # Extract URL, uploader and title from webpage
1544 self.report_extraction(video_id)
# Two page layouts: a plain &mediaURL= parameter (optionally signed with
# gdaKey), or a flashvars blob whose mediaData JSON carries URL and key.
1545 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1546 if mobj is not None:
1547 mediaURL = urllib.unquote(mobj.group(1))
1548 video_extension = mediaURL[-3:]
1550 # Extract gdaKey if available
1551 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1553 video_url = mediaURL
1555 gdaKey = mobj.group(1)
1556 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1558 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1560 self._downloader.trouble(u'ERROR: unable to extract media URL')
1562 vardict = parse_qs(mobj.group(1))
1563 if 'mediaData' not in vardict:
1564 self._downloader.trouble(u'ERROR: unable to extract media URL')
1566 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1568 self._downloader.trouble(u'ERROR: unable to extract media URL')
1570 mediaURL = mobj.group(1).replace('\\/', '/')
1571 video_extension = mediaURL[-3:]
1572 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1574 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1576 self._downloader.trouble(u'ERROR: unable to extract title')
1578 video_title = mobj.group(1).decode('utf-8')
1579 video_title = sanitize_title(video_title)
1581 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1583 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1585 video_uploader = mobj.group(1)
1588 # Process video information
1589 self._downloader.process_info({
1590 'id': video_id.decode('utf-8'),
1591 'url': video_url.decode('utf-8'),
1592 'uploader': video_uploader.decode('utf-8'),
1593 'upload_date': u'NA',
1594 'title': video_title,
1595 'stitle': simple_title,
1596 'ext': video_extension.decode('utf-8'),
1600 except UnavailableVideoError:
1601 self._downloader.trouble(u'\nERROR: unable to download video')
1604 class DailymotionIE(InfoExtractor):
1605 """Information Extractor for Dailymotion"""
# NOTE(review): this listing elides some original lines (try:/return
# bodies); code tokens below are preserved verbatim from the listing.
1607 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1608 IE_NAME = u'dailymotion'
1610 def __init__(self, downloader=None):
1611 InfoExtractor.__init__(self, downloader)
1613 def report_download_webpage(self, video_id):
1614 """Report webpage download."""
1615 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1617 def report_extraction(self, video_id):
1618 """Report information extraction."""
1619 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1621 def _real_extract(self, url):
1622 # Extract id and simplified title from URL
1623 mobj = re.match(self._VALID_URL, url)
1625 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1628 # At this point we have a new video
1629 self._downloader.increment_downloads()
1630 video_id = mobj.group(1)
1632 video_extension = 'flv'
1634 # Retrieve video webpage to extract further information
1635 request = urllib2.Request(url)
# Send the family_filter=off cookie so filtered videos are served too.
1636 request.add_header('Cookie', 'family_filter=off')
1638 self.report_download_webpage(video_id)
1639 webpage = urllib2.urlopen(request).read()
1640 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1641 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1644 # Extract URL, uploader and title from webpage
1645 self.report_extraction(video_id)
# The player config is passed via addVariable("sequence", ...); the
# URL-decoded blob contains the sdURL stream address.
1646 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1648 self._downloader.trouble(u'ERROR: unable to extract media URL')
1650 sequence = urllib.unquote(mobj.group(1))
1651 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1653 self._downloader.trouble(u'ERROR: unable to extract media URL')
1655 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1657 # if needed add http://www.dailymotion.com/ if relative URL
1659 video_url = mediaURL
1661 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1663 self._downloader.trouble(u'ERROR: unable to extract title')
1665 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1666 video_title = sanitize_title(video_title)
1667 simple_title = _simplify_title(video_title)
1669 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1671 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1673 video_uploader = mobj.group(1)
1676 # Process video information
1677 self._downloader.process_info({
1678 'id': video_id.decode('utf-8'),
1679 'url': video_url.decode('utf-8'),
1680 'uploader': video_uploader.decode('utf-8'),
1681 'upload_date': u'NA',
1682 'title': video_title,
1683 'stitle': simple_title,
1684 'ext': video_extension.decode('utf-8'),
1688 except UnavailableVideoError:
1689 self._downloader.trouble(u'\nERROR: unable to download video')
1692 class GoogleIE(InfoExtractor):
1693 """Information extractor for video.google.com."""
# NOTE(review): this listing elides some original lines (try:/return
# bodies); code tokens below are preserved verbatim from the listing.
1695 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1696 IE_NAME = u'video.google'
1698 def __init__(self, downloader=None):
1699 InfoExtractor.__init__(self, downloader)
1701 def report_download_webpage(self, video_id):
1702 """Report webpage download."""
1703 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1705 def report_extraction(self, video_id):
1706 """Report information extraction."""
1707 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1709 def _real_extract(self, url):
1710 # Extract id from URL
1711 mobj = re.match(self._VALID_URL, url)
1713 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1716 # At this point we have a new video
1717 self._downloader.increment_downloads()
1718 video_id = mobj.group(1)
1720 video_extension = 'mp4'
1722 # Retrieve video webpage to extract further information
1723 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1725 self.report_download_webpage(video_id)
1726 webpage = urllib2.urlopen(request).read()
1727 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1728 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1731 # Extract URL, uploader, and title from webpage
1732 self.report_extraction(video_id)
# Prefer the direct download_url (mp4); otherwise fall back to the
# obfuscated videoUrl, un-escaping the \x3d / \x26 sequences, as flv.
1733 mobj = re.search(r"download_url:'([^']+)'", webpage)
1735 video_extension = 'flv'
1736 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1738 self._downloader.trouble(u'ERROR: unable to extract media URL')
1740 mediaURL = urllib.unquote(mobj.group(1))
1741 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1742 mediaURL = mediaURL.replace('\\x26', '\x26')
1744 video_url = mediaURL
1746 mobj = re.search(r'<title>(.*)</title>', webpage)
1748 self._downloader.trouble(u'ERROR: unable to extract title')
1750 video_title = mobj.group(1).decode('utf-8')
1751 video_title = sanitize_title(video_title)
1752 simple_title = _simplify_title(video_title)
1754 # Extract video description
1755 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1757 self._downloader.trouble(u'ERROR: unable to extract video description')
1759 video_description = mobj.group(1).decode('utf-8')
1760 if not video_description:
1761 video_description = 'No description available.'
# Thumbnail needs a separate search-page fetch, so it is only done when
# explicitly requested via --get-thumbnail.
1763 # Extract video thumbnail
1764 if self._downloader.params.get('forcethumbnail', False):
1765 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1767 webpage = urllib2.urlopen(request).read()
1768 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1769 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1771 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1773 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1775 video_thumbnail = mobj.group(1)
1776 else: # we need something to pass to process_info
1777 video_thumbnail = ''
1780 # Process video information
1781 self._downloader.process_info({
1782 'id': video_id.decode('utf-8'),
1783 'url': video_url.decode('utf-8'),
1785 'upload_date': u'NA',
1786 'title': video_title,
1787 'stitle': simple_title,
1788 'ext': video_extension.decode('utf-8'),
1792 except UnavailableVideoError:
1793 self._downloader.trouble(u'\nERROR: unable to download video')
1796 class PhotobucketIE(InfoExtractor):
1797 """Information extractor for photobucket.com."""
1799 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1800 IE_NAME = u'photobucket'
1802 def __init__(self, downloader=None):
1803 InfoExtractor.__init__(self, downloader)
1805 def report_download_webpage(self, video_id):
1806 """Report webpage download."""
1807 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1809 def report_extraction(self, video_id):
1810 """Report information extraction."""
1811 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1813 def _real_extract(self, url):
1814 # Extract id from URL
1815 mobj = re.match(self._VALID_URL, url)
1817 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1820 # At this point we have a new video
1821 self._downloader.increment_downloads()
1822 video_id = mobj.group(1)
1824 video_extension = 'flv'
1826 # Retrieve video webpage to extract further information
1827 request = urllib2.Request(url)
1829 self.report_download_webpage(video_id)
1830 webpage = urllib2.urlopen(request).read()
1831 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1832 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1835 # Extract URL, uploader, and title from webpage
1836 self.report_extraction(video_id)
1837 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1839 self._downloader.trouble(u'ERROR: unable to extract media URL')
1841 mediaURL = urllib.unquote(mobj.group(1))
1843 video_url = mediaURL
1845 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1847 self._downloader.trouble(u'ERROR: unable to extract title')
1849 video_title = mobj.group(1).decode('utf-8')
1850 video_title = sanitize_title(video_title)
1851 simple_title = _simplify_title(vide_title)
1853 video_uploader = mobj.group(2).decode('utf-8')
1856 # Process video information
1857 self._downloader.process_info({
1858 'id': video_id.decode('utf-8'),
1859 'url': video_url.decode('utf-8'),
1860 'uploader': video_uploader,
1861 'upload_date': u'NA',
1862 'title': video_title,
1863 'stitle': simple_title,
1864 'ext': video_extension.decode('utf-8'),
1868 except UnavailableVideoError:
1869 self._downloader.trouble(u'\nERROR: unable to download video')
1872 class YahooIE(InfoExtractor):
1873 """Information extractor for video.yahoo.com."""
1875 # _VALID_URL matches all Yahoo! Video URLs
1876 # _VPAGE_URL matches only the extractable '/watch/' URLs
1877 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1878 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1879 IE_NAME = u'video.yahoo'
1881 def __init__(self, downloader=None):
1882 InfoExtractor.__init__(self, downloader)
1884 def report_download_webpage(self, video_id):
1885 """Report webpage download."""
1886 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1888 def report_extraction(self, video_id):
1889 """Report information extraction."""
1890 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1892 def _real_extract(self, url, new_video=True):
1893 # Extract ID from URL
1894 mobj = re.match(self._VALID_URL, url)
1896 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1899 # At this point we have a new video
1900 self._downloader.increment_downloads()
1901 video_id = mobj.group(2)
1902 video_extension = 'flv'
1904 # Rewrite valid but non-extractable URLs as
1905 # extractable English language /watch/ URLs
1906 if re.match(self._VPAGE_URL, url) is None:
1907 request = urllib2.Request(url)
1909 webpage = urllib2.urlopen(request).read()
1910 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1911 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1914 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1916 self._downloader.trouble(u'ERROR: Unable to extract id field')
1918 yahoo_id = mobj.group(1)
1920 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1922 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1924 yahoo_vid = mobj.group(1)
1926 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1927 return self._real_extract(url, new_video=False)
1929 # Retrieve video webpage to extract further information
1930 request = urllib2.Request(url)
1932 self.report_download_webpage(video_id)
1933 webpage = urllib2.urlopen(request).read()
1934 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1935 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1938 # Extract uploader and title from webpage
1939 self.report_extraction(video_id)
1940 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1942 self._downloader.trouble(u'ERROR: unable to extract video title')
1944 video_title = mobj.group(1).decode('utf-8')
1945 simple_title = _simplify_title(video_title)
1947 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1949 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1951 video_uploader = mobj.group(1).decode('utf-8')
1953 # Extract video thumbnail
1954 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1956 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1958 video_thumbnail = mobj.group(1).decode('utf-8')
1960 # Extract video description
1961 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1963 self._downloader.trouble(u'ERROR: unable to extract video description')
1965 video_description = mobj.group(1).decode('utf-8')
1966 if not video_description:
1967 video_description = 'No description available.'
1969 # Extract video height and width
1970 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1972 self._downloader.trouble(u'ERROR: unable to extract video height')
1974 yv_video_height = mobj.group(1)
1976 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1978 self._downloader.trouble(u'ERROR: unable to extract video width')
1980 yv_video_width = mobj.group(1)
1982 # Retrieve video playlist to extract media URL
1983 # I'm not completely sure what all these options are, but we
1984 # seem to need most of them, otherwise the server sends a 401.
1985 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1986 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1987 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1988 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1989 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1991 self.report_download_webpage(video_id)
1992 webpage = urllib2.urlopen(request).read()
1993 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1994 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1997 # Extract media URL from playlist XML
1998 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2000 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2002 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2003 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2006 # Process video information
2007 self._downloader.process_info({
2008 'id': video_id.decode('utf-8'),
2010 'uploader': video_uploader,
2011 'upload_date': u'NA',
2012 'title': video_title,
2013 'stitle': simple_title,
2014 'ext': video_extension.decode('utf-8'),
2015 'thumbnail': video_thumbnail.decode('utf-8'),
2016 'description': video_description,
2017 'thumbnail': video_thumbnail,
2020 except UnavailableVideoError:
2021 self._downloader.trouble(u'\nERROR: unable to download video')
2024 class VimeoIE(InfoExtractor):
2025 """Information extractor for vimeo.com."""
2027 # _VALID_URL matches Vimeo URLs
2028 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2031 def __init__(self, downloader=None):
2032 InfoExtractor.__init__(self, downloader)
2034 def report_download_webpage(self, video_id):
2035 """Report webpage download."""
2036 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2038 def report_extraction(self, video_id):
2039 """Report information extraction."""
2040 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2042 def _real_extract(self, url, new_video=True):
2043 # Extract ID from URL
2044 mobj = re.match(self._VALID_URL, url)
2046 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2049 # At this point we have a new video
2050 self._downloader.increment_downloads()
2051 video_id = mobj.group(1)
2053 # Retrieve video webpage to extract further information
2054 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2056 self.report_download_webpage(video_id)
2057 webpage = urllib2.urlopen(request).read()
2058 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2059 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2062 # Now we begin extracting as much information as we can from what we
2063 # retrieved. First we extract the information common to all extractors,
2064 # and latter we extract those that are Vimeo specific.
2065 self.report_extraction(video_id)
2068 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2070 self._downloader.trouble(u'ERROR: unable to extract video title')
2072 video_title = mobj.group(1).decode('utf-8')
2073 simple_title = _simplify_title(video_title)
2076 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2078 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2080 video_uploader = mobj.group(1).decode('utf-8')
2082 # Extract video thumbnail
2083 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2085 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2087 video_thumbnail = mobj.group(1).decode('utf-8')
2089 # # Extract video description
2090 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2092 # self._downloader.trouble(u'ERROR: unable to extract video description')
2094 # video_description = mobj.group(1).decode('utf-8')
2095 # if not video_description: video_description = 'No description available.'
2096 video_description = 'Foo.'
2098 # Vimeo specific: extract request signature
2099 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2101 self._downloader.trouble(u'ERROR: unable to extract request signature')
2103 sig = mobj.group(1).decode('utf-8')
2105 # Vimeo specific: extract video quality information
2106 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2108 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2110 quality = mobj.group(1).decode('utf-8')
2112 if int(quality) == 1:
2117 # Vimeo specific: Extract request signature expiration
2118 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2120 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2122 sig_exp = mobj.group(1).decode('utf-8')
2124 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2127 # Process video information
2128 self._downloader.process_info({
2129 'id': video_id.decode('utf-8'),
2131 'uploader': video_uploader,
2132 'upload_date': u'NA',
2133 'title': video_title,
2134 'stitle': simple_title,
2136 'thumbnail': video_thumbnail.decode('utf-8'),
2137 'description': video_description,
2138 'thumbnail': video_thumbnail,
2139 'description': video_description,
2142 except UnavailableVideoError:
2143 self._downloader.trouble(u'ERROR: unable to download video')
2146 class GenericIE(InfoExtractor):
2147 """Generic last-resort information extractor."""
2150 IE_NAME = u'generic'
2152 def __init__(self, downloader=None):
2153 InfoExtractor.__init__(self, downloader)
2155 def report_download_webpage(self, video_id):
2156 """Report webpage download."""
2157 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2158 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2160 def report_extraction(self, video_id):
2161 """Report information extraction."""
2162 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2164 def _real_extract(self, url):
2165 # At this point we have a new video
2166 self._downloader.increment_downloads()
2168 video_id = url.split('/')[-1]
2169 request = urllib2.Request(url)
2171 self.report_download_webpage(video_id)
2172 webpage = urllib2.urlopen(request).read()
2173 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2174 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2176 except ValueError, err:
2177 # since this is the last-resort InfoExtractor, if
2178 # this error is thrown, it'll be thrown here
2179 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2182 self.report_extraction(video_id)
2183 # Start with something easy: JW Player in SWFObject
2184 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2186 # Broaden the search a little bit
2187 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2189 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2192 # It's possible that one of the regexes
2193 # matched, but returned an empty group:
2194 if mobj.group(1) is None:
2195 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2198 video_url = urllib.unquote(mobj.group(1))
2199 video_id = os.path.basename(video_url)
2201 # here's a fun little line of code for you:
2202 video_extension = os.path.splitext(video_id)[1][1:]
2203 video_id = os.path.splitext(video_id)[0]
2205 # it's tempting to parse this further, but you would
2206 # have to take into account all the variations like
2207 # Video Title - Site Name
2208 # Site Name | Video Title
2209 # Video Title - Tagline | Site Name
2210 # and so on and so forth; it's just not practical
2211 mobj = re.search(r'<title>(.*)</title>', webpage)
2213 self._downloader.trouble(u'ERROR: unable to extract title')
2215 video_title = mobj.group(1).decode('utf-8')
2216 video_title = sanitize_title(video_title)
2217 simple_title = _simplify_title(video_title)
2219 # video uploader is domain name
2220 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2222 self._downloader.trouble(u'ERROR: unable to extract title')
2224 video_uploader = mobj.group(1).decode('utf-8')
2227 # Process video information
2228 self._downloader.process_info({
2229 'id': video_id.decode('utf-8'),
2230 'url': video_url.decode('utf-8'),
2231 'uploader': video_uploader,
2232 'upload_date': u'NA',
2233 'title': video_title,
2234 'stitle': simple_title,
2235 'ext': video_extension.decode('utf-8'),
2239 except UnavailableVideoError, err:
2240 self._downloader.trouble(u'\nERROR: unable to download video')
2243 class YoutubeSearchIE(InfoExtractor):
2244 """Information Extractor for YouTube search queries."""
2245 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2246 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2247 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2248 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2250 _max_youtube_results = 1000
2251 IE_NAME = u'youtube:search'
2253 def __init__(self, youtube_ie, downloader=None):
2254 InfoExtractor.__init__(self, downloader)
2255 self._youtube_ie = youtube_ie
2257 def report_download_page(self, query, pagenum):
2258 """Report attempt to download playlist page with given number."""
2259 query = query.decode(preferredencoding())
2260 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2262 def _real_initialize(self):
2263 self._youtube_ie.initialize()
2265 def _real_extract(self, query):
2266 mobj = re.match(self._VALID_URL, query)
2268 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2271 prefix, query = query.split(':')
2273 query = query.encode('utf-8')
2275 self._download_n_results(query, 1)
2277 elif prefix == 'all':
2278 self._download_n_results(query, self._max_youtube_results)
2284 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2286 elif n > self._max_youtube_results:
2287 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2288 n = self._max_youtube_results
2289 self._download_n_results(query, n)
2291 except ValueError: # parsing prefix as integer fails
2292 self._download_n_results(query, 1)
2295 def _download_n_results(self, query, n):
2296 """Downloads a specified number of results for a query"""
2299 already_seen = set()
2303 self.report_download_page(query, pagenum)
2304 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2305 request = urllib2.Request(result_url)
2307 page = urllib2.urlopen(request).read()
2308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2309 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2312 # Extract video identifiers
2313 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2314 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2315 if video_id not in already_seen:
2316 video_ids.append(video_id)
2317 already_seen.add(video_id)
2318 if len(video_ids) == n:
2319 # Specified n videos reached
2320 for id in video_ids:
2321 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2324 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2325 for id in video_ids:
2326 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2329 pagenum = pagenum + 1
2332 class GoogleSearchIE(InfoExtractor):
2333 """Information Extractor for Google Video search queries."""
2334 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2335 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2336 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2337 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2339 _max_google_results = 1000
2340 IE_NAME = u'video.google:search'
2342 def __init__(self, google_ie, downloader=None):
2343 InfoExtractor.__init__(self, downloader)
2344 self._google_ie = google_ie
2346 def report_download_page(self, query, pagenum):
2347 """Report attempt to download playlist page with given number."""
2348 query = query.decode(preferredencoding())
2349 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2351 def _real_initialize(self):
2352 self._google_ie.initialize()
2354 def _real_extract(self, query):
2355 mobj = re.match(self._VALID_URL, query)
2357 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2360 prefix, query = query.split(':')
2362 query = query.encode('utf-8')
2364 self._download_n_results(query, 1)
2366 elif prefix == 'all':
2367 self._download_n_results(query, self._max_google_results)
2373 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2375 elif n > self._max_google_results:
2376 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2377 n = self._max_google_results
2378 self._download_n_results(query, n)
2380 except ValueError: # parsing prefix as integer fails
2381 self._download_n_results(query, 1)
2384 def _download_n_results(self, query, n):
2385 """Downloads a specified number of results for a query"""
2388 already_seen = set()
2392 self.report_download_page(query, pagenum)
2393 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2394 request = urllib2.Request(result_url)
2396 page = urllib2.urlopen(request).read()
2397 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2398 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2401 # Extract video identifiers
2402 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2403 video_id = mobj.group(1)
2404 if video_id not in already_seen:
2405 video_ids.append(video_id)
2406 already_seen.add(video_id)
2407 if len(video_ids) == n:
2408 # Specified n videos reached
2409 for id in video_ids:
2410 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2413 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2414 for id in video_ids:
2415 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2418 pagenum = pagenum + 1
2421 class YahooSearchIE(InfoExtractor):
2422 """Information Extractor for Yahoo! Video search queries."""
2423 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2424 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2425 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2426 _MORE_PAGES_INDICATOR = r'\s*Next'
2428 _max_yahoo_results = 1000
2429 IE_NAME = u'video.yahoo:search'
2431 def __init__(self, yahoo_ie, downloader=None):
2432 InfoExtractor.__init__(self, downloader)
2433 self._yahoo_ie = yahoo_ie
2435 def report_download_page(self, query, pagenum):
2436 """Report attempt to download playlist page with given number."""
2437 query = query.decode(preferredencoding())
2438 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2440 def _real_initialize(self):
2441 self._yahoo_ie.initialize()
2443 def _real_extract(self, query):
2444 mobj = re.match(self._VALID_URL, query)
2446 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2449 prefix, query = query.split(':')
2451 query = query.encode('utf-8')
2453 self._download_n_results(query, 1)
2455 elif prefix == 'all':
2456 self._download_n_results(query, self._max_yahoo_results)
2462 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2464 elif n > self._max_yahoo_results:
2465 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2466 n = self._max_yahoo_results
2467 self._download_n_results(query, n)
2469 except ValueError: # parsing prefix as integer fails
2470 self._download_n_results(query, 1)
2473 def _download_n_results(self, query, n):
2474 """Downloads a specified number of results for a query"""
2477 already_seen = set()
2481 self.report_download_page(query, pagenum)
2482 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2483 request = urllib2.Request(result_url)
2485 page = urllib2.urlopen(request).read()
2486 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2487 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2490 # Extract video identifiers
2491 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2492 video_id = mobj.group(1)
2493 if video_id not in already_seen:
2494 video_ids.append(video_id)
2495 already_seen.add(video_id)
2496 if len(video_ids) == n:
2497 # Specified n videos reached
2498 for id in video_ids:
2499 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2502 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2503 for id in video_ids:
2504 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2507 pagenum = pagenum + 1
2510 class YoutubePlaylistIE(InfoExtractor):
2511 """Information Extractor for YouTube playlists."""
2513 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2514 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2515 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2516 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2518 IE_NAME = u'youtube:playlist'
2520 def __init__(self, youtube_ie, downloader=None):
2521 InfoExtractor.__init__(self, downloader)
2522 self._youtube_ie = youtube_ie
2524 def report_download_page(self, playlist_id, pagenum):
2525 """Report attempt to download playlist page with given number."""
2526 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2528 def _real_initialize(self):
2529 self._youtube_ie.initialize()
2531 def _real_extract(self, url):
2532 # Extract playlist id
2533 mobj = re.match(self._VALID_URL, url)
2535 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2539 if mobj.group(3) is not None:
2540 self._youtube_ie.extract(mobj.group(3))
2543 # Download playlist pages
2544 # prefix is 'p' as default for playlists but there are other types that need extra care
2545 playlist_prefix = mobj.group(1)
2546 if playlist_prefix == 'a':
2547 playlist_access = 'artist'
2549 playlist_prefix = 'p'
2550 playlist_access = 'view_play_list'
2551 playlist_id = mobj.group(2)
2556 self.report_download_page(playlist_id, pagenum)
2557 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2558 request = urllib2.Request(url)
2560 page = urllib2.urlopen(request).read()
2561 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2562 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2565 # Extract video identifiers
2567 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2568 if mobj.group(1) not in ids_in_page:
2569 ids_in_page.append(mobj.group(1))
2570 video_ids.extend(ids_in_page)
2572 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2574 pagenum = pagenum + 1
2576 playliststart = self._downloader.params.get('playliststart', 1) - 1
2577 playlistend = self._downloader.params.get('playlistend', -1)
2578 video_ids = video_ids[playliststart:playlistend]
2580 for id in video_ids:
2581 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2585 class YoutubeUserIE(InfoExtractor):
2586 """Information Extractor for YouTube users."""
2588 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2589 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2590 _GDATA_PAGE_SIZE = 50
2591 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2592 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2594 IE_NAME = u'youtube:user'
2596 def __init__(self, youtube_ie, downloader=None):
2597 InfoExtractor.__init__(self, downloader)
2598 self._youtube_ie = youtube_ie
2600 def report_download_page(self, username, start_index):
2601 """Report attempt to download user page."""
2602 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2603 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2605 def _real_initialize(self):
2606 self._youtube_ie.initialize()
2608 def _real_extract(self, url):
2610 mobj = re.match(self._VALID_URL, url)
2612 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2615 username = mobj.group(1)
2617 # Download video ids using YouTube Data API. Result size per
2618 # query is limited (currently to 50 videos) so we need to query
2619 # page by page until there are no video ids - it means we got
2626 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2627 self.report_download_page(username, start_index)
2629 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2632 page = urllib2.urlopen(request).read()
2633 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2634 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2637 # Extract video identifiers
2640 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2641 if mobj.group(1) not in ids_in_page:
2642 ids_in_page.append(mobj.group(1))
2644 video_ids.extend(ids_in_page)
2646 # A little optimization - if current page is not
2647 # "full", ie. does not contain PAGE_SIZE video ids then
2648 # we can assume that this page is the last one - there
2649 # are no more ids on further pages - no need to query
2652 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2657 all_ids_count = len(video_ids)
2658 playliststart = self._downloader.params.get('playliststart', 1) - 1
2659 playlistend = self._downloader.params.get('playlistend', -1)
2661 if playlistend == -1:
2662 video_ids = video_ids[playliststart:]
2664 video_ids = video_ids[playliststart:playlistend]
2666 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2667 (username, all_ids_count, len(video_ids)))
2669 for video_id in video_ids:
2670 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2673 class DepositFilesIE(InfoExtractor):
2674 """Information extractor for depositfiles.com"""
2676 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2677 IE_NAME = u'DepositFiles'
2679 def __init__(self, downloader=None):
2680 InfoExtractor.__init__(self, downloader)
2682 def report_download_webpage(self, file_id):
2683 """Report webpage download."""
2684 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2686 def report_extraction(self, file_id):
2687 """Report information extraction."""
2688 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2690 def _real_extract(self, url):
2691 # At this point we have a new file
2692 self._downloader.increment_downloads()
2694 file_id = url.split('/')[-1]
2695 # Rebuild url in english locale
2696 url = 'http://depositfiles.com/en/files/' + file_id
2698 # Retrieve file webpage with 'Free download' button pressed
2699 free_download_indication = { 'gateway_result' : '1' }
2700 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2702 self.report_download_webpage(file_id)
2703 webpage = urllib2.urlopen(request).read()
2704 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2705 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2708 # Search for the real file URL
2709 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2710 if (mobj is None) or (mobj.group(1) is None):
2711 # Try to figure out reason of the error.
2712 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2713 if (mobj is not None) and (mobj.group(1) is not None):
2714 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2715 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2717 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2720 file_url = mobj.group(1)
2721 file_extension = os.path.splitext(file_url)[1][1:]
2723 # Search for file title
2724 mobj = re.search(r'<b title="(.*?)">', webpage)
2726 self._downloader.trouble(u'ERROR: unable to extract title')
2728 file_title = mobj.group(1).decode('utf-8')
2731 # Process file information
2732 self._downloader.process_info({
2733 'id': file_id.decode('utf-8'),
2734 'url': file_url.decode('utf-8'),
2736 'upload_date': u'NA',
2737 'title': file_title,
2738 'stitle': file_title,
2739 'ext': file_extension.decode('utf-8'),
2743 except UnavailableVideoError, err:
2744 self._downloader.trouble(u'ERROR: unable to download file')
2747 class FacebookIE(InfoExtractor):
2748 """Information Extractor for Facebook"""
2750 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2751 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2752 _NETRC_MACHINE = 'facebook'
2753 _available_formats = ['video', 'highqual', 'lowqual']
2754 _video_extensions = {
2759 IE_NAME = u'facebook'
2761 def __init__(self, downloader=None):
2762 InfoExtractor.__init__(self, downloader)
2764 def _reporter(self, message):
2765 """Add header and report message."""
2766 self._downloader.to_screen(u'[facebook] %s' % message)
2768 def report_login(self):
2769 """Report attempt to log in."""
2770 self._reporter(u'Logging in')
2772 def report_video_webpage_download(self, video_id):
2773 """Report attempt to download video webpage."""
2774 self._reporter(u'%s: Downloading video webpage' % video_id)
2776 def report_information_extraction(self, video_id):
2777 """Report attempt to extract video information."""
2778 self._reporter(u'%s: Extracting video information' % video_id)
2780 def _parse_page(self, video_webpage):
2781 """Extract video information from page"""
2783 data = {'title': r'\("video_title", "(.*?)"\)',
2784 'description': r'<div class="datawrap">(.*?)</div>',
2785 'owner': r'\("video_owner_name", "(.*?)"\)',
2786 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2789 for piece in data.keys():
2790 mobj = re.search(data[piece], video_webpage)
2791 if mobj is not None:
2792 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2796 for fmt in self._available_formats:
2797 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2798 if mobj is not None:
2799 # URL is in a Javascript segment inside an escaped Unicode format within
2800 # the generally utf-8 page
2801 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2802 video_info['video_urls'] = video_urls
2806 def _real_initialize(self):
2807 if self._downloader is None:
2812 downloader_params = self._downloader.params
2814 # Attempt to use provided username and password or .netrc data
2815 if downloader_params.get('username', None) is not None:
2816 useremail = downloader_params['username']
2817 password = downloader_params['password']
2818 elif downloader_params.get('usenetrc', False):
2820 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2821 if info is not None:
2825 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2826 except (IOError, netrc.NetrcParseError), err:
2827 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2830 if useremail is None:
2839 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2842 login_results = urllib2.urlopen(request).read()
2843 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2844 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2846 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2847 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2850 def _real_extract(self, url):
2851 mobj = re.match(self._VALID_URL, url)
2853 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2855 video_id = mobj.group('ID')
2858 self.report_video_webpage_download(video_id)
2859 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2861 page = urllib2.urlopen(request)
2862 video_webpage = page.read()
2863 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2864 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2867 # Start extracting information
2868 self.report_information_extraction(video_id)
2870 # Extract information
2871 video_info = self._parse_page(video_webpage)
2874 if 'owner' not in video_info:
2875 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2877 video_uploader = video_info['owner']
2880 if 'title' not in video_info:
2881 self._downloader.trouble(u'ERROR: unable to extract video title')
2883 video_title = video_info['title']
2884 video_title = video_title.decode('utf-8')
2885 video_title = sanitize_title(video_title)
2887 simple_title = _simplify_title(video_title)
2890 if 'thumbnail' not in video_info:
2891 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2892 video_thumbnail = ''
2894 video_thumbnail = video_info['thumbnail']
2898 if 'upload_date' in video_info:
2899 upload_time = video_info['upload_date']
2900 timetuple = email.utils.parsedate_tz(upload_time)
2901 if timetuple is not None:
2903 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2908 video_description = video_info.get('description', 'No description available.')
2910 url_map = video_info['video_urls']
2911 if len(url_map.keys()) > 0:
2912 # Decide which formats to download
2913 req_format = self._downloader.params.get('format', None)
2914 format_limit = self._downloader.params.get('format_limit', None)
2916 if format_limit is not None and format_limit in self._available_formats:
2917 format_list = self._available_formats[self._available_formats.index(format_limit):]
2919 format_list = self._available_formats
2920 existing_formats = [x for x in format_list if x in url_map]
2921 if len(existing_formats) == 0:
2922 self._downloader.trouble(u'ERROR: no known formats available for video')
2924 if req_format is None:
2925 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2926 elif req_format == 'worst':
2927 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2928 elif req_format == '-1':
2929 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2932 if req_format not in url_map:
2933 self._downloader.trouble(u'ERROR: requested format not available')
2935 video_url_list = [(req_format, url_map[req_format])] # Specific format
2937 for format_param, video_real_url in video_url_list:
2939 # At this point we have a new video
2940 self._downloader.increment_downloads()
2943 video_extension = self._video_extensions.get(format_param, 'mp4')
2946 # Process video information
2947 self._downloader.process_info({
2948 'id': video_id.decode('utf-8'),
2949 'url': video_real_url.decode('utf-8'),
2950 'uploader': video_uploader.decode('utf-8'),
2951 'upload_date': upload_date,
2952 'title': video_title,
2953 'stitle': simple_title,
2954 'ext': video_extension.decode('utf-8'),
2955 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2956 'thumbnail': video_thumbnail.decode('utf-8'),
2957 'description': video_description.decode('utf-8'),
2960 except UnavailableVideoError, err:
2961 self._downloader.trouble(u'\nERROR: unable to download video')
2963 class BlipTVIE(InfoExtractor):
2964 """Information extractor for blip.tv"""
2966 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2967 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2968 IE_NAME = u'blip.tv'
2970 def report_extraction(self, file_id):
2971 """Report information extraction."""
2972 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2974 def report_direct_download(self, title):
2975 """Report information extraction."""
2976 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2978 def _real_extract(self, url):
2979 mobj = re.match(self._VALID_URL, url)
2981 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2988 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2989 request = urllib2.Request(json_url)
2990 self.report_extraction(mobj.group(1))
2993 urlh = urllib2.urlopen(request)
2994 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2995 basename = url.split('/')[-1]
2996 title,ext = os.path.splitext(basename)
2997 title = title.decode('UTF-8')
2998 ext = ext.replace('.', '')
2999 self.report_direct_download(title)
3004 'stitle': _simplify_title(title),
3008 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3009 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3011 if info is None: # Regular URL
3013 json_code = urlh.read()
3014 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3015 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3019 json_data = json.loads(json_code)
3020 if 'Post' in json_data:
3021 data = json_data['Post']
3025 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3026 video_url = data['media']['url']
3027 umobj = re.match(self._URL_EXT, video_url)
3029 raise ValueError('Can not determine filename extension')
3030 ext = umobj.group(1)
3033 'id': data['item_id'],
3035 'uploader': data['display_name'],
3036 'upload_date': upload_date,
3037 'title': data['title'],
3038 'stitle': _simplify_title(data['title']),
3040 'format': data['media']['mimeType'],
3041 'thumbnail': data['thumbnailUrl'],
3042 'description': data['description'],
3043 'player_url': data['embedUrl']
3045 except (ValueError,KeyError), err:
3046 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3049 self._downloader.increment_downloads()
3052 self._downloader.process_info(info)
3053 except UnavailableVideoError, err:
3054 self._downloader.trouble(u'\nERROR: unable to download video')
3057 class MyVideoIE(InfoExtractor):
3058 """Information Extractor for myvideo.de."""
3060 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3061 IE_NAME = u'myvideo'
3063 def __init__(self, downloader=None):
3064 InfoExtractor.__init__(self, downloader)
3066 def report_download_webpage(self, video_id):
3067 """Report webpage download."""
3068 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3070 def report_extraction(self, video_id):
3071 """Report information extraction."""
3072 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3074 def _real_extract(self,url):
3075 mobj = re.match(self._VALID_URL, url)
3077 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3080 video_id = mobj.group(1)
3083 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3085 self.report_download_webpage(video_id)
3086 webpage = urllib2.urlopen(request).read()
3087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3088 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3091 self.report_extraction(video_id)
3092 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3095 self._downloader.trouble(u'ERROR: unable to extract media URL')
3097 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3099 mobj = re.search('<title>([^<]+)</title>', webpage)
3101 self._downloader.trouble(u'ERROR: unable to extract title')
3104 video_title = mobj.group(1)
3105 video_title = sanitize_title(video_title)
3107 simple_title = _simplify_title(video_title)
3110 self._downloader.process_info({
3114 'upload_date': u'NA',
3115 'title': video_title,
3116 'stitle': simple_title,
3121 except UnavailableVideoError:
3122 self._downloader.trouble(u'\nERROR: Unable to download video')
# Extractor for Comedy Central full episodes (The Daily Show / The Colbert
# Report).  Shortcut aliases like ":tds" or ":colbert" are rewritten to the
# show's full-episodes index URL, which the site redirects to the newest
# episode.  Each episode consists of several acts, listed in an MRSS index.
3124 class ComedyCentralIE(InfoExtractor):
3125 """Information extractor for The Daily Show and Colbert Report """
3127 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3128 IE_NAME = u'comedycentral'
# Progress-reporting helpers, one per network round-trip.
3130 def report_extraction(self, episode_id):
3131 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3133 def report_config_download(self, episode_id):
3134 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3136 def report_index_download(self, episode_id):
3137 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3139 def report_player_url(self, episode_id):
3140 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3142 def _real_extract(self, url):
3143 mobj = re.match(self._VALID_URL, url)
3145 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Translate the ":shortname" aliases into the canonical show URL, then
# re-match so the named groups used below get populated.
3148 if mobj.group('shortname'):
3149 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3150 url = u'http://www.thedailyshow.com/full-episodes/'
3152 url = u'http://www.colbertnation.com/full-episodes/'
3153 mobj = re.match(self._VALID_URL, url)
3154 assert mobj is not None
# No explicit episode in the URL means "newest episode": fetch the index
# and let the site's redirect pick the episode.
3156 dlNewest = not mobj.group('episode')
3158 epTitle = mobj.group('showname')
3160 epTitle = mobj.group('episode')
3162 req = urllib2.Request(url)
3163 self.report_extraction(epTitle)
3165 htmlHandle = urllib2.urlopen(req)
3166 html = htmlHandle.read()
3167 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3168 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# After the redirect the final URL must name a concrete episode.
3171 url = htmlHandle.geturl()
3172 mobj = re.match(self._VALID_URL, url)
3174 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3176 if mobj.group('episode') == '':
3177 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3179 epTitle = mobj.group('episode')
# The Flash player URL / mtvnservices media URI is embedded either as a
# <param name="movie"> value or a JS "var url = ..." assignment.
3181 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3182 if len(mMovieParams) == 0:
3183 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3186 playerUrl_raw = mMovieParams[0][0]
3187 self.report_player_url(epTitle)
# Resolve the player URL through its redirects to get the final location.
3189 urlHandle = urllib2.urlopen(playerUrl_raw)
3190 playerUrl = urlHandle.geturl()
3191 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3192 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# The MRSS show index lists one <item> per act of the episode.
3195 uri = mMovieParams[0][1]
3196 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3197 self.report_index_download(epTitle)
3199 indexXml = urllib2.urlopen(indexUrl).read()
3200 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3201 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3204 idoc = xml.etree.ElementTree.fromstring(indexXml)
3205 itemEls = idoc.findall('.//item')
3206 for itemEl in itemEls:
# The <guid> has the form tag:...:<show>.com:<mediaId>.
3207 mediaId = itemEl.findall('./guid')[0].text
3208 shortMediaId = mediaId.split(':')[-1]
3209 showId = mediaId.split(':')[-2].replace('.com', '')
3210 officialTitle = itemEl.findall('./title')[0].text
3211 officialDate = itemEl.findall('./pubDate')[0].text
# Per-act configuration XML enumerates the available renditions.
3213 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3214 urllib.urlencode({'uri': mediaId}))
3215 configReq = urllib2.Request(configUrl)
3216 self.report_config_download(epTitle)
3218 configXml = urllib2.urlopen(configReq).read()
3219 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3220 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3223 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, src-url) pairs for every advertised rendition.
3225 for rendition in cdoc.findall('.//rendition'):
3226 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3230 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3233 # For now, just pick the highest bitrate
3234 format,video_url = turls[-1]
3236 self._downloader.increment_downloads()
3238 effTitle = showId + u'-' + epTitle
3243 'upload_date': officialDate,
3245 'stitle': _simplify_title(effTitle),
3249 'description': officialTitle,
3250 'player_url': playerUrl
3254 self._downloader.process_info(info)
3255 except UnavailableVideoError, err:
3256 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Extractor for escapistmagazine.com videos.  Scrapes OpenGraph <meta> tags
# for description/thumbnail/player URL, then fetches the player's config
# (JavaScript that is coerced into JSON) to find the actual media URL.
3260 class EscapistIE(InfoExtractor):
3261 """Information extractor for The Escapist """
3263 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3264 IE_NAME = u'escapist'
3266 def report_extraction(self, showName):
3267 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3269 def report_config_download(self, showName):
3270 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3272 def _real_extract(self, url):
# Used only to unescape HTML entities scraped from the page.
3273 htmlParser = HTMLParser.HTMLParser()
3275 mobj = re.match(self._VALID_URL, url)
3277 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3279 showName = mobj.group('showname')
3280 videoId = mobj.group('episode')
3282 self.report_extraction(showName)
3284 webPage = urllib2.urlopen(url).read()
3285 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3286 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# NOTE(review): the re.search results below are dereferenced without None
# checks — a page missing any of these <meta> tags raises AttributeError.
3289 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3290 description = htmlParser.unescape(descMatch.group(1))
3291 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3292 imgUrl = htmlParser.unescape(imgMatch.group(1))
3293 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3294 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL embeds the (percent-encoded) config URL as a parameter.
3295 configUrlMatch = re.search('config=(.*)$', playerUrl)
3296 configUrl = urllib2.unquote(configUrlMatch.group(1))
3298 self.report_config_download(showName)
3300 configJSON = urllib2.urlopen(configUrl).read()
3301 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3302 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3305 # Technically, it's JavaScript, not JSON
3306 configJSON = configJSON.replace("'", '"')
3309 config = json.loads(configJSON)
3310 except (ValueError,), err:
3311 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The media URL is taken from the second playlist entry.
3314 playlist = config['playlist']
3315 videoUrl = playlist[1]['url']
3317 self._downloader.increment_downloads()
3321 'uploader': showName,
3322 'upload_date': None,
3324 'stitle': _simplify_title(showName),
3327 'thumbnail': imgUrl,
3328 'description': description,
3329 'player_url': playerUrl,
3333 self._downloader.process_info(info)
3334 except UnavailableVideoError, err:
3335 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# Extractor for collegehumor.com.  The watch page embeds an internal numeric
# id ("video:NNN") which keys a moogaloop metadata XML document describing
# the title, file URL and thumbnail.
3338 class CollegeHumorIE(InfoExtractor):
3339 """Information extractor for collegehumor.com"""
3341 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3342 IE_NAME = u'collegehumor'
3344 def report_webpage(self, video_id):
3345 """Report information extraction."""
3346 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3348 def report_extraction(self, video_id):
3349 """Report information extraction."""
3350 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3352 def _real_extract(self, url):
3353 htmlParser = HTMLParser.HTMLParser()
3355 mobj = re.match(self._VALID_URL, url)
3357 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3359 video_id = mobj.group('videoid')
3361 self.report_webpage(video_id)
3362 request = urllib2.Request(url)
3364 webpage = urllib2.urlopen(request).read()
3365 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3366 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The public URL id differs from the internal one used by the metadata API.
3369 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3371 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3373 internal_video_id = m.group('internalvideoid')
3377 'internal_id': internal_video_id,
3380 self.report_extraction(video_id)
3381 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3383 metaXml = urllib2.urlopen(xmlUrl).read()
3384 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3385 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Populate the info dict from the metadata XML; the extension is whatever
# follows the last '.' of the file URL.
3388 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3390 videoNode = mdoc.findall('./video')[0]
3391 info['description'] = videoNode.findall('./description')[0].text
3392 info['title'] = videoNode.findall('./caption')[0].text
3393 info['stitle'] = _simplify_title(info['title'])
3394 info['url'] = videoNode.findall('./file')[0].text
3395 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3396 info['ext'] = info['url'].rpartition('.')[2]
3397 info['format'] = info['ext']
3399 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3402 self._downloader.increment_downloads()
3405 self._downloader.process_info(info)
3406 except UnavailableVideoError, err:
3407 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for xvideos.com.  Pulls the flv URL, title and thumbnail
# straight out of the watch page with regular expressions.
3410 class XVideosIE(InfoExtractor):
3411 """Information extractor for xvideos.com"""
3413 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3414 IE_NAME = u'xvideos'
3416 def report_webpage(self, video_id):
3417 """Report information extraction."""
3418 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3420 def report_extraction(self, video_id):
3421 """Report information extraction."""
3422 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3424 def _real_extract(self, url):
# NOTE(review): htmlParser is created here but not referenced in the
# visible lines of this method.
3425 htmlParser = HTMLParser.HTMLParser()
3427 mobj = re.match(self._VALID_URL, url)
3429 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3431 video_id = mobj.group(1).decode('utf-8')
3433 self.report_webpage(video_id)
3435 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3437 webpage = urllib2.urlopen(request).read()
3438 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3439 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3442 self.report_extraction(video_id)
# The flv URL is percent-encoded inside a "flv_url=...&" query fragment.
3446 mobj = re.search(r'flv_url=(.+?)&', webpage)
3448 self._downloader.trouble(u'ERROR: unable to extract video url')
3450 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the page <title> minus the trailing " - XVID..." suffix.
3454 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3456 self._downloader.trouble(u'ERROR: unable to extract video title')
3458 video_title = mobj.group(1).decode('utf-8')
3461 # Extract video thumbnail
3462 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3464 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3466 video_thumbnail = mobj.group(1).decode('utf-8')
3470 self._downloader.increment_downloads()
3475 'upload_date': None,
3476 'title': video_title,
3477 'stitle': _simplify_title(video_title),
3480 'thumbnail': video_thumbnail,
3481 'description': None,
3486 self._downloader.process_info(info)
3487 except UnavailableVideoError, err:
3488 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3491 class SoundcloudIE(InfoExtractor):
3492 """Information extractor for soundcloud.com
3493 To access the media, the uid of the song and a stream token
3494 must be extracted from the page source and the script must make
3495 a request to media.soundcloud.com/crossdomain.xml. Then
3496 the media can be grabbed by requesting from an url composed
3497 of the stream token and uid
3500 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3501 IE_NAME = u'soundcloud'
3503 def __init__(self, downloader=None):
3504 InfoExtractor.__init__(self, downloader)
3506 def report_webpage(self, video_id):
3507 """Report information extraction."""
3508 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3510 def report_extraction(self, video_id):
3511 """Report information extraction."""
3512 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3514 def _real_extract(self, url):
3515 htmlParser = HTMLParser.HTMLParser()
3517 mobj = re.match(self._VALID_URL, url)
3519 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3522 # extract uploader (which is in the url)
3523 uploader = mobj.group(1).decode('utf-8')
3524 # extract simple title (uploader + slug of song title)
3525 slug_title = mobj.group(2).decode('utf-8')
3526 simple_title = uploader + '-' + slug_title
3528 self.report_webpage('%s/%s' % (uploader, slug_title))
3530 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3532 webpage = urllib2.urlopen(request).read()
3533 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3534 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3537 self.report_extraction('%s/%s' % (uploader, slug_title))
3539 # extract uid and stream token that soundcloud hands out for access
3540 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3542 video_id = mobj.group(1)
3543 stream_token = mobj.group(2)
3545 # extract unsimplified title
3546 mobj = re.search('"title":"(.*?)",', webpage)
3548 title = mobj.group(1)
3550 # construct media url (with uid/token)
3551 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3552 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; keep a placeholder if the page has none.
3555 description = u'No description available'
3556 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3558 description = mobj.group(1)
# Parse the human-readable date shown on the page into YYYYMMDD; the
# surrounding try/except (elided here) tolerates parse failures.
3562 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3565 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3566 except Exception, e:
3569 # for soundcloud, a request to a cross domain is required for cookies
3570 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3573 self._downloader.process_info({
3574 'id': video_id.decode('utf-8'),
3576 'uploader': uploader.decode('utf-8'),
3577 'upload_date': upload_date,
3578 'title': simple_title.decode('utf-8'),
3579 'stitle': simple_title.decode('utf-8'),
3583 'description': description.decode('utf-8')
3585 except UnavailableVideoError:
3586 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for infoq.com presentations.  The media location is stored
# base64-encoded in a "jsclassref" attribute and played over RTMPE.
3589 class InfoQIE(InfoExtractor):
3590 """Information extractor for infoq.com"""
3592 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3595 def report_webpage(self, video_id):
3596 """Report information extraction."""
3597 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3599 def report_extraction(self, video_id):
3600 """Report information extraction."""
3601 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3603 def _real_extract(self, url):
3604 htmlParser = HTMLParser.HTMLParser()
3606 mobj = re.match(self._VALID_URL, url)
3608 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3611 self.report_webpage(url)
3613 request = urllib2.Request(url)
3615 webpage = urllib2.urlopen(request).read()
3616 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3617 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3620 self.report_extraction(url)
# jsclassref holds the base64-encoded, percent-encoded media path.
3624 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3626 self._downloader.trouble(u'ERROR: unable to extract video url')
3628 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
# Title comes from a JS assignment, not the <title> tag.
3632 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3634 self._downloader.trouble(u'ERROR: unable to extract video title')
3636 video_title = mobj.group(1).decode('utf-8')
3638 # Extract description
3639 video_description = u'No description available.'
3640 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3641 if mobj is not None:
3642 video_description = mobj.group(1).decode('utf-8')
# The id and extension are both derived from the media file name.
3644 video_filename = video_url.split('/')[-1]
3645 video_id, extension = video_filename.split('.')
3647 self._downloader.increment_downloads()
3652 'upload_date': None,
3653 'title': video_title,
3654 'stitle': _simplify_title(video_title),
3656 'format': extension, # Extension is always(?) mp4, but seems to be flv
3658 'description': video_description,
3663 self._downloader.process_info(info)
3664 except UnavailableVideoError, err:
3665 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# Extractor for mixcloud.com.  Fetches the cloudcast's JSON description via
# the public API and selects a working URL from the 'audio_formats' section.
3667 class MixcloudIE(InfoExtractor):
3668 """Information extractor for www.mixcloud.com"""
3669 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3670 IE_NAME = u'mixcloud'
3672 def __init__(self, downloader=None):
3673 InfoExtractor.__init__(self, downloader)
3675 def report_download_json(self, file_id):
3676 """Report JSON download."""
3677 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3679 def report_extraction(self, file_id):
3680 """Report information extraction."""
3681 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3683 def get_urls(self, jsonData, fmt, bitrate='best'):
3684 """Get urls from 'audio_formats' section in json"""
# Some formats map bitrate -> url list, others are a plain url list;
# the TypeError fallback below handles the latter.
3687 bitrate_list = jsonData[fmt]
3688 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3689 bitrate = max(bitrate_list) # select highest
3691 url_list = jsonData[fmt][bitrate]
3692 except TypeError: # we have no bitrate info.
3693 url_list = jsonData[fmt]
3697 def check_urls(self, url_list):
3698 """Returns 1st active url from list"""
# Probe each candidate with a GET; the first that opens wins.
3699 for url in url_list:
3701 urllib2.urlopen(url)
3703 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3708 def _print_formats(self, formats):
3709 print 'Available formats:'
3710 for fmt in formats.keys():
3711 for b in formats[fmt]:
3713 ext = formats[fmt][b][0]
3714 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3715 except TypeError: # we have no bitrate info
3716 ext = formats[fmt][0]
3717 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3720 def _real_extract(self, url):
3721 mobj = re.match(self._VALID_URL, url)
3723 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3725 # extract uploader & filename from url
3726 uploader = mobj.group(1).decode('utf-8')
3727 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3729 # construct API request
3730 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3731 # retrieve .json file with links to files
3732 request = urllib2.Request(file_url)
3734 self.report_download_json(file_url)
3735 jsonData = urllib2.urlopen(request).read()
3736 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3737 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3741 json_data = json.loads(jsonData)
3742 player_url = json_data['player_swf_url']
3743 formats = dict(json_data['audio_formats'])
# Honor -f/--format: list, pick 'best', or use the requested format.
3745 req_format = self._downloader.params.get('format', None)
3748 if self._downloader.params.get('listformats', None):
3749 self._print_formats(formats)
3752 if req_format is None or req_format == 'best':
3753 for format_param in formats.keys():
3754 url_list = self.get_urls(formats, format_param)
3756 file_url = self.check_urls(url_list)
3757 if file_url is not None:
3760 if req_format not in formats.keys():
3761 self._downloader.trouble(u'ERROR: format is not available')
3764 url_list = self.get_urls(formats, req_format)
3765 file_url = self.check_urls(url_list)
3766 format_param = req_format
3769 self._downloader.increment_downloads()
3771 # Process file information
3772 self._downloader.process_info({
3773 'id': file_id.decode('utf-8'),
3774 'url': file_url.decode('utf-8'),
3775 'uploader': uploader.decode('utf-8'),
3776 'upload_date': u'NA',
3777 'title': json_data['name'],
3778 'stitle': _simplify_title(json_data['name']),
3779 'ext': file_url.split('.')[-1].decode('utf-8'),
3780 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3781 'thumbnail': json_data['thumbnail_url'],
3782 'description': json_data['description'],
3783 'player_url': player_url.decode('utf-8'),
3785 except UnavailableVideoError, err:
3786 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for Stanford OpenClassroom.  Depending on the URL it extracts a
# single video, expands a course page into per-video references, or expands
# the site root into per-course references (handled recursively via
# self.extract on each 'reference' entry).
3788 class StanfordOpenClassroomIE(InfoExtractor):
3789 """Information extractor for Stanford's Open ClassRoom"""
3791 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3792 IE_NAME = u'stanfordoc'
3794 def report_download_webpage(self, objid):
3795 """Report information extraction."""
3796 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3798 def report_extraction(self, video_id):
3799 """Report information extraction."""
3800 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3802 def _real_extract(self, url):
3803 mobj = re.match(self._VALID_URL, url)
3805 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Branch 1: course AND video given -> download that one lecture.
3808 if mobj.group('course') and mobj.group('video'): # A specific video
3809 course = mobj.group('course')
3810 video = mobj.group('video')
3812 'id': _simplify_title(course + '_' + video),
3815 self.report_extraction(info['id'])
3816 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3817 xmlUrl = baseUrl + video + '.xml'
3819 metaXml = urllib2.urlopen(xmlUrl).read()
3820 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3821 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3823 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3825 info['title'] = mdoc.findall('./title')[0].text
3826 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3828 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3830 info['stitle'] = _simplify_title(info['title'])
3831 info['ext'] = info['url'].rpartition('.')[2]
3832 info['format'] = info['ext']
3833 self._downloader.increment_downloads()
3835 self._downloader.process_info(info)
3836 except UnavailableVideoError, err:
3837 self._downloader.trouble(u'\nERROR: unable to download video')
# Branch 2: only a course given -> scrape its page and recurse into every
# VideoPage link found there.
3838 elif mobj.group('course'): # A course page
3839 unescapeHTML = HTMLParser.HTMLParser().unescape
3841 course = mobj.group('course')
3843 'id': _simplify_title(course),
3847 self.report_download_webpage(info['id'])
3849 coursepage = urllib2.urlopen(url).read()
3850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3851 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
# Fall back to the id when the page carries no usable <h1> title.
3854 m = re.search('<h1>([^<]+)</h1>', coursepage)
3856 info['title'] = unescapeHTML(m.group(1))
3858 info['title'] = info['id']
3859 info['stitle'] = _simplify_title(info['title'])
3861 m = re.search('<description>([^<]+)</description>', coursepage)
3863 info['description'] = unescapeHTML(m.group(1))
3865 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3868 'type': 'reference',
3869 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3873 for entry in info['list']:
3874 assert entry['type'] == 'reference'
3875 self.extract(entry['url'])
# Branch 3: the site root -> recurse into every CoursePage link.
3877 unescapeHTML = HTMLParser.HTMLParser().unescape
3880 'id': 'Stanford OpenClassroom',
3884 self.report_download_webpage(info['id'])
3885 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3887 rootpage = urllib2.urlopen(rootURL).read()
3888 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3889 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3892 info['title'] = info['id']
3893 info['stitle'] = _simplify_title(info['title'])
3895 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3898 'type': 'reference',
3899 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3903 for entry in info['list']:
3904 assert entry['type'] == 'reference'
3905 self.extract(entry['url'])
# Extractor for mtv.com music videos.  Reads song/performer/uri/content-id
# from <meta> tags, then asks the mediaGen service for rendition URLs.
3907 class MTVIE(InfoExtractor):
3908 """Information extractor for MTV.com"""
3910 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3913 def report_webpage(self, video_id):
3914 """Report information extraction."""
3915 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3917 def report_extraction(self, video_id):
3918 """Report information extraction."""
3919 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3921 def _real_extract(self, url):
3922 mobj = re.match(self._VALID_URL, url)
3924 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize to http:// for urlopen.
3926 if not mobj.group('proto'):
3927 url = 'http://' + url
3928 video_id = mobj.group('videoid')
3929 self.report_webpage(video_id)
3931 request = urllib2.Request(url)
3933 webpage = urllib2.urlopen(request).read()
3934 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3935 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page is declared iso-8859-1; decode before unescaping entities.
3938 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3940 self._downloader.trouble(u'ERROR: unable to extract song name')
3942 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3943 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3945 self._downloader.trouble(u'ERROR: unable to extract performer')
3947 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3948 video_title = performer + ' - ' + song_name
3950 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3952 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3954 mtvn_uri = mobj.group(1)
3956 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3958 self._downloader.trouble(u'ERROR: unable to extract content id')
3960 content_id = mobj.group(1)
# Ask the mediaGen service for the rendition list of this video.
3962 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3963 self.report_extraction(video_id)
3964 request = urllib2.Request(videogen_url)
3966 metadataXml = urllib2.urlopen(request).read()
3967 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3968 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3971 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3972 renditions = mdoc.findall('.//rendition')
3974 # For now, always pick the highest quality.
3975 rendition = renditions[-1]
# Build a descriptive format string like "mp4-640x480_800".
3978 _,_,ext = rendition.attrib['type'].partition('/')
3979 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3980 video_url = rendition.find('./src').text
3982 self._downloader.trouble('Invalid rendition field.')
3985 self._downloader.increment_downloads()
3989 'uploader': performer,
3990 'title': video_title,
3991 'stitle': _simplify_title(video_title),
3997 self._downloader.process_info(info)
3998 except UnavailableVideoError, err:
3999 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for post-download processing steps.

    A PostProcessor is attached to a downloader via the downloader's
    add_post_processor() method.  After every successful download the
    downloader walks its chain of PostProcessors, first handing run() the
    initial info dictionary and then feeding each processor the value
    returned by the previous one.  The chain stops as soon as a run()
    returns None or the last processor has run.

    Like InfoExtractor objects, PostProcessors follow a "mutual
    registration" pattern with their downloader.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor on one finished download.

        The "information" argument is an InfoExtractor-style dictionary
        extended with a "filepath" entry naming the downloaded file.

        Returning None stops the postprocessing chain; returning a
        dictionary (possibly the received one with fields changed) passes
        it on to the next processor.  A PostProcessingError exception may
        be raised to signal a failure to the downloader.

        The base implementation is the identity: it passes the dictionary
        through untouched.
        """
        return information
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe fails while extracting or converting audio.

    BUG FIX: this previously derived from BaseException, which is reserved
    for interpreter-level exits (SystemExit, KeyboardInterrupt); as a
    BaseException it silently escaped any `except Exception` handler.
    Existing `except AudioConversionError` callers are unaffected.
    """

    def __init__(self, message):
        # Call the base constructor so str(err) and err.args work normally;
        # keep the legacy .message attribute for existing callers.
        Exception.__init__(self, message)
        self.message = message
# Post-processor that extracts the audio track of a downloaded video with
# ffmpeg/ffprobe.  (Class body continues beyond this excerpt.)
4052 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: target codec name or None for 'best' (keep/closest);
# preferredquality: bitrate passed to ffmpeg's -ab option;
# keepvideo: if True, do not delete the original video file afterwards.
4054 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4055 PostProcessor.__init__(self, downloader)
4056 if preferredcodec is None:
4057 preferredcodec = 'best'
4058 self._preferredcodec = preferredcodec
4059 self._preferredquality = preferredquality
4060 self._keepvideo = keepvideo
# Probe `path` with ffprobe and return the audio stream's codec name
# (parsed from "codec_name=" lines of `ffprobe -show_streams` output);
# the elided branches appear to return None on failure — confirm.
4063 def get_audio_codec(path):
4065 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4066 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4067 output = handle.communicate()[0]
4068 if handle.wait() != 0:
4070 except (IOError, OSError):
# Scan the key=value stream dump: remember the most recent codec_name and
# report it once the enclosing stream turns out to be the audio stream.
4073 for line in output.split('\n'):
4074 if line.startswith('codec_name='):
4075 audio_codec = line.split('=')[1].strip()
4076 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Transcode `path` into `out_path` with ffmpeg using `codec` plus the extra
# `more_opts` arguments; raises AudioConversionError on any failure.
# NOTE(review): the branch for codec is None is elided here — presumably it
# sets empty acodec_opts; confirm against the full source.
4081 def run_ffmpeg(path, out_path, codec, more_opts):
4085 acodec_opts = ['-acodec', codec]
4086 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4088 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4089 stdout,stderr = p.communicate()
4090 except (IOError, OSError):
4091 e = sys.exc_info()[1]
# errno 2 (ENOENT) means the ffmpeg binary itself is missing.
4092 if isinstance(e, OSError) and e.errno == 2:
4093 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
4096 if p.returncode != 0:
# Surface only the last stderr line; ffmpeg puts the actual error there.
4097 msg = stderr.strip().split('\n')[-1]
4098 raise AudioConversionError(msg)
4100 def run(self, information):
4101 path = information['filepath']
4103 filecodec = self.get_audio_codec(path)
4104 if filecodec is None:
4105 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4109 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4110 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4111 # Lossless, but in another container
4113 extension = self._preferredcodec
4114 more_opts = ['-absf', 'aac_adtstoasc']
4115 elif filecodec in ['aac', 'mp3', 'vorbis']:
4116 # Lossless if possible
4118 extension = filecodec
4119 if filecodec == 'aac':
4120 more_opts = ['-f', 'adts']
4121 if filecodec == 'vorbis':
4125 acodec = 'libmp3lame'
4128 if self._preferredquality is not None:
4129 more_opts += ['-ab', self._preferredquality]
4131 # We convert the audio (lossy)
4132 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4133 extension = self._preferredcodec
4135 if self._preferredquality is not None:
4136 more_opts += ['-ab', self._preferredquality]
4137 if self._preferredcodec == 'aac':
4138 more_opts += ['-f', 'adts']
4139 if self._preferredcodec == 'm4a':
4140 more_opts += ['-absf', 'aac_adtstoasc']
4141 if self._preferredcodec == 'vorbis':
4143 if self._preferredcodec == 'wav':
4145 more_opts += ['-f', 'wav']
4147 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4148 new_path = prefix + sep + extension
4149 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4151 self.run_ffmpeg(path, new_path, acodec, more_opts)
4153 etype,e,tb = sys.exc_info()
4154 if isinstance(e, AudioConversionError):
4155 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4157 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4160 # Try to update the date time for extracted audio file.
4161 if information.get('filetime') is not None:
4163 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4165 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4167 if not self._keepvideo:
4169 os.remove(_encodeFilename(path))
4170 except (IOError, OSError):
4171 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4174 information['filepath'] = new_path
# Self-update: overwrite this script file with the latest version fetched
# from UPDATE_URL. NOTE(review): try: lines and the exe-wrapper handling
# are elided in this excerpt; confirm against the full file.
4178 def updateSelf(downloader, filename):
4179 ''' Update the program file with the latest version from the repository '''
4180 # Note: downloader only used for options
4181 if not os.access(filename, os.W_OK):
4182 sys.exit('ERROR: no write permissions on %s' % filename)
4184 downloader.to_screen(u'Updating to latest version...')
# NOTE(review): Python 2 urllib.urlopen does not validate HTTPS
# certificates, so this code download is exposed to MITM substitution.
4188 urlh = urllib.urlopen(UPDATE_URL)
4189 newcontent = urlh.read()
# Skip rewriting the file when the remote __version__ matches ours.
4191 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4192 if vmatch is not None and vmatch.group(1) == __version__:
4193 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4197 except (IOError, OSError), err:
4198 sys.exit('ERROR: unable to download latest version')
4201 outf = open(filename, 'wb')
4203 outf.write(newcontent)
4206 except (IOError, OSError), err:
4207 sys.exit('ERROR: unable to overwrite current version')
4209 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
# Read extra command-line options from a config file, returning a (possibly
# empty) list of argv-style tokens. NOTE(review): the try:/except around
# open(), the result-list initialisation and the read loop/return lines are
# elided in this excerpt.
4212 def _readOptions(filename_bytes):
4214 optionf = open(filename_bytes)
# A missing config file is not an error: behave as "no extra options".
4216 return [] # silently skip if file is not present
# shlex.split honours shell-style quoting and strips '#' comments per line.
4220 res += shlex.split(l, comments=True)
# Custom optparse help renderer: joins the short form, long form and metavar,
# e.g. ('-o', '--output') -> "-o, --output TEMPLATE".
# NOTE(review): the opts-list initialisation line is elided in this excerpt.
4225 def _format_option_string(option):
4226 ''' ('-o', '--option') -> -o, --format METAVAR'''
4230 if option._short_opts: opts.append(option._short_opts[0])
4231 if option._long_opts: opts.append(option._long_opts[0])
# Insert the separator only when both a short and a long form were appended.
4232 if len(opts) > 1: opts.insert(1, ', ')
4234 if option.takes_value(): opts.append(' %s' % option.metavar)
4236 return "".join(opts)
# Best-effort terminal-width detection: the $COLUMNS environment variable
# first, then asking `stty size`. NOTE(review): the $COLUMNS early return,
# try:/except and the final fallback lines are elided in this excerpt.
4238 def _find_term_columns():
4239 columns = os.environ.get('COLUMNS', None)
4244 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4245 out,err = sp.communicate()
# `stty size` prints "rows cols"; the second field is the terminal width.
4246 return int(out.split()[1])
# Interior of parseOpts(): build the help formatter, the OptionParser and
# its option groups. NOTE(review): the enclosing `def parseOpts():` line,
# the max_width default/assignment and the opening of the kw dict are
# elided in this excerpt.
4252 max_help_position = 80
4254 # No need to wrap help messages if we're on a wide console
4255 columns = _find_term_columns()
4256 if columns: max_width = columns
4258 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4259 fmt.format_option_strings = _format_option_string
# conflict_handler='resolve' lets a later option silently take over a switch
# used earlier (below, -v is first --version and later --verbose).
4262 'version' : __version__,
4264 'usage' : '%prog [options] url [url...]',
4265 'conflict_handler' : 'resolve',
4268 parser = optparse.OptionParser(**kw)
# One OptionGroup per help section; they are attached to the parser later.
4271 general = optparse.OptionGroup(parser, 'General Options')
4272 selection = optparse.OptionGroup(parser, 'Video Selection')
4273 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4274 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4275 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4276 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4277 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# General options: help/version/self-update, error tolerance, rate limiting,
# retries, and introspection switches.
4279 general.add_option('-h', '--help',
4280 action='help', help='print this help text and exit')
4281 general.add_option('-v', '--version',
4282 action='version', help='print program version and exit')
4283 general.add_option('-U', '--update',
4284 action='store_true', dest='update_self', help='update this program to latest version')
4285 general.add_option('-i', '--ignore-errors',
4286 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4287 general.add_option('-r', '--rate-limit',
4288 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4289 general.add_option('-R', '--retries',
4290 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4291 general.add_option('--dump-user-agent',
4292 action='store_true', dest='dump_user_agent',
4293 help='display the current browser identification', default=False)
4294 general.add_option('--list-extractors',
4295 action='store_true', dest='list_extractors',
4296 help='List all supported extractors and the URLs they would handle', default=False)
# Video selection: playlist slicing (1-based start, -1 = last), title
# filtering and a cap on the number of downloads.
4298 selection.add_option('--playlist-start',
4299 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4300 selection.add_option('--playlist-end',
4301 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4302 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4303 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4304 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
# Authentication: explicit credentials, or ~/.netrc lookup via -n.
4306 authentication.add_option('-u', '--username',
4307 dest='username', metavar='USERNAME', help='account username')
4308 authentication.add_option('-p', '--password',
4309 dest='password', metavar='PASSWORD', help='account password')
4310 authentication.add_option('-n', '--netrc',
4311 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# Video format selection; note --all-formats stores the constant 'all' into
# the same dest ('format') as -f.
4314 video_format.add_option('-f', '--format',
4315 action='store', dest='format', metavar='FORMAT', help='video format code')
4316 video_format.add_option('--all-formats',
4317 action='store_const', dest='format', help='download all available video formats', const='all')
4318 video_format.add_option('--prefer-free-formats',
4319 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4320 video_format.add_option('--max-quality',
4321 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4322 video_format.add_option('-F', '--list-formats',
4323 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
# Verbosity / simulation: quiet modes, the get-* print-only switches, and
# progress display. The -v defined here (--verbose) takes over the switch
# from the earlier -v/--version via the parser's conflict_handler='resolve'.
4326 verbosity.add_option('-q', '--quiet',
4327 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4328 verbosity.add_option('-s', '--simulate',
4329 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4330 verbosity.add_option('--skip-download',
4331 action='store_true', dest='skip_download', help='do not download the video', default=False)
4332 verbosity.add_option('-g', '--get-url',
4333 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4334 verbosity.add_option('-e', '--get-title',
4335 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4336 verbosity.add_option('--get-thumbnail',
4337 action='store_true', dest='getthumbnail',
4338 help='simulate, quiet but print thumbnail URL', default=False)
4339 verbosity.add_option('--get-description',
4340 action='store_true', dest='getdescription',
4341 help='simulate, quiet but print video description', default=False)
4342 verbosity.add_option('--get-filename',
4343 action='store_true', dest='getfilename',
4344 help='simulate, quiet but print output filename', default=False)
4345 verbosity.add_option('--get-format',
4346 action='store_true', dest='getformat',
4347 help='simulate, quiet but print output format', default=False)
4348 verbosity.add_option('--no-progress',
4349 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4350 verbosity.add_option('--console-title',
4351 action='store_true', dest='consoletitle',
4352 help='display progress in console titlebar', default=False)
4353 verbosity.add_option('-v', '--verbose',
4354 action='store_true', dest='verbose', help='print various debugging information', default=False)
# Filesystem options: output naming, batch input, overwrite/resume policy,
# cookies, .part files, mtime handling and metadata side files.
4357 filesystem.add_option('-t', '--title',
4358 action='store_true', dest='usetitle', help='use title in file name', default=False)
4359 filesystem.add_option('-l', '--literal',
4360 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4361 filesystem.add_option('-A', '--auto-number',
4362 action='store_true', dest='autonumber',
4363 help='number downloaded files starting from 00000', default=False)
4364 filesystem.add_option('-o', '--output',
4365 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4366 filesystem.add_option('-a', '--batch-file',
4367 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4368 filesystem.add_option('-w', '--no-overwrites',
4369 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
# Resume is on by default; --no-continue stores False into the same dest.
4370 filesystem.add_option('-c', '--continue',
4371 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4372 filesystem.add_option('--no-continue',
4373 action='store_false', dest='continue_dl',
4374 help='do not resume partially downloaded files (restart from beginning)')
4375 filesystem.add_option('--cookies',
4376 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4377 filesystem.add_option('--no-part',
4378 action='store_true', dest='nopart', help='do not use .part files', default=False)
4379 filesystem.add_option('--no-mtime',
4380 action='store_false', dest='updatetime',
4381 help='do not use the Last-modified header to set the file modification time', default=True)
4382 filesystem.add_option('--write-description',
4383 action='store_true', dest='writedescription',
4384 help='write video description to a .description file', default=False)
4385 filesystem.add_option('--write-info-json',
4386 action='store_true', dest='writeinfojson',
4387 help='write video metadata to a .info.json file', default=False)
# Post-processing options, consumed by FFmpegExtractAudioPP. Note the
# default is '128K' while the help text says "128k" — equivalent to ffmpeg,
# but cosmetically inconsistent.
4390 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4391 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4392 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4393 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4394 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4395 help='ffmpeg audio bitrate specification, 128k by default')
4396 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4397 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Attach every group to the parser (this fixed order is the --help order),
# then prepend options from /etc and the per-user config file to the real
# argv before parsing. NOTE(review): the branch structure around the
# XDG_CONFIG_HOME check is elided in this excerpt.
4400 parser.add_option_group(general)
4401 parser.add_option_group(selection)
4402 parser.add_option_group(filesystem)
4403 parser.add_option_group(verbosity)
4404 parser.add_option_group(video_format)
4405 parser.add_option_group(authentication)
4406 parser.add_option_group(postproc)
4408 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4410 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
# Fallback when $XDG_CONFIG_HOME is unset: ~/.config/youtube-dl.conf
4412 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
# Config-file options come first so real command-line flags override them.
4413 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4414 opts, args = parser.parse_args(argv)
4416 return parser, opts, args
# Build the ordered list of InfoExtractor instances; order matters because
# the first extractor whose suitable() matches handles the URL.
# NOTE(review): most of the extractor-list entries and the return statement
# are elided in this excerpt; confirm against the full file.
4418 def gen_extractors():
4419 """ Return a list of an instance of every supported extractor.
4420 The order does matter; the first extractor matched is the one handling the URL.
# Shared base extractors that the search/playlist/user variants delegate to.
4422 youtube_ie = YoutubeIE()
4423 google_ie = GoogleIE()
4424 yahoo_ie = YahooIE()
4426 YoutubePlaylistIE(youtube_ie),
4427 YoutubeUserIE(youtube_ie),
4428 YoutubeSearchIE(youtube_ie),
4430 MetacafeIE(youtube_ie),
4433 GoogleSearchIE(google_ie),
4436 YahooSearchIE(yahoo_ie),
4449 StanfordOpenClassroomIE(),
# Interior of the main entry point: parse options, set up the cookie jar and
# collect URLs from the batch file plus positional args. NOTE(review): the
# enclosing function's def line and several try:/else: lines are elided.
4456 parser, opts, args = parseOpts()
4458 # Open appropriate CookieJar
4459 if opts.cookiefile is None:
4460 jar = cookielib.CookieJar()
4463 jar = cookielib.MozillaCookieJar(opts.cookiefile)
# Only load the file when it already exists and is readable; otherwise a
# fresh jar is written on save.
4464 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4466 except (IOError, OSError), err:
4467 sys.exit(u'ERROR: unable to open cookie file')
4470 if opts.dump_user_agent:
4471 print std_headers['User-Agent']
4474 # Batch file verification
4476 if opts.batchfile is not None:
4478 if opts.batchfile == '-':
4481 batchfd = open(opts.batchfile, 'r')
4482 batchurls = batchfd.readlines()
4483 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and lines starting with a comment marker (#, /, ;).
4484 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4486 sys.exit(u'ERROR: batch file could not be read')
4487 all_urls = batchurls + args
# Global urllib2 configuration: cookie processing, environment-derived
# proxies, the project's custom YoutubeDLHandler, and a global socket
# timeout applying to all subsequent network I/O.
4489 # General configuration
4490 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4491 proxy_handler = urllib2.ProxyHandler()
4492 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4493 urllib2.install_opener(opener)
4494 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4497 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4499 extractors = gen_extractors()
# --list-extractors: print each extractor and which of the given URLs it
# would claim, removing matched URLs so each is attributed only once.
4501 if opts.list_extractors:
4502 for ie in extractors:
4504 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4505 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4506 for mu in matchedUrls:
# Validate mutually-exclusive options and convert numeric option strings in
# place on `opts`. NOTE(review): several try: lines are elided in this
# excerpt (the excepts below imply them).
4510 # Conflicting, missing and erroneous options
4511 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4512 parser.error(u'using .netrc conflicts with giving username/password')
4513 if opts.password is not None and opts.username is None:
4514 parser.error(u'account username missing')
4515 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4516 parser.error(u'using output template conflicts with using title, literal title or auto number')
4517 if opts.usetitle and opts.useliteral:
4518 parser.error(u'using title conflicts with using literal title')
# Prompt interactively rather than requiring the password on the command
# line (where it would be visible in the process list / shell history).
4519 if opts.username is not None and opts.password is None:
4520 opts.password = getpass.getpass(u'Type account password and press return:')
4521 if opts.ratelimit is not None:
4522 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4523 if numeric_limit is None:
4524 parser.error(u'invalid rate limit specified')
4525 opts.ratelimit = numeric_limit
4526 if opts.retries is not None:
4528 opts.retries = long(opts.retries)
4529 except (TypeError, ValueError), err:
4530 parser.error(u'invalid retry count specified')
4532 opts.playliststart = int(opts.playliststart)
4533 if opts.playliststart <= 0:
4534 raise ValueError(u'Playlist start must be positive')
4535 except (TypeError, ValueError), err:
4536 parser.error(u'invalid playlist start number specified')
4538 opts.playlistend = int(opts.playlistend)
# -1 is the sentinel meaning "until the end of the playlist".
4539 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4540 raise ValueError(u'Playlist end must be greater than playlist start')
4541 except (TypeError, ValueError), err:
4542 parser.error(u'invalid playlist end number specified')
4543 if opts.extractaudio:
4544 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4545 parser.error(u'invalid audio format specified')
# Construct the FileDownloader with its option dictionary. Any of the get-*
# options implies quiet mode and skipping the actual download (print-only).
4548 fd = FileDownloader({
4549 'usenetrc': opts.usenetrc,
4550 'username': opts.username,
4551 'password': opts.password,
4552 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4553 'forceurl': opts.geturl,
4554 'forcetitle': opts.gettitle,
4555 'forcethumbnail': opts.getthumbnail,
4556 'forcedescription': opts.getdescription,
4557 'forcefilename': opts.getfilename,
4558 'forceformat': opts.getformat,
4559 'simulate': opts.simulate,
4560 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4561 'format': opts.format,
4562 'format_limit': opts.format_limit,
4563 'listformats': opts.listformats,
# Output template: an explicit -o wins; otherwise the first truthy entry of
# this or-cascade of defaults applies, ending at plain '%(id)s.%(ext)s'.
4564 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4565 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4566 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4567 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4568 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4569 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4570 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4571 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4572 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4573 or u'%(id)s.%(ext)s'),
4574 'ignoreerrors': opts.ignoreerrors,
4575 'ratelimit': opts.ratelimit,
4576 'nooverwrites': opts.nooverwrites,
4577 'retries': opts.retries,
4578 'continuedl': opts.continue_dl,
4579 'noprogress': opts.noprogress,
4580 'playliststart': opts.playliststart,
4581 'playlistend': opts.playlistend,
# '-o -' streams the video to stdout, so logging must go to stderr instead.
4582 'logtostderr': opts.outtmpl == '-',
4583 'consoletitle': opts.consoletitle,
4584 'nopart': opts.nopart,
4585 'updatetime': opts.updatetime,
4586 'writedescription': opts.writedescription,
4587 'writeinfojson': opts.writeinfojson,
4588 'matchtitle': opts.matchtitle,
4589 'rejecttitle': opts.rejecttitle,
4590 'max_downloads': opts.max_downloads,
4591 'prefer_free_formats': opts.prefer_free_formats,
4597 if opts.extractaudio:
4598 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4601 if opts.update_self:
4602 updateSelf(fd, sys.argv[0])
4605 if len(all_urls) < 1:
4606 if not opts.update_self:
4607 parser.error(u'you must provide at least one URL')
4612 retcode = fd.download(all_urls)
4613 except MaxDownloadsReached:
4614 fd.to_screen(u'--max-download limit reached, aborting.')
4617 # Dump cookie jar if requested
4618 if opts.cookiefile is not None:
4621 except (IOError, OSError), err:
4622 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level exception handling for the program entry point: translate known
# errors into exit messages. NOTE(review): the enclosing def, the try: and
# the guarded call are elided in this excerpt; DownloadError's handling
# body is also elided.
4629 except DownloadError:
4631 except SameFileError:
4632 sys.exit(u'ERROR: fixed output name but more than one file to download')
4633 except KeyboardInterrupt:
4634 sys.exit(u'\nERROR: Interrupted by user')
# Script entry guard; the guarded call body (lines 4637-4638) is elided in
# this excerpt.
4636 if __name__ == '__main__':
4639 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: