2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.02.26'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
56 except ImportError: # Python 2.4
59 import cStringIO as StringIO
63 # parse_qs was moved from the cgi module to the urlparse module recently.
65 from urlparse import parse_qs
67 from cgi import parse_qs
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
89 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
95 def raiseError(msg, i):
96 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
97 def skipSpace(i, expectMore=True):
98 while i < len(s) and s[i] in ' \t\r\n':
102 raiseError('Premature end', i)
104 def decodeEscape(match):
120 return unichr(int(esc[1:5], 16))
121 if len(esc) == 5+6 and esc[5:7] == '\\u':
122 hi = int(esc[1:5], 16)
123 low = int(esc[7:11], 16)
124 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
125 raise ValueError('Unknown escape ' + str(esc))
132 while s[e-bslashes-1] == '\\':
134 if bslashes % 2 == 1:
138 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
139 stri = rexp.sub(decodeEscape, s[i:e])
145 if s[i] == '}': # Empty dictionary
149 raiseError('Expected a string object key', i)
150 i,key = parseString(i)
152 if i >= len(s) or s[i] != ':':
153 raiseError('Expected a colon', i)
160 raiseError('Expected comma or closing curly brace', i)
165 if s[i] == ']': # Empty array
170 i = skipSpace(i) # Raise exception if premature end
174 raiseError('Expected a comma or closing bracket', i)
176 def parseDiscrete(i):
177 for k,v in {'true': True, 'false': False, 'null': None}.items():
178 if s.startswith(k, i):
180 raiseError('Not a boolean (or null)', i)
182 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
184 raiseError('Not a number', i)
186 if '.' in nums or 'e' in nums or 'E' in nums:
187 return (i+len(nums), float(nums))
188 return (i+len(nums), int(nums))
189 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
192 i,res = CHARMAP.get(s[i], parseNumber)(i)
193 i = skipSpace(i, False)
197 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
200 def preferredencoding():
201 """Get preferred encoding.
203 Returns the best encoding scheme for the system, based on
204 locale.getpreferredencoding() and some further tweaks.
206 def yield_preferredencoding():
208 pref = locale.getpreferredencoding()
214 return yield_preferredencoding().next()
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
223 entity = matchobj.group(1)
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
235 numstr = u'0%s' % numstr
238 return unichr(long(numstr, base))
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
250 def sanitize_open(filename, open_mode):
251 """Try to open the given filename, and slightly tweak it if this fails.
253 Attempts to open the given filename. If this fails, it tries to change
254 the filename slightly, step by step, until it's either able to open it
255 or it fails and raises a final exception, like the standard open()
258 It returns the tuple (stream, definitive_file_name).
262 if sys.platform == 'win32':
264 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265 return (sys.stdout, filename)
266 stream = open(_encodeFilename(filename), open_mode)
267 return (stream, filename)
268 except (IOError, OSError), err:
269 # In case of error, try to remove win32 forbidden chars
270 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
272 # An exception here should be caught in the caller
273 stream = open(_encodeFilename(filename), open_mode)
274 return (stream, filename)
277 def timeconvert(timestr):
278 """Convert RFC 2822 defined time string into system timestamp"""
280 timetuple = email.utils.parsedate_tz(timestr)
281 if timetuple is not None:
282 timestamp = email.utils.mktime_tz(timetuple)
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
297 def _unescapeHTML(s):
299 @param s a string (of type unicode)
301 assert type(s) == type(u'')
303 htmlParser = HTMLParser.HTMLParser()
304 return htmlParser.unescape(s)
306 def _encodeFilename(s):
308 @param s The name of the file (of type unicode)
311 assert type(s) == type(u'')
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
321 class DownloadError(Exception):
322 """Download Error exception.
324 This exception may be thrown by FileDownloader objects if they are not
325 configured to continue on errors. They will contain the appropriate
331 class SameFileError(Exception):
332 """Same File exception.
334 This exception will be thrown by FileDownloader objects if they detect
335 multiple files would have to be downloaded to the same file on disk.
340 class PostProcessingError(Exception):
341 """Post Processing exception.
343 This exception may be raised by PostProcessor's .run() method to
344 indicate an error in the postprocessing task.
348 class MaxDownloadsReached(Exception):
349 """ --max-downloads limit has been reached. """
353 class UnavailableVideoError(Exception):
354 """Unavailable Format exception.
356 This exception will be thrown when a video is requested
357 in a format that is not available for that video.
362 class ContentTooShortError(Exception):
363 """Content Too Short exception.
365 This exception may be raised by FileDownloader objects when a file they
366 download is too small for what the server announced first, indicating
367 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    """Record how many bytes actually arrived versus how many the
    server announced, for later error reporting."""
    self.expected = expected
    self.downloaded = downloaded
378 class YoutubeDLHandler(urllib2.HTTPHandler):
379 """Handler for HTTP requests and responses.
381 This class, when installed with an OpenerDirector, automatically adds
382 the standard headers to every HTTP request and handles gzipped and
383 deflated responses from web servers. If compression is to be avoided in
384 a particular request, the original request in the program code only has
385 to include the HTTP header "Youtubedl-No-Compression", which will be
386 removed before making the real request.
388 Part of this code was copied from:
390 http://techknack.net/python-urllib2-handlers/
392 Andrew Rowls, the author of that code, agreed to release it to the
399 return zlib.decompress(data, -zlib.MAX_WBITS)
401 return zlib.decompress(data)
def addinfourl_wrapper(stream, headers, url, code):
    # Compatibility shim: newer urllib2.addinfourl objects expose
    # .getcode() and accept the HTTP status code as a constructor
    # argument; older ones do not take it at construction time.
    if hasattr(urllib2.addinfourl, 'getcode'):
        return urllib2.addinfourl(stream, headers, url, code)
    ret = urllib2.addinfourl(stream, headers, url)
411 def http_request(self, req):
412 for h in std_headers:
415 req.add_header(h, std_headers[h])
416 if 'Youtubedl-no-compression' in req.headers:
417 if 'Accept-encoding' in req.headers:
418 del req.headers['Accept-encoding']
419 del req.headers['Youtubedl-no-compression']
422 def http_response(self, req, resp):
425 if resp.headers.get('Content-encoding', '') == 'gzip':
426 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428 resp.msg = old_resp.msg
430 if resp.headers.get('Content-encoding', '') == 'deflate':
431 gz = StringIO.StringIO(self.deflate(resp.read()))
432 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433 resp.msg = old_resp.msg
437 class FileDownloader(object):
438 """File Downloader class.
440 File downloader objects are the ones responsible of downloading the
441 actual video file and writing it to disk if the user has requested
442 it, among some other tasks. In most cases there should be one per
443 program. As, given a video URL, the downloader doesn't know how to
444 extract all the needed information, task that InfoExtractors do, it
445 has to pass the URL to one of them.
447 For this, file downloader objects have a method that allows
448 InfoExtractors to be registered in a given order. When it is passed
449 a URL, the file downloader handles it to the first InfoExtractor it
450 finds that reports being able to handle it. The InfoExtractor extracts
451 all the information about the video or videos the URL refers to, and
452 asks the FileDownloader to process the video information, possibly
453 downloading the video.
455 File downloaders accept a lot of parameters. In order not to saturate
456 the object constructor with arguments, it receives a dictionary of
457 options instead. These options are available through the params
458 attribute for the InfoExtractors to use. The FileDownloader also
459 registers itself as the downloader in charge for the InfoExtractors
460 that are added to it, so this is a "mutual registration".
464 username: Username for authentication purposes.
465 password: Password for authentication purposes.
466 usenetrc: Use netrc for authentication instead.
467 quiet: Do not print messages to stdout.
468 forceurl: Force printing final URL.
469 forcetitle: Force printing title.
470 forcethumbnail: Force printing thumbnail URL.
471 forcedescription: Force printing description.
472 forcefilename: Force printing final filename.
473 simulate: Do not download the video files.
474 format: Video format code.
475 format_limit: Highest quality format to try.
476 outtmpl: Template for output names.
477 ignoreerrors: Do not stop on download errors.
478 ratelimit: Download speed limit, in bytes/sec.
479 nooverwrites: Prevent overwriting files.
480 retries: Number of times to retry for HTTP error 5xx
481 continuedl: Try to continue downloads if possible.
482 noprogress: Do not print the progress bar.
483 playliststart: Playlist item to start at.
484 playlistend: Playlist item to end at.
485 matchtitle: Download only matching titles.
486 rejecttitle: Reject downloads for matching titles.
487 logtostderr: Log messages to stderr instead of stdout.
488 consoletitle: Display progress in console window's titlebar.
489 nopart: Do not use temporary .part files.
490 updatetime: Use the Last-modified header to set output file timestamps.
491 writedescription: Write the video description to a .description file
492 writeinfojson: Write the video description to a .info.json file
498 _download_retcode = None
499 _num_downloads = None
502 def __init__(self, params):
503 """Create a FileDownloader object with the given options."""
506 self._download_retcode = 0
507 self._num_downloads = 0
508 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
512 def format_bytes(bytes):
515 if type(bytes) is str:
520 exponent = long(math.log(bytes, 1024.0))
521 suffix = 'bkMGTPEZY'[exponent]
522 converted = float(bytes) / float(1024 ** exponent)
523 return '%.2f%s' % (converted, suffix)
526 def calc_percent(byte_counter, data_len):
529 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
532 def calc_eta(start, now, total, current):
536 if current == 0 or dif < 0.001: # One millisecond
538 rate = float(current) / dif
539 eta = long((float(total) - float(current)) / rate)
540 (eta_mins, eta_secs) = divmod(eta, 60)
543 return '%02d:%02d' % (eta_mins, eta_secs)
546 def calc_speed(start, now, bytes):
548 if bytes == 0 or dif < 0.001: # One millisecond
549 return '%10s' % '---b/s'
550 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
553 def best_block_size(elapsed_time, bytes):
554 new_min = max(bytes / 2.0, 1.0)
555 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
556 if elapsed_time < 0.001:
558 rate = bytes / elapsed_time
566 def parse_bytes(bytestr):
567 """Parse a string indicating a byte quantity into a long integer."""
568 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
571 number = float(matchobj.group(1))
572 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
573 return long(round(number * multiplier))
575 def add_info_extractor(self, ie):
576 """Add an InfoExtractor object to the end of the list."""
578 ie.set_downloader(self)
580 def add_post_processor(self, pp):
581 """Add a PostProcessor object to the end of the chain."""
583 pp.set_downloader(self)
585 def to_screen(self, message, skip_eol=False):
586 """Print message to stdout if not in quiet mode."""
587 assert type(message) == type(u'')
588 if not self.params.get('quiet', False):
589 terminator = [u'\n', u''][skip_eol]
590 output = message + terminator
592 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
593 output = output.encode(preferredencoding(), 'ignore')
594 self._screen_file.write(output)
595 self._screen_file.flush()
def to_stderr(self, message):
    """Write message (plus a trailing newline) to stderr, encoded in
    the system's preferred encoding."""
    sys.stderr.write(message.encode(preferredencoding()) + '\n')
601 def to_cons_title(self, message):
602 """Set console/terminal window title to message."""
603 if not self.params.get('consoletitle', False):
605 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
606 # c_wchar_p() might not be necessary if `message` is
607 # already of type unicode()
608 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
609 elif 'TERM' in os.environ:
610 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
def fixed_template(self):
    """Return True if the output template contains no %(field)s
    placeholders, i.e. every download would go to the same file."""
    # Plain r'' instead of ur'' for Python 3 forward compatibility
    # (the ur'' prefix was removed in 3.x); the inline (?u) flag keeps
    # the pattern Unicode-aware on Python 2 either way.
    return (re.search(r'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message)
    # Only reached when errors are being ignored: remember that at
    # least one download failed so the process can exit non-zero.
    self._download_retcode = 1
629 def slow_down(self, start_time, byte_counter):
630 """Sleep if the download speed is over the rate limit."""
631 rate_limit = self.params.get('ratelimit', None)
632 if rate_limit is None or byte_counter == 0:
635 elapsed = now - start_time
638 speed = float(byte_counter) / elapsed
639 if speed > rate_limit:
640 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
642 def temp_name(self, filename):
643 """Returns a temporary filename for the given filename."""
644 if self.params.get('nopart', False) or filename == u'-' or \
645 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
647 return filename + u'.part'
649 def undo_temp_name(self, filename):
650 if filename.endswith(u'.part'):
651 return filename[:-len(u'.part')]
654 def try_rename(self, old_filename, new_filename):
656 if old_filename == new_filename:
658 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
659 except (IOError, OSError), err:
660 self.trouble(u'ERROR: unable to rename file')
662 def try_utime(self, filename, last_modified_hdr):
663 """Try to set the last-modified time of the given file."""
664 if last_modified_hdr is None:
666 if not os.path.isfile(_encodeFilename(filename)):
668 timestr = last_modified_hdr
671 filetime = timeconvert(timestr)
675 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce that the video description file is about to be written."""
    msg = u'[info] Writing video description to: ' + descfn
    self.to_screen(msg)
def report_writeinfojson(self, infofn):
    """Announce that the video metadata is being written as JSON."""
    msg = u'[info] Video description metadata as JSON to: ' + infofn
    self.to_screen(msg)
def report_destination(self, filename):
    """Announce the final output filename."""
    dest_msg = u'[download] Destination: ' + filename
    self.to_screen(dest_msg)
692 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
693 """Report download progress."""
694 if self.params.get('noprogress', False):
696 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
697 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
698 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
699 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce an attempt to resume the download at the given byte."""
    msg = u'[download] Resuming download at byte %s' % (resume_len,)
    self.to_screen(msg)
def report_retry(self, count, retries):
    """Announce a retry after an HTTP 5xx server error."""
    msg = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(msg)
709 def report_file_already_downloaded(self, file_name):
710 """Report file has already been fully downloaded."""
712 self.to_screen(u'[download] %s has already been downloaded' % file_name)
713 except (UnicodeEncodeError), err:
714 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Tell the user that resuming the download was not possible."""
    self.to_screen(u'[download] Unable to resume')
720 def report_finish(self):
721 """Report download finished."""
722 if self.params.get('noprogress', False):
723 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Advance the per-session ordinal used to number downloaded files."""
    self._num_downloads = self._num_downloads + 1
731 def prepare_filename(self, info_dict):
732 """Generate the output filename."""
734 template_dict = dict(info_dict)
735 template_dict['epoch'] = unicode(long(time.time()))
736 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
737 filename = self.params['outtmpl'] % template_dict
739 except (ValueError, KeyError), err:
740 self.trouble(u'ERROR: invalid system charset or erroneous output template')
def _match_entry(self, info_dict):
    """Return None iff the file should be downloaded.

    Otherwise return a human-readable reason string explaining why the
    title was filtered out (by --match-title or --reject-title).
    """
    title = info_dict['title']
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
        # No u'[download] ' prefix here: process_info() already
        # prepends that tag when it prints the reason, so including it
        # produced the tag twice (and the reject branch below never
        # carried it).
        return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
        return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
755 def process_info(self, info_dict):
756 """Process a single dictionary returned by an InfoExtractor."""
758 reason = self._match_entry(info_dict)
759 if reason is not None:
760 self.to_screen(u'[download] ' + reason)
763 max_downloads = self.params.get('max_downloads')
764 if max_downloads is not None:
765 if self._num_downloads > int(max_downloads):
766 raise MaxDownloadsReached()
768 filename = self.prepare_filename(info_dict)
771 if self.params.get('forcetitle', False):
772 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
773 if self.params.get('forceurl', False):
774 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
775 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
776 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
777 if self.params.get('forcedescription', False) and 'description' in info_dict:
778 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
779 if self.params.get('forcefilename', False) and filename is not None:
780 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
781 if self.params.get('forceformat', False):
782 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
784 # Do nothing else if in simulate mode
785 if self.params.get('simulate', False):
792 dn = os.path.dirname(_encodeFilename(filename))
793 if dn != '' and not os.path.exists(dn): # dn is already encoded
795 except (OSError, IOError), err:
796 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
799 if self.params.get('writedescription', False):
801 descfn = filename + u'.description'
802 self.report_writedescription(descfn)
803 descfile = open(_encodeFilename(descfn), 'wb')
805 descfile.write(info_dict['description'].encode('utf-8'))
808 except (OSError, IOError):
809 self.trouble(u'ERROR: Cannot write description file ' + descfn)
812 if self.params.get('writeinfojson', False):
813 infofn = filename + u'.info.json'
814 self.report_writeinfojson(infofn)
817 except (NameError,AttributeError):
818 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
821 infof = open(_encodeFilename(infofn), 'wb')
823 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
824 json.dump(json_info_dict, infof)
827 except (OSError, IOError):
828 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
831 if not self.params.get('skip_download', False):
832 if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
836 success = self._do_download(filename, info_dict)
837 except (OSError, IOError), err:
838 raise UnavailableVideoError
839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
840 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
842 except (ContentTooShortError, ), err:
843 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
848 self.post_process(filename, info_dict)
849 except (PostProcessingError), err:
850 self.trouble(u'ERROR: postprocessing: %s' % str(err))
853 def download(self, url_list):
854 """Download a given list of URLs."""
855 if len(url_list) > 1 and self.fixed_template():
856 raise SameFileError(self.params['outtmpl'])
859 suitable_found = False
861 # Go to next InfoExtractor if not suitable
862 if not ie.suitable(url):
865 # Suitable InfoExtractor found
866 suitable_found = True
868 # Extract information from URL and process it
871 # Suitable InfoExtractor had been found; go to next URL
874 if not suitable_found:
875 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
877 return self._download_retcode
879 def post_process(self, filename, ie_info):
880 """Run the postprocessing chain on the given file."""
882 info['filepath'] = filename
888 def _download_with_rtmpdump(self, filename, url, player_url):
889 self.report_destination(filename)
890 tmpfilename = self.temp_name(filename)
892 # Check for rtmpdump first
894 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
895 except (OSError, IOError):
896 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
899 # Download using rtmpdump. rtmpdump returns exit code 2 when
900 # the connection was interrumpted and resuming appears to be
901 # possible. This is part of rtmpdump's normal usage, AFAIK.
902 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
903 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
904 if self.params['verbose']:
907 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
910 self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
911 retval = subprocess.call(args)
912 while retval == 2 or retval == 1:
913 prevsize = os.path.getsize(_encodeFilename(tmpfilename))
914 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
915 time.sleep(5.0) # This seems to be needed
916 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
917 cursize = os.path.getsize(_encodeFilename(tmpfilename))
918 if prevsize == cursize and retval == 1:
920 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
921 if prevsize == cursize and retval == 2 and cursize > 1024:
922 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
926 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
927 self.try_rename(tmpfilename, filename)
930 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
933 def _do_download(self, filename, info_dict):
934 url = info_dict['url']
935 player_url = info_dict.get('player_url', None)
937 # Check file already present
938 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
939 self.report_file_already_downloaded(filename)
942 # Attempt to download using rtmpdump
943 if url.startswith('rtmp'):
944 return self._download_with_rtmpdump(filename, url, player_url)
946 tmpfilename = self.temp_name(filename)
949 # Do not include the Accept-Encoding header
950 headers = {'Youtubedl-no-compression': 'True'}
951 basic_request = urllib2.Request(url, None, headers)
952 request = urllib2.Request(url, None, headers)
954 # Establish possible resume length
955 if os.path.isfile(_encodeFilename(tmpfilename)):
956 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
962 if self.params.get('continuedl', False):
963 self.report_resuming_byte(resume_len)
964 request.add_header('Range','bytes=%d-' % resume_len)
970 retries = self.params.get('retries', 0)
971 while count <= retries:
972 # Establish connection
974 if count == 0 and 'urlhandle' in info_dict:
975 data = info_dict['urlhandle']
976 data = urllib2.urlopen(request)
978 except (urllib2.HTTPError, ), err:
979 if (err.code < 500 or err.code >= 600) and err.code != 416:
980 # Unexpected HTTP error
982 elif err.code == 416:
983 # Unable to resume (requested range not satisfiable)
985 # Open the connection again without the range header
986 data = urllib2.urlopen(basic_request)
987 content_length = data.info()['Content-Length']
988 except (urllib2.HTTPError, ), err:
989 if err.code < 500 or err.code >= 600:
992 # Examine the reported length
993 if (content_length is not None and
994 (resume_len - 100 < long(content_length) < resume_len + 100)):
995 # The file had already been fully downloaded.
996 # Explanation to the above condition: in issue #175 it was revealed that
997 # YouTube sometimes adds or removes a few bytes from the end of the file,
998 # changing the file size slightly and causing problems for some users. So
999 # I decided to implement a suggested change and consider the file
1000 # completely downloaded if the file size differs less than 100 bytes from
1001 # the one in the hard drive.
1002 self.report_file_already_downloaded(filename)
1003 self.try_rename(tmpfilename, filename)
1006 # The length does not match, we start the download over
1007 self.report_unable_to_resume()
1012 if count <= retries:
1013 self.report_retry(count, retries)
1016 self.trouble(u'ERROR: giving up after %s retries' % retries)
1019 data_len = data.info().get('Content-length', None)
1020 if data_len is not None:
1021 data_len = long(data_len) + resume_len
1022 data_len_str = self.format_bytes(data_len)
1023 byte_counter = 0 + resume_len
1027 # Download and write
1028 before = time.time()
1029 data_block = data.read(block_size)
1031 if len(data_block) == 0:
1033 byte_counter += len(data_block)
1035 # Open file just in time
1038 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1039 assert stream is not None
1040 filename = self.undo_temp_name(tmpfilename)
1041 self.report_destination(filename)
1042 except (OSError, IOError), err:
1043 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1046 stream.write(data_block)
1047 except (IOError, OSError), err:
1048 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1050 block_size = self.best_block_size(after - before, len(data_block))
1053 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1054 if data_len is None:
1055 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1057 percent_str = self.calc_percent(byte_counter, data_len)
1058 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1059 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1062 self.slow_down(start, byte_counter - resume_len)
1065 self.trouble(u'\nERROR: Did not get any data blocks')
1068 self.report_finish()
1069 if data_len is not None and byte_counter != data_len:
1070 raise ContentTooShortError(byte_counter, long(data_len))
1071 self.try_rename(tmpfilename, filename)
1073 # Update file modification time
1074 if self.params.get('updatetime', True):
1075 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1080 class InfoExtractor(object):
1081 """Information Extractor class.
1083 Information extractors are the classes that, given a URL, extract
1084 information from the video (or videos) the URL refers to. This
1085 information includes the real video URL, the video title and simplified
1086 title, author and others. The information is stored in a dictionary
1087 which is then passed to the FileDownloader. The FileDownloader
1088 processes this information possibly downloading the video to the file
1089 system, among other possible outcomes. The dictionaries must include
1090 the following fields:
1092 id: Video identifier.
1093 url: Final video URL.
1094 uploader: Nickname of the video uploader.
1095 title: Literal title.
1096 stitle: Simplified title.
1097 ext: Video filename extension.
1098 format: Video format.
1099 player_url: SWF Player URL (may be None).
1101 The following fields are optional. Their primary purpose is to allow
1102 youtube-dl to serve as the backend for a video search function, such
1103 as the one in youtube2mp3. They are only used when their respective
1104 forced printing functions are called:
1106 thumbnail: Full URL to a video thumbnail image.
1107 description: One-line video description.
1109 Subclasses of this one should re-define the _real_initialize() and
1110 _real_extract() methods and define a _VALID_URL regexp.
1111 Probably, they should also be added to the list of extractors.
1117 def __init__(self, downloader=None):
1118 """Constructor. Receives an optional downloader."""
1120 self.set_downloader(downloader)
1122 def suitable(self, url):
1123 """Receives a URL and returns True if suitable for this IE."""
1124 return re.match(self._VALID_URL, url) is not None
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Calls the subclass hook.  NOTE(review): the lines guarding this call
    # (a run-once "ready" flag, per the surrounding structure) appear to be
    # elided from this excerpt — confirm against the full file.
    self._real_initialize()

def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # NOTE(review): a call to self.initialize() appears to be elided here.
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* to this IE for later use by extraction/report methods."""
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # Intentionally a no-op hook in the base class.

def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    # Intentionally a no-op hook in the base class.
1150 class YoutubeIE(InfoExtractor):
1151 """Information extractor for youtube.com."""
# Matches youtu.be short links plus youtube(-nocookie).com watch/embed/v/e
# URLs; the video id is captured by the second group (see _real_extract).
_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
# Visiting this URL pins the site to English/US for stable scraping.
_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name looked up in ~/.netrc when the usenetrc option is set.
_NETRC_MACHINE = 'youtube'
# Listed in order of quality
_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension; most entries are elided from this excerpt.
_video_extensions = {
    '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> dimension string; entries elided from this excerpt.
_video_dimensions = {
IE_NAME = u'youtube'
def report_lang(self):
    """Announce that the site language is being set."""
    self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
    """Announce the login attempt."""
    self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
    """Announce the age-confirmation attempt."""
    self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Announce the download of the video-info page for *video_id*."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    """Announce that *format* is not offered for *video_id*."""
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Announce that the stream will be fetched over RTMP."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
def _print_formats(self, formats):
    # Print each itag with its container extension and display dimensions,
    # falling back to 'flv' / '???' for unknown itags.
    # NOTE(review): the loop header over *formats* binding x is elided from
    # this excerpt — confirm against the full file.  (Python 2 print.)
    print 'Available formats:'
    print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1226 def _real_initialize(self):
1227 if self._downloader is None:
1232 downloader_params = self._downloader.params
1234 # Attempt to use provided username and password or .netrc data
1235 if downloader_params.get('username', None) is not None:
1236 username = downloader_params['username']
1237 password = downloader_params['password']
1238 elif downloader_params.get('usenetrc', False):
1240 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1241 if info is not None:
1245 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1246 except (IOError, netrc.NetrcParseError), err:
1247 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1251 request = urllib2.Request(self._LANG_URL)
1254 urllib2.urlopen(request).read()
1255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1256 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1259 # No authentication to be performed
1260 if username is None:
1265 'current_form': 'loginForm',
1267 'action_login': 'Log In',
1268 'username': username,
1269 'password': password,
1271 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1274 login_results = urllib2.urlopen(request).read()
1275 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1276 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1278 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1279 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1285 'action_confirm': 'Confirm',
1287 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1289 self.report_age_confirmation()
1290 age_results = urllib2.urlopen(request).read()
1291 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1292 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1295 def _real_extract(self, url):
1296 # Extract video id from URL
1297 mobj = re.match(self._VALID_URL, url)
1299 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1301 video_id = mobj.group(2)
1304 self.report_video_webpage_download(video_id)
1305 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1307 video_webpage = urllib2.urlopen(request).read()
1308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1309 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1312 # Attempt to extract SWF player URL
1313 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1314 if mobj is not None:
1315 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1320 self.report_video_info_webpage_download(video_id)
1321 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1322 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1323 % (video_id, el_type))
1324 request = urllib2.Request(video_info_url)
1326 video_info_webpage = urllib2.urlopen(request).read()
1327 video_info = parse_qs(video_info_webpage)
1328 if 'token' in video_info:
1330 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1331 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1333 if 'token' not in video_info:
1334 if 'reason' in video_info:
1335 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1337 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1340 # Start extracting information
1341 self.report_information_extraction(video_id)
1344 if 'author' not in video_info:
1345 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1347 video_uploader = urllib.unquote_plus(video_info['author'][0])
1350 if 'title' not in video_info:
1351 self._downloader.trouble(u'ERROR: unable to extract video title')
1353 video_title = urllib.unquote_plus(video_info['title'][0])
1354 video_title = video_title.decode('utf-8')
1355 video_title = sanitize_title(video_title)
1358 simple_title = _simplify_title(video_title)
1361 if 'thumbnail_url' not in video_info:
1362 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1363 video_thumbnail = ''
1364 else: # don't panic if we can't find it
1365 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1369 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1370 if mobj is not None:
1371 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1372 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1373 for expression in format_expressions:
1375 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1383 video_description = u'No description available.'
1384 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1385 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1386 if mobj is not None:
1387 video_description = mobj.group(1).decode('utf-8')
1389 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1390 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1391 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1392 # TODO use another parser
1395 video_token = urllib.unquote_plus(video_info['token'][0])
1397 # Decide which formats to download
1398 req_format = self._downloader.params.get('format', None)
1400 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1401 self.report_rtmp_download()
1402 video_url_list = [(None, video_info['conn'][0])]
1403 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1404 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1405 url_data = [parse_qs(uds) for uds in url_data_strs]
1406 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1407 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1409 format_limit = self._downloader.params.get('format_limit', None)
1410 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1411 if format_limit is not None and format_limit in available_formats:
1412 format_list = available_formats[available_formats.index(format_limit):]
1414 format_list = available_formats
1415 existing_formats = [x for x in format_list if x in url_map]
1416 if len(existing_formats) == 0:
1417 self._downloader.trouble(u'ERROR: no known formats available for video')
1419 if self._downloader.params.get('listformats', None):
1420 self._print_formats(existing_formats)
1422 if req_format is None or req_format == 'best':
1423 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1424 elif req_format == 'worst':
1425 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1426 elif req_format in ('-1', 'all'):
1427 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1429 # Specific formats. We pick the first in a slash-delimeted sequence.
1430 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1431 req_formats = req_format.split('/')
1432 video_url_list = None
1433 for rf in req_formats:
1435 video_url_list = [(rf, url_map[rf])]
1437 if video_url_list is None:
1438 self._downloader.trouble(u'ERROR: requested format not available')
1441 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1444 for format_param, video_real_url in video_url_list:
1445 # At this point we have a new video
1446 self._downloader.increment_downloads()
1449 video_extension = self._video_extensions.get(format_param, 'flv')
1452 # Process video information
1453 self._downloader.process_info({
1454 'id': video_id.decode('utf-8'),
1455 'url': video_real_url.decode('utf-8'),
1456 'uploader': video_uploader.decode('utf-8'),
1457 'upload_date': upload_date,
1458 'title': video_title,
1459 'stitle': simple_title,
1460 'ext': video_extension.decode('utf-8'),
1461 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1462 'thumbnail': video_thumbnail.decode('utf-8'),
1463 'description': video_description,
1464 'player_url': player_url,
1466 except UnavailableVideoError, err:
1467 self._downloader.trouble(u'\nERROR: unable to download video')
1470 class MetacafeIE(InfoExtractor):
1471 """Information Extractor for metacafe.com."""
1473 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1474 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1475 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1477 IE_NAME = u'metacafe'
def __init__(self, youtube_ie, downloader=None):
    # Metacafe pages can wrap YouTube videos ("yt-<id>" ids); keep a
    # YoutubeIE instance around so _real_extract can delegate to it.
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
def report_disclaimer(self):
    """Announce retrieval of the family-filter disclaimer page."""
    self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

def report_age_confirmation(self):
    """Announce the age-confirmation attempt."""
    self._downloader.to_screen(u'[metacafe] Confirming age')

def report_download_webpage(self, video_id):
    """Announce the watch-page download for *video_id*."""
    self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1499 def _real_initialize(self):
1500 # Retrieve disclaimer
1501 request = urllib2.Request(self._DISCLAIMER)
1503 self.report_disclaimer()
1504 disclaimer = urllib2.urlopen(request).read()
1505 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1506 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1512 'submit': "Continue - I'm over 18",
1514 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1516 self.report_age_confirmation()
1517 disclaimer = urllib2.urlopen(request).read()
1518 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1519 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1522 def _real_extract(self, url):
1523 # Extract id and simplified title from URL
1524 mobj = re.match(self._VALID_URL, url)
1526 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1529 video_id = mobj.group(1)
1531 # Check if video comes from YouTube
1532 mobj2 = re.match(r'^yt-(.*)$', video_id)
1533 if mobj2 is not None:
1534 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1537 # At this point we have a new video
1538 self._downloader.increment_downloads()
1540 simple_title = mobj.group(2).decode('utf-8')
1542 # Retrieve video webpage to extract further information
1543 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1545 self.report_download_webpage(video_id)
1546 webpage = urllib2.urlopen(request).read()
1547 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1548 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1551 # Extract URL, uploader and title from webpage
1552 self.report_extraction(video_id)
1553 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1554 if mobj is not None:
1555 mediaURL = urllib.unquote(mobj.group(1))
1556 video_extension = mediaURL[-3:]
1558 # Extract gdaKey if available
1559 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1561 video_url = mediaURL
1563 gdaKey = mobj.group(1)
1564 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1566 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1568 self._downloader.trouble(u'ERROR: unable to extract media URL')
1570 vardict = parse_qs(mobj.group(1))
1571 if 'mediaData' not in vardict:
1572 self._downloader.trouble(u'ERROR: unable to extract media URL')
1574 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1576 self._downloader.trouble(u'ERROR: unable to extract media URL')
1578 mediaURL = mobj.group(1).replace('\\/', '/')
1579 video_extension = mediaURL[-3:]
1580 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1582 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1584 self._downloader.trouble(u'ERROR: unable to extract title')
1586 video_title = mobj.group(1).decode('utf-8')
1587 video_title = sanitize_title(video_title)
1589 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1591 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1593 video_uploader = mobj.group(1)
1596 # Process video information
1597 self._downloader.process_info({
1598 'id': video_id.decode('utf-8'),
1599 'url': video_url.decode('utf-8'),
1600 'uploader': video_uploader.decode('utf-8'),
1601 'upload_date': u'NA',
1602 'title': video_title,
1603 'stitle': simple_title,
1604 'ext': video_extension.decode('utf-8'),
1608 except UnavailableVideoError:
1609 self._downloader.trouble(u'\nERROR: unable to download video')
1612 class DailymotionIE(InfoExtractor):
1613 """Information Extractor for Dailymotion"""
1615 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1616 IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    # Plain pass-through to the base-class constructor.
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1629 def _real_extract(self, url):
1630 # Extract id and simplified title from URL
1631 mobj = re.match(self._VALID_URL, url)
1633 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1636 # At this point we have a new video
1637 self._downloader.increment_downloads()
1638 video_id = mobj.group(1)
1640 video_extension = 'flv'
1642 # Retrieve video webpage to extract further information
1643 request = urllib2.Request(url)
1644 request.add_header('Cookie', 'family_filter=off')
1646 self.report_download_webpage(video_id)
1647 webpage = urllib2.urlopen(request).read()
1648 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1649 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1652 # Extract URL, uploader and title from webpage
1653 self.report_extraction(video_id)
1654 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1656 self._downloader.trouble(u'ERROR: unable to extract media URL')
1658 sequence = urllib.unquote(mobj.group(1))
1659 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1661 self._downloader.trouble(u'ERROR: unable to extract media URL')
1663 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1665 # if needed add http://www.dailymotion.com/ if relative URL
1667 video_url = mediaURL
1669 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1671 self._downloader.trouble(u'ERROR: unable to extract title')
1673 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1674 video_title = sanitize_title(video_title)
1675 simple_title = _simplify_title(video_title)
1677 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1679 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1681 video_uploader = mobj.group(1)
1684 # Process video information
1685 self._downloader.process_info({
1686 'id': video_id.decode('utf-8'),
1687 'url': video_url.decode('utf-8'),
1688 'uploader': video_uploader.decode('utf-8'),
1689 'upload_date': u'NA',
1690 'title': video_title,
1691 'stitle': simple_title,
1692 'ext': video_extension.decode('utf-8'),
1696 except UnavailableVideoError:
1697 self._downloader.trouble(u'\nERROR: unable to download video')
1700 class GoogleIE(InfoExtractor):
1701 """Information extractor for video.google.com."""
1703 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1704 IE_NAME = u'video.google'
def __init__(self, downloader=None):
    # Plain pass-through to the base-class constructor.
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1717 def _real_extract(self, url):
1718 # Extract id from URL
1719 mobj = re.match(self._VALID_URL, url)
1721 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1724 # At this point we have a new video
1725 self._downloader.increment_downloads()
1726 video_id = mobj.group(1)
1728 video_extension = 'mp4'
1730 # Retrieve video webpage to extract further information
1731 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1733 self.report_download_webpage(video_id)
1734 webpage = urllib2.urlopen(request).read()
1735 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1736 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1739 # Extract URL, uploader, and title from webpage
1740 self.report_extraction(video_id)
1741 mobj = re.search(r"download_url:'([^']+)'", webpage)
1743 video_extension = 'flv'
1744 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1746 self._downloader.trouble(u'ERROR: unable to extract media URL')
1748 mediaURL = urllib.unquote(mobj.group(1))
1749 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1750 mediaURL = mediaURL.replace('\\x26', '\x26')
1752 video_url = mediaURL
1754 mobj = re.search(r'<title>(.*)</title>', webpage)
1756 self._downloader.trouble(u'ERROR: unable to extract title')
1758 video_title = mobj.group(1).decode('utf-8')
1759 video_title = sanitize_title(video_title)
1760 simple_title = _simplify_title(video_title)
1762 # Extract video description
1763 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1765 self._downloader.trouble(u'ERROR: unable to extract video description')
1767 video_description = mobj.group(1).decode('utf-8')
1768 if not video_description:
1769 video_description = 'No description available.'
1771 # Extract video thumbnail
1772 if self._downloader.params.get('forcethumbnail', False):
1773 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1775 webpage = urllib2.urlopen(request).read()
1776 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1777 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1779 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1781 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1783 video_thumbnail = mobj.group(1)
1784 else: # we need something to pass to process_info
1785 video_thumbnail = ''
1788 # Process video information
1789 self._downloader.process_info({
1790 'id': video_id.decode('utf-8'),
1791 'url': video_url.decode('utf-8'),
1793 'upload_date': u'NA',
1794 'title': video_title,
1795 'stitle': simple_title,
1796 'ext': video_extension.decode('utf-8'),
1800 except UnavailableVideoError:
1801 self._downloader.trouble(u'\nERROR: unable to download video')
1804 class PhotobucketIE(InfoExtractor):
1805 """Information extractor for photobucket.com."""
1807 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1808 IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    # Plain pass-through to the base-class constructor.
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1821 def _real_extract(self, url):
1822 # Extract id from URL
1823 mobj = re.match(self._VALID_URL, url)
1825 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1828 # At this point we have a new video
1829 self._downloader.increment_downloads()
1830 video_id = mobj.group(1)
1832 video_extension = 'flv'
1834 # Retrieve video webpage to extract further information
1835 request = urllib2.Request(url)
1837 self.report_download_webpage(video_id)
1838 webpage = urllib2.urlopen(request).read()
1839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1840 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1843 # Extract URL, uploader, and title from webpage
1844 self.report_extraction(video_id)
1845 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1847 self._downloader.trouble(u'ERROR: unable to extract media URL')
1849 mediaURL = urllib.unquote(mobj.group(1))
1851 video_url = mediaURL
1853 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1855 self._downloader.trouble(u'ERROR: unable to extract title')
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
# BUG FIX: was `_simplify_title(vide_title)` — a misspelled name that would
# raise NameError at runtime instead of producing the simplified title.
simple_title = _simplify_title(video_title)
1861 video_uploader = mobj.group(2).decode('utf-8')
1864 # Process video information
1865 self._downloader.process_info({
1866 'id': video_id.decode('utf-8'),
1867 'url': video_url.decode('utf-8'),
1868 'uploader': video_uploader,
1869 'upload_date': u'NA',
1870 'title': video_title,
1871 'stitle': simple_title,
1872 'ext': video_extension.decode('utf-8'),
1876 except UnavailableVideoError:
1877 self._downloader.trouble(u'\nERROR: unable to download video')
1880 class YahooIE(InfoExtractor):
1881 """Information extractor for video.yahoo.com."""
1883 # _VALID_URL matches all Yahoo! Video URLs
1884 # _VPAGE_URL matches only the extractable '/watch/' URLs
1885 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1886 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1887 IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    # Plain pass-through to the base-class constructor.
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1900 def _real_extract(self, url, new_video=True):
1901 # Extract ID from URL
1902 mobj = re.match(self._VALID_URL, url)
1904 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1907 # At this point we have a new video
1908 self._downloader.increment_downloads()
1909 video_id = mobj.group(2)
1910 video_extension = 'flv'
1912 # Rewrite valid but non-extractable URLs as
1913 # extractable English language /watch/ URLs
1914 if re.match(self._VPAGE_URL, url) is None:
1915 request = urllib2.Request(url)
1917 webpage = urllib2.urlopen(request).read()
1918 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1919 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1922 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1924 self._downloader.trouble(u'ERROR: Unable to extract id field')
1926 yahoo_id = mobj.group(1)
1928 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1930 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1932 yahoo_vid = mobj.group(1)
1934 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1935 return self._real_extract(url, new_video=False)
1937 # Retrieve video webpage to extract further information
1938 request = urllib2.Request(url)
1940 self.report_download_webpage(video_id)
1941 webpage = urllib2.urlopen(request).read()
1942 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1943 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1946 # Extract uploader and title from webpage
1947 self.report_extraction(video_id)
1948 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1950 self._downloader.trouble(u'ERROR: unable to extract video title')
1952 video_title = mobj.group(1).decode('utf-8')
1953 simple_title = _simplify_title(video_title)
mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
self._downloader.trouble(u'ERROR: unable to extract video uploader')
# BUG FIX: group(1) captures the literal "people"/"profile" URL path
# segment; the uploader's display name is the second capture group.
video_uploader = mobj.group(2).decode('utf-8')
1961 # Extract video thumbnail
1962 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1964 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1966 video_thumbnail = mobj.group(1).decode('utf-8')
1968 # Extract video description
1969 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1971 self._downloader.trouble(u'ERROR: unable to extract video description')
1973 video_description = mobj.group(1).decode('utf-8')
1974 if not video_description:
1975 video_description = 'No description available.'
1977 # Extract video height and width
1978 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1980 self._downloader.trouble(u'ERROR: unable to extract video height')
1982 yv_video_height = mobj.group(1)
1984 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1986 self._downloader.trouble(u'ERROR: unable to extract video width')
1988 yv_video_width = mobj.group(1)
1990 # Retrieve video playlist to extract media URL
1991 # I'm not completely sure what all these options are, but we
1992 # seem to need most of them, otherwise the server sends a 401.
1993 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1994 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1995 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1996 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1997 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1999 self.report_download_webpage(video_id)
2000 webpage = urllib2.urlopen(request).read()
2001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2002 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2005 # Extract media URL from playlist XML
2006 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2008 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2010 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2011 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2014 # Process video information
2015 self._downloader.process_info({
2016 'id': video_id.decode('utf-8'),
2018 'uploader': video_uploader,
2019 'upload_date': u'NA',
2020 'title': video_title,
2021 'stitle': simple_title,
2022 'ext': video_extension.decode('utf-8'),
2023 'thumbnail': video_thumbnail.decode('utf-8'),
2024 'description': video_description,
2025 'thumbnail': video_thumbnail,
2028 except UnavailableVideoError:
2029 self._downloader.trouble(u'\nERROR: unable to download video')
2032 class VimeoIE(InfoExtractor):
2033 """Information extractor for vimeo.com."""
2035 # _VALID_URL matches Vimeo URLs
2036 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
def __init__(self, downloader=None):
    # Plain pass-through to the base-class constructor.
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

def report_extraction(self, video_id):
    """Report information extraction."""
    self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2050 def _real_extract(self, url, new_video=True):
2051 # Extract ID from URL
2052 mobj = re.match(self._VALID_URL, url)
2054 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2057 # At this point we have a new video
2058 self._downloader.increment_downloads()
2059 video_id = mobj.group(1)
2061 # Retrieve video webpage to extract further information
2062 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2064 self.report_download_webpage(video_id)
2065 webpage = urllib2.urlopen(request).read()
2066 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2067 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2070 # Now we begin extracting as much information as we can from what we
2071 # retrieved. First we extract the information common to all extractors,
2072 # and latter we extract those that are Vimeo specific.
2073 self.report_extraction(video_id)
2076 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2078 self._downloader.trouble(u'ERROR: unable to extract video title')
2080 video_title = mobj.group(1).decode('utf-8')
2081 simple_title = _simplify_title(video_title)
2084 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2086 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2088 video_uploader = mobj.group(1).decode('utf-8')
2090 # Extract video thumbnail
2091 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2093 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2095 video_thumbnail = mobj.group(1).decode('utf-8')
2097 # # Extract video description
2098 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2100 # self._downloader.trouble(u'ERROR: unable to extract video description')
2102 # video_description = mobj.group(1).decode('utf-8')
2103 # if not video_description: video_description = 'No description available.'
2104 video_description = 'Foo.'
2106 # Vimeo specific: extract request signature
2107 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2109 self._downloader.trouble(u'ERROR: unable to extract request signature')
2111 sig = mobj.group(1).decode('utf-8')
2113 # Vimeo specific: extract video quality information
2114 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2116 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2118 quality = mobj.group(1).decode('utf-8')
2120 if int(quality) == 1:
2125 # Vimeo specific: Extract request signature expiration
2126 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2128 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2130 sig_exp = mobj.group(1).decode('utf-8')
2132 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2135 # Process video information
2136 self._downloader.process_info({
2137 'id': video_id.decode('utf-8'),
2139 'uploader': video_uploader,
2140 'upload_date': u'NA',
2141 'title': video_title,
2142 'stitle': simple_title,
2144 'thumbnail': video_thumbnail.decode('utf-8'),
2145 'description': video_description,
2146 'thumbnail': video_thumbnail,
2147 'description': video_description,
2150 except UnavailableVideoError:
2151 self._downloader.trouble(u'ERROR: unable to download video')
2154 class GenericIE(InfoExtractor):
2155 """Generic last-resort information extractor."""
2158 IE_NAME = u'generic'
2160 def __init__(self, downloader=None):
2161 InfoExtractor.__init__(self, downloader)
2163 def report_download_webpage(self, video_id):
2164 """Report webpage download."""
2165 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2166 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2168 def report_extraction(self, video_id):
2169 """Report information extraction."""
2170 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2172 def _real_extract(self, url):
2173 # At this point we have a new video
2174 self._downloader.increment_downloads()
2176 video_id = url.split('/')[-1]
2177 request = urllib2.Request(url)
2179 self.report_download_webpage(video_id)
2180 webpage = urllib2.urlopen(request).read()
2181 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2182 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2184 except ValueError, err:
2185 # since this is the last-resort InfoExtractor, if
2186 # this error is thrown, it'll be thrown here
2187 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2190 self.report_extraction(video_id)
2191 # Start with something easy: JW Player in SWFObject
2192 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2194 # Broaden the search a little bit
2195 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2197 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2200 # It's possible that one of the regexes
2201 # matched, but returned an empty group:
2202 if mobj.group(1) is None:
2203 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2206 video_url = urllib.unquote(mobj.group(1))
2207 video_id = os.path.basename(video_url)
2209 # here's a fun little line of code for you:
2210 video_extension = os.path.splitext(video_id)[1][1:]
2211 video_id = os.path.splitext(video_id)[0]
2213 # it's tempting to parse this further, but you would
2214 # have to take into account all the variations like
2215 # Video Title - Site Name
2216 # Site Name | Video Title
2217 # Video Title - Tagline | Site Name
2218 # and so on and so forth; it's just not practical
2219 mobj = re.search(r'<title>(.*)</title>', webpage)
2221 self._downloader.trouble(u'ERROR: unable to extract title')
2223 video_title = mobj.group(1).decode('utf-8')
2224 video_title = sanitize_title(video_title)
2225 simple_title = _simplify_title(video_title)
2227 # video uploader is domain name
2228 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2230 self._downloader.trouble(u'ERROR: unable to extract title')
2232 video_uploader = mobj.group(1).decode('utf-8')
2235 # Process video information
2236 self._downloader.process_info({
2237 'id': video_id.decode('utf-8'),
2238 'url': video_url.decode('utf-8'),
2239 'uploader': video_uploader,
2240 'upload_date': u'NA',
2241 'title': video_title,
2242 'stitle': simple_title,
2243 'ext': video_extension.decode('utf-8'),
2247 except UnavailableVideoError, err:
2248 self._downloader.trouble(u'\nERROR: unable to download video')
2251 class YoutubeSearchIE(InfoExtractor):
2252 """Information Extractor for YouTube search queries."""
# Accepts "ytsearch:Q" (first hit), "ytsearchN:Q" (N hits) or "ytsearchall:Q".
2253 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2254 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2255 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2256 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2258 _max_youtube_results = 1000
2259 IE_NAME = u'youtube:search'
2261 def __init__(self, youtube_ie, downloader=None):
# Delegates the per-video extraction to an injected YoutubeIE instance.
2262 InfoExtractor.__init__(self, downloader)
2263 self._youtube_ie = youtube_ie
2265 def report_download_page(self, query, pagenum):
2266 """Report attempt to download playlist page with given number."""
2267 query = query.decode(preferredencoding())
2268 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2270 def _real_initialize(self):
2271 self._youtube_ie.initialize()
2273 def _real_extract(self, query):
# NOTE(review): listing is elided -- `if mobj is None:`/`return`/`try:`
# lines implied by the numbering gaps are missing here.
2274 mobj = re.match(self._VALID_URL, query)
2276 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2279 prefix, query = query.split(':')
2281 query = query.encode('utf-8')
2283 self._download_n_results(query, 1)
2285 elif prefix == 'all':
2286 self._download_n_results(query, self._max_youtube_results)
# Numeric prefix: parse it as the result count; clamp to the service max.
2292 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2294 elif n > self._max_youtube_results:
2295 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2296 n = self._max_youtube_results
2297 self._download_n_results(query, n)
2299 except ValueError: # parsing prefix as integer fails
2300 self._download_n_results(query, 1)
2303 def _download_n_results(self, query, n):
2304 """Downloads a specified number of results for a query"""
# De-duplicates ids across result pages; stops at n hits or last page.
2307 already_seen = set()
2311 self.report_download_page(query, pagenum)
2312 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2313 request = urllib2.Request(result_url)
2315 page = urllib2.urlopen(request).read()
2316 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2317 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2320 # Extract video identifiers
2321 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Pull the 11-char id out of the matched href="/watch?v=..." attribute.
2322 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2323 if video_id not in already_seen:
2324 video_ids.append(video_id)
2325 already_seen.add(video_id)
2326 if len(video_ids) == n:
2327 # Specified n videos reached
2328 for id in video_ids:
2329 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: flush whatever was collected and stop paging.
2332 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2333 for id in video_ids:
2334 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2337 pagenum = pagenum + 1
2340 class GoogleSearchIE(InfoExtractor):
2341 """Information Extractor for Google Video search queries."""
# Accepts "gvsearch:Q", "gvsearchN:Q" or "gvsearchall:Q"; mirrors
# YoutubeSearchIE with Google Video endpoints.
2342 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2343 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2344 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2345 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2347 _max_google_results = 1000
2348 IE_NAME = u'video.google:search'
2350 def __init__(self, google_ie, downloader=None):
# Delegates per-video extraction to an injected GoogleIE instance.
2351 InfoExtractor.__init__(self, downloader)
2352 self._google_ie = google_ie
2354 def report_download_page(self, query, pagenum):
2355 """Report attempt to download playlist page with given number."""
2356 query = query.decode(preferredencoding())
2357 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2359 def _real_initialize(self):
2360 self._google_ie.initialize()
2362 def _real_extract(self, query):
# NOTE(review): listing is elided -- guards/`return`/`try:` lines implied
# by the numbering gaps are missing here.
2363 mobj = re.match(self._VALID_URL, query)
2365 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2368 prefix, query = query.split(':')
2370 query = query.encode('utf-8')
2372 self._download_n_results(query, 1)
2374 elif prefix == 'all':
2375 self._download_n_results(query, self._max_google_results)
2381 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2383 elif n > self._max_google_results:
2384 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2385 n = self._max_google_results
2386 self._download_n_results(query, n)
2388 except ValueError: # parsing prefix as integer fails
2389 self._download_n_results(query, 1)
2392 def _download_n_results(self, query, n):
2393 """Downloads a specified number of results for a query"""
2396 already_seen = set()
2400 self.report_download_page(query, pagenum)
# NOTE(review): the template's `start=%s` is fed the page number directly,
# not a result offset (pagenum * page size) -- verify against the full
# source whether paging here ever advanced past the first results.
2401 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2402 request = urllib2.Request(result_url)
2404 page = urllib2.urlopen(request).read()
2405 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2406 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2409 # Extract video identifiers
2410 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2411 video_id = mobj.group(1)
2412 if video_id not in already_seen:
2413 video_ids.append(video_id)
2414 already_seen.add(video_id)
2415 if len(video_ids) == n:
2416 # Specified n videos reached
2417 for id in video_ids:
2418 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link: flush collected ids and stop paging.
2421 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2422 for id in video_ids:
2423 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2426 pagenum = pagenum + 1
2429 class YahooSearchIE(InfoExtractor):
2430 """Information Extractor for Yahoo! Video search queries."""
# Accepts "yvsearch:Q", "yvsearchN:Q" or "yvsearchall:Q"; mirrors
# YoutubeSearchIE with Yahoo! Video endpoints.
2431 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2432 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2433 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2434 _MORE_PAGES_INDICATOR = r'\s*Next'
2436 _max_yahoo_results = 1000
2437 IE_NAME = u'video.yahoo:search'
2439 def __init__(self, yahoo_ie, downloader=None):
# Delegates per-video extraction to an injected YahooIE instance.
2440 InfoExtractor.__init__(self, downloader)
2441 self._yahoo_ie = yahoo_ie
2443 def report_download_page(self, query, pagenum):
2444 """Report attempt to download playlist page with given number."""
2445 query = query.decode(preferredencoding())
2446 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2448 def _real_initialize(self):
2449 self._yahoo_ie.initialize()
2451 def _real_extract(self, query):
# NOTE(review): listing is elided -- guards/`return`/`try:` lines implied
# by the numbering gaps are missing here.
2452 mobj = re.match(self._VALID_URL, query)
2454 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2457 prefix, query = query.split(':')
2459 query = query.encode('utf-8')
2461 self._download_n_results(query, 1)
2463 elif prefix == 'all':
2464 self._download_n_results(query, self._max_yahoo_results)
2470 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2472 elif n > self._max_yahoo_results:
2473 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2474 n = self._max_yahoo_results
2475 self._download_n_results(query, n)
2477 except ValueError: # parsing prefix as integer fails
2478 self._download_n_results(query, 1)
2481 def _download_n_results(self, query, n):
2482 """Downloads a specified number of results for a query"""
2485 already_seen = set()
2489 self.report_download_page(query, pagenum)
2490 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2491 request = urllib2.Request(result_url)
2493 page = urllib2.urlopen(request).read()
2494 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2495 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2498 # Extract video identifiers
2499 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2500 video_id = mobj.group(1)
2501 if video_id not in already_seen:
2502 video_ids.append(video_id)
2503 already_seen.add(video_id)
2504 if len(video_ids) == n:
2505 # Specified n videos reached
2506 for id in video_ids:
2507 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link: flush collected ids and stop paging.
2510 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2511 for id in video_ids:
2512 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2515 pagenum = pagenum + 1
2518 class YoutubePlaylistIE(InfoExtractor):
2519 """Information Extractor for YouTube playlists."""
2521 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2522 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2523 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2524 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2526 IE_NAME = u'youtube:playlist'
2528 def __init__(self, youtube_ie, downloader=None):
2529 InfoExtractor.__init__(self, downloader)
2530 self._youtube_ie = youtube_ie
2532 def report_download_page(self, playlist_id, pagenum):
2533 """Report attempt to download playlist page with given number."""
2534 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2536 def _real_initialize(self):
2537 self._youtube_ie.initialize()
2539 def _real_extract(self, url):
2540 # Extract playlist id
2541 mobj = re.match(self._VALID_URL, url)
2543 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2547 if mobj.group(3) is not None:
2548 self._youtube_ie.extract(mobj.group(3))
2551 # Download playlist pages
2552 # prefix is 'p' as default for playlists but there are other types that need extra care
2553 playlist_prefix = mobj.group(1)
2554 if playlist_prefix == 'a':
2555 playlist_access = 'artist'
2557 playlist_prefix = 'p'
2558 playlist_access = 'view_play_list'
2559 playlist_id = mobj.group(2)
2564 self.report_download_page(playlist_id, pagenum)
2565 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2566 request = urllib2.Request(url)
2568 page = urllib2.urlopen(request).read()
2569 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2570 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2573 # Extract video identifiers
2575 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2576 if mobj.group(1) not in ids_in_page:
2577 ids_in_page.append(mobj.group(1))
2578 video_ids.extend(ids_in_page)
2580 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2582 pagenum = pagenum + 1
2584 playliststart = self._downloader.params.get('playliststart', 1) - 1
2585 playlistend = self._downloader.params.get('playlistend', -1)
2586 video_ids = video_ids[playliststart:playlistend]
2588 for id in video_ids:
2589 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2593 class YoutubeUserIE(InfoExtractor):
2594 """Information Extractor for YouTube users."""
# Accepts user-page URLs or the "ytuser:NAME" shorthand; group(1) = username.
2596 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2597 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2598 _GDATA_PAGE_SIZE = 50
2599 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2600 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2602 IE_NAME = u'youtube:user'
2604 def __init__(self, youtube_ie, downloader=None):
# Delegates per-video extraction to an injected YoutubeIE instance.
2605 InfoExtractor.__init__(self, downloader)
2606 self._youtube_ie = youtube_ie
2608 def report_download_page(self, username, start_index):
2609 """Report attempt to download user page."""
2610 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2611 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2613 def _real_initialize(self):
2614 self._youtube_ie.initialize()
2616 def _real_extract(self, url):
# NOTE(review): listing is elided -- guards/`return`/`try:` lines implied
# by the numbering gaps are missing here.
2618 mobj = re.match(self._VALID_URL, url)
2620 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2623 username = mobj.group(1)
2625 # Download video ids using YouTube Data API. Result size per
2626 # query is limited (currently to 50 videos) so we need to query
2627 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2634 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2635 self.report_download_page(username, start_index)
2637 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2640 page = urllib2.urlopen(request).read()
2641 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2642 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2645 # Extract video identifiers
2648 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2649 if mobj.group(1) not in ids_in_page:
2650 ids_in_page.append(mobj.group(1))
2652 video_ids.extend(ids_in_page)
2654 # A little optimization - if current page is not
2655 # "full", ie. does not contain PAGE_SIZE video ids then
2656 # we can assume that this page is the last one - there
2657 # are no more ids on further pages - no need to query
2660 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2665 all_ids_count = len(video_ids)
2666 playliststart = self._downloader.params.get('playliststart', 1) - 1
2667 playlistend = self._downloader.params.get('playlistend', -1)
# -1 means "no upper bound": use an open-ended slice so the last upload
# is not dropped by a [start:-1] slice.
2669 if playlistend == -1:
2670 video_ids = video_ids[playliststart:]
2672 video_ids = video_ids[playliststart:playlistend]
2674 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2675 (username, all_ids_count, len(video_ids)))
2677 for video_id in video_ids:
2678 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2681 class DepositFilesIE(InfoExtractor):
2682 """Information extractor for depositfiles.com"""
2684 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2685 IE_NAME = u'DepositFiles'
2687 def __init__(self, downloader=None):
2688 InfoExtractor.__init__(self, downloader)
2690 def report_download_webpage(self, file_id):
2691 """Report webpage download."""
2692 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2694 def report_extraction(self, file_id):
2695 """Report information extraction."""
2696 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2698 def _real_extract(self, url):
2699 # At this point we have a new file
2700 self._downloader.increment_downloads()
2702 file_id = url.split('/')[-1]
2703 # Rebuild url in english locale
2704 url = 'http://depositfiles.com/en/files/' + file_id
2706 # Retrieve file webpage with 'Free download' button pressed
2707 free_download_indication = { 'gateway_result' : '1' }
2708 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2710 self.report_download_webpage(file_id)
2711 webpage = urllib2.urlopen(request).read()
2712 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2713 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2716 # Search for the real file URL
2717 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2718 if (mobj is None) or (mobj.group(1) is None):
2719 # Try to figure out reason of the error.
2720 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2721 if (mobj is not None) and (mobj.group(1) is not None):
2722 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2723 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2725 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2728 file_url = mobj.group(1)
2729 file_extension = os.path.splitext(file_url)[1][1:]
2731 # Search for file title
2732 mobj = re.search(r'<b title="(.*?)">', webpage)
2734 self._downloader.trouble(u'ERROR: unable to extract title')
2736 file_title = mobj.group(1).decode('utf-8')
2739 # Process file information
2740 self._downloader.process_info({
2741 'id': file_id.decode('utf-8'),
2742 'url': file_url.decode('utf-8'),
2744 'upload_date': u'NA',
2745 'title': file_title,
2746 'stitle': file_title,
2747 'ext': file_extension.decode('utf-8'),
2751 except UnavailableVideoError, err:
2752 self._downloader.trouble(u'ERROR: unable to download file')
2755 class FacebookIE(InfoExtractor):
2756 """Information Extractor for Facebook"""
# Requires login credentials (CLI options or .netrc) before extraction.
2758 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2759 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2760 _NETRC_MACHINE = 'facebook'
# Format names ordered best-first; used for quality selection below.
2761 _available_formats = ['video', 'highqual', 'lowqual']
2762 _video_extensions = {
2767 IE_NAME = u'facebook'
2769 def __init__(self, downloader=None):
2770 InfoExtractor.__init__(self, downloader)
2772 def _reporter(self, message):
2773 """Add header and report message."""
2774 self._downloader.to_screen(u'[facebook] %s' % message)
2776 def report_login(self):
2777 """Report attempt to log in."""
2778 self._reporter(u'Logging in')
2780 def report_video_webpage_download(self, video_id):
2781 """Report attempt to download video webpage."""
2782 self._reporter(u'%s: Downloading video webpage' % video_id)
2784 def report_information_extraction(self, video_id):
2785 """Report attempt to extract video information."""
2786 self._reporter(u'%s: Extracting video information' % video_id)
2788 def _parse_page(self, video_webpage):
2789 """Extract video information from page"""
# Map of metadata field -> regex over the page's embedded JS/HTML.
2791 data = {'title': r'\("video_title", "(.*?)"\)',
2792 'description': r'<div class="datawrap">(.*?)</div>',
2793 'owner': r'\("video_owner_name", "(.*?)"\)',
2794 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2797 for piece in data.keys():
2798 mobj = re.search(data[piece], video_webpage)
2799 if mobj is not None:
# Values are JS-escaped inside the page; unescape then URL-unquote.
2800 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2804 for fmt in self._available_formats:
2805 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2806 if mobj is not None:
2807 # URL is in a Javascript segment inside an escaped Unicode format within
2808 # the generally utf-8 page
2809 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2810 video_info['video_urls'] = video_urls
2814 def _real_initialize(self):
# NOTE(review): listing is elided -- guards/`return`/`try:` lines implied
# by the numbering gaps are missing throughout this method.
2815 if self._downloader is None:
2820 downloader_params = self._downloader.params
2822 # Attempt to use provided username and password or .netrc data
2823 if downloader_params.get('username', None) is not None:
2824 useremail = downloader_params['username']
2825 password = downloader_params['password']
2826 elif downloader_params.get('usenetrc', False):
2828 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2829 if info is not None:
2833 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2834 except (IOError, netrc.NetrcParseError), err:
2835 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2838 if useremail is None:
2847 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2850 login_results = urllib2.urlopen(request).read()
# A login form still present in the response means the login failed.
2851 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2852 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2854 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2855 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2858 def _real_extract(self, url):
2859 mobj = re.match(self._VALID_URL, url)
2861 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2863 video_id = mobj.group('ID')
2866 self.report_video_webpage_download(video_id)
2867 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2869 page = urllib2.urlopen(request)
2870 video_webpage = page.read()
2871 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2872 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2875 # Start extracting information
2876 self.report_information_extraction(video_id)
2878 # Extract information
2879 video_info = self._parse_page(video_webpage)
2882 if 'owner' not in video_info:
2883 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2885 video_uploader = video_info['owner']
2888 if 'title' not in video_info:
2889 self._downloader.trouble(u'ERROR: unable to extract video title')
2891 video_title = video_info['title']
2892 video_title = video_title.decode('utf-8')
2893 video_title = sanitize_title(video_title)
2895 simple_title = _simplify_title(video_title)
# Thumbnail is optional: warn and fall back to an empty string.
2898 if 'thumbnail' not in video_info:
2899 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2900 video_thumbnail = ''
2902 video_thumbnail = video_info['thumbnail']
2906 if 'upload_date' in video_info:
2907 upload_time = video_info['upload_date']
# Parse RFC-2822 style dates into YYYYMMDD for process_info.
2908 timetuple = email.utils.parsedate_tz(upload_time)
2909 if timetuple is not None:
2911 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2916 video_description = video_info.get('description', 'No description available.')
2918 url_map = video_info['video_urls']
2919 if len(url_map.keys()) > 0:
2920 # Decide which formats to download
2921 req_format = self._downloader.params.get('format', None)
2922 format_limit = self._downloader.params.get('format_limit', None)
2924 if format_limit is not None and format_limit in self._available_formats:
2925 format_list = self._available_formats[self._available_formats.index(format_limit):]
2927 format_list = self._available_formats
2928 existing_formats = [x for x in format_list if x in url_map]
2929 if len(existing_formats) == 0:
2930 self._downloader.trouble(u'ERROR: no known formats available for video')
2932 if req_format is None:
2933 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2934 elif req_format == 'worst':
2935 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2936 elif req_format == '-1':
2937 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2940 if req_format not in url_map:
2941 self._downloader.trouble(u'ERROR: requested format not available')
2943 video_url_list = [(req_format, url_map[req_format])] # Specific format
2945 for format_param, video_real_url in video_url_list:
2947 # At this point we have a new video
2948 self._downloader.increment_downloads()
2951 video_extension = self._video_extensions.get(format_param, 'mp4')
2954 # Process video information
2955 self._downloader.process_info({
2956 'id': video_id.decode('utf-8'),
2957 'url': video_real_url.decode('utf-8'),
2958 'uploader': video_uploader.decode('utf-8'),
2959 'upload_date': upload_date,
2960 'title': video_title,
2961 'stitle': simple_title,
2962 'ext': video_extension.decode('utf-8'),
2963 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2964 'thumbnail': video_thumbnail.decode('utf-8'),
2965 'description': video_description.decode('utf-8'),
2968 except UnavailableVideoError, err:
2969 self._downloader.trouble(u'\nERROR: unable to download video')
2971 class BlipTVIE(InfoExtractor):
2972 """Information extractor for blip.tv"""
2974 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2975 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2976 IE_NAME = u'blip.tv'
2978 def report_extraction(self, file_id):
2979 """Report information extraction."""
2980 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2982 def report_direct_download(self, title):
2983 """Report information extraction."""
2984 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2986 def _real_extract(self, url):
# NOTE(review): listing is elided -- guards/`return`/`try:` lines implied
# by the numbering gaps are missing here.
2987 mobj = re.match(self._VALID_URL, url)
2989 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for the JSON description of the page instead of scraping HTML.
2996 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2997 request = urllib2.Request(json_url)
2998 self.report_extraction(mobj.group(1))
3001 urlh = urllib2.urlopen(request)
# Some blip.tv URLs serve the media file directly instead of JSON.
3002 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3003 basename = url.split('/')[-1]
3004 title,ext = os.path.splitext(basename)
3005 title = title.decode('UTF-8')
3006 ext = ext.replace('.', '')
3007 self.report_direct_download(title)
3012 'stitle': _simplify_title(title),
3016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3017 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3019 if info is None: # Regular URL
3021 json_code = urlh.read()
3022 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3023 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3027 json_data = json.loads(json_code)
3028 if 'Post' in json_data:
3029 data = json_data['Post']
# Convert blip.tv's "MM-DD-YY HH:MMam/pm" stamp to YYYYMMDD.
3033 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3034 video_url = data['media']['url']
3035 umobj = re.match(self._URL_EXT, video_url)
3037 raise ValueError('Can not determine filename extension')
3038 ext = umobj.group(1)
3041 'id': data['item_id'],
3043 'uploader': data['display_name'],
3044 'upload_date': upload_date,
3045 'title': data['title'],
3046 'stitle': _simplify_title(data['title']),
3048 'format': data['media']['mimeType'],
3049 'thumbnail': data['thumbnailUrl'],
3050 'description': data['description'],
3051 'player_url': data['embedUrl']
3053 except (ValueError,KeyError), err:
3054 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3057 self._downloader.increment_downloads()
3060 self._downloader.process_info(info)
3061 except UnavailableVideoError, err:
3062 self._downloader.trouble(u'\nERROR: unable to download video')
3065 class MyVideoIE(InfoExtractor):
3066 """Information Extractor for myvideo.de."""
3068 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3069 IE_NAME = u'myvideo'
3071 def __init__(self, downloader=None):
3072 InfoExtractor.__init__(self, downloader)
3074 def report_download_webpage(self, video_id):
3075 """Report webpage download."""
3076 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3078 def report_extraction(self, video_id):
3079 """Report information extraction."""
3080 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3082 def _real_extract(self,url):
3083 mobj = re.match(self._VALID_URL, url)
3085 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3088 video_id = mobj.group(1)
3091 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3093 self.report_download_webpage(video_id)
3094 webpage = urllib2.urlopen(request).read()
3095 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3096 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3099 self.report_extraction(video_id)
3100 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3103 self._downloader.trouble(u'ERROR: unable to extract media URL')
3105 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3107 mobj = re.search('<title>([^<]+)</title>', webpage)
3109 self._downloader.trouble(u'ERROR: unable to extract title')
3112 video_title = mobj.group(1)
3113 video_title = sanitize_title(video_title)
3115 simple_title = _simplify_title(video_title)
3118 self._downloader.process_info({
3122 'upload_date': u'NA',
3123 'title': video_title,
3124 'stitle': simple_title,
3129 except UnavailableVideoError:
3130 self._downloader.trouble(u'\nERROR: Unable to download video')
# Extractor for The Daily Show / Colbert Report full-episode pages.
# NOTE(review): this listing elides interleaved lines (the fused line numbers
# jump); comments below describe only the visible code.
3132 class ComedyCentralIE(InfoExtractor):
3133 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a shortname form (":tds", ":colbert", ...) or a
# full-episodes URL on thedailyshow.com / colbertnation.com.
3135 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3136 IE_NAME = u'comedycentral'
# Status-reporting helpers, one per pipeline stage.
3138 def report_extraction(self, episode_id):
3139 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3141 def report_config_download(self, episode_id):
3142 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3144 def report_index_download(self, episode_id):
3145 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3147 def report_player_url(self, episode_id):
3148 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3150 def _real_extract(self, url):
3151 mobj = re.match(self._VALID_URL, url)
3153 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortname form: rewrite to the show's full-episodes front page and rematch.
3156 if mobj.group('shortname'):
3157 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3158 url = u'http://www.thedailyshow.com/full-episodes/'
3160 url = u'http://www.colbertnation.com/full-episodes/'
3161 mobj = re.match(self._VALID_URL, url)
3162 assert mobj is not None
# No episode in the URL means "download newest" (front page redirects there).
3164 dlNewest = not mobj.group('episode')
3166 epTitle = mobj.group('showname')
3168 epTitle = mobj.group('episode')
3170 req = urllib2.Request(url)
3171 self.report_extraction(epTitle)
3173 htmlHandle = urllib2.urlopen(req)
3174 html = htmlHandle.read()
3175 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3176 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# After following redirects, re-validate the final URL and require a
# concrete episode component.
3179 url = htmlHandle.geturl()
3180 mobj = re.match(self._VALID_URL, url)
3182 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3184 if mobj.group('episode') == '':
3185 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3187 epTitle = mobj.group('episode')
# Locate the Flash player URL(s) embedded in the page; group 2 is the
# mtvnservices media URI used for the feed lookups below.
3189 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3190 if len(mMovieParams) == 0:
3191 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3194 playerUrl_raw = mMovieParams[0][0]
3195 self.report_player_url(epTitle)
# Resolve the player URL through its redirects.
3197 urlHandle = urllib2.urlopen(playerUrl_raw)
3198 playerUrl = urlHandle.geturl()
3199 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3200 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Download the MRSS show index for this media URI.
3203 uri = mMovieParams[0][1]
3204 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3205 self.report_index_download(epTitle)
3207 indexXml = urllib2.urlopen(indexUrl).read()
3208 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3209 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per video segment of the episode.
3212 idoc = xml.etree.ElementTree.fromstring(indexXml)
3213 itemEls = idoc.findall('.//item')
3214 for itemEl in itemEls:
3215 mediaId = itemEl.findall('./guid')[0].text
3216 shortMediaId = mediaId.split(':')[-1]
3217 showId = mediaId.split(':')[-2].replace('.com', '')
3218 officialTitle = itemEl.findall('./title')[0].text
3219 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment mediaGen config XML lists the actual stream renditions.
3221 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3222 urllib.urlencode({'uri': mediaId}))
3223 configReq = urllib2.Request(configUrl)
3224 self.report_config_download(epTitle)
3226 configXml = urllib2.urlopen(configReq).read()
3227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3228 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3231 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, src) pairs; turls is built/accumulated in elided lines.
3233 for rendition in cdoc.findall('.//rendition'):
3234 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3238 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3241 # For now, just pick the highest bitrate
3242 format,video_url = turls[-1]
3244 self._downloader.increment_downloads()
3246 effTitle = showId + u'-' + epTitle
# Info dict for this segment (remaining keys in elided lines).
3251 'upload_date': officialDate,
3253 'stitle': _simplify_title(effTitle),
3257 'description': officialTitle,
3258 'player_url': playerUrl
3262 self._downloader.process_info(info)
3263 except UnavailableVideoError, err:
3264 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Extractor for escapistmagazine.com video pages.
# NOTE(review): this listing elides interleaved lines (guards, try:, return);
# comments describe only the visible code.
3268 class EscapistIE(InfoExtractor):
3269 """Information extractor for The Escapist """
3271 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3272 IE_NAME = u'escapist'
3274 def report_extraction(self, showName):
3275 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3277 def report_config_download(self, showName):
3278 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3280 def _real_extract(self, url):
# HTMLParser instance used only for its unescape() helper below.
3281 htmlParser = HTMLParser.HTMLParser()
3283 mobj = re.match(self._VALID_URL, url)
3285 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3287 showName = mobj.group('showname')
3288 videoId = mobj.group('episode')
3290 self.report_extraction(showName)
3292 webPage = urllib2.urlopen(url).read()
3293 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3294 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape description, thumbnail and player URL from <meta> tags.
3297 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3298 description = htmlParser.unescape(descMatch.group(1))
3299 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3300 imgUrl = htmlParser.unescape(imgMatch.group(1))
3301 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3302 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries the (percent-encoded) config URL as a query param.
3303 configUrlMatch = re.search('config=(.*)$', playerUrl)
3304 configUrl = urllib2.unquote(configUrlMatch.group(1))
3306 self.report_config_download(showName)
3308 configJSON = urllib2.urlopen(configUrl).read()
3309 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3310 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3313 # Technically, it's JavaScript, not JSON
# Crude quote normalization so json.loads can parse the JS object literal;
# would break on legitimate apostrophes inside string values.
3314 configJSON = configJSON.replace("'", '"')
3317 config = json.loads(configJSON)
3318 except (ValueError,), err:
3319 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] holds the actual video entry — presumably [0] is an ad or
# intro; TODO confirm against a live config.
3322 playlist = config['playlist']
3323 videoUrl = playlist[1]['url']
3325 self._downloader.increment_downloads()
# Info dict (remaining keys in elided lines).
3329 'uploader': showName,
3330 'upload_date': None,
3332 'stitle': _simplify_title(showName),
3335 'thumbnail': imgUrl,
3336 'description': description,
3337 'player_url': playerUrl,
3341 self._downloader.process_info(info)
3342 except UnavailableVideoError, err:
3343 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# Extractor for collegehumor.com video pages.
# NOTE(review): this listing elides interleaved lines; comments describe only
# the visible code.
3346 class CollegeHumorIE(InfoExtractor):
3347 """Information extractor for collegehumor.com"""
3349 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3350 IE_NAME = u'collegehumor'
3352 def report_webpage(self, video_id):
3353 """Report information extraction."""
3354 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3356 def report_extraction(self, video_id):
3357 """Report information extraction."""
3358 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3360 def _real_extract(self, url):
# htmlParser is created but not used in the visible lines.
3361 htmlParser = HTMLParser.HTMLParser()
3363 mobj = re.match(self._VALID_URL, url)
3365 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3367 video_id = mobj.group('videoid')
3369 self.report_webpage(video_id)
3370 request = urllib2.Request(url)
3372 webpage = urllib2.urlopen(request).read()
3373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3374 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds an internal id ("video:NNN") distinct from the URL id;
# the metadata XML endpoint is keyed on the internal one.
3377 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3379 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3381 internal_video_id = m.group('internalvideoid')
# Start of the info dict (other keys in elided lines).
3385 'internal_id': internal_video_id,
3388 self.report_extraction(video_id)
# moogaloop serves per-video metadata XML.
3389 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3391 metaXml = urllib2.urlopen(xmlUrl).read()
3392 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3393 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3396 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Pull title/description/file/thumbnail out of the <video> node; an
# IndexError here falls into the "Invalid metadata XML" branch below.
3398 videoNode = mdoc.findall('./video')[0]
3399 info['description'] = videoNode.findall('./description')[0].text
3400 info['title'] = videoNode.findall('./caption')[0].text
3401 info['stitle'] = _simplify_title(info['title'])
3402 info['url'] = videoNode.findall('./file')[0].text
3403 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last dot of the file URL.
3404 info['ext'] = info['url'].rpartition('.')[2]
3405 info['format'] = info['ext']
3407 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3410 self._downloader.increment_downloads()
3413 self._downloader.process_info(info)
3414 except UnavailableVideoError, err:
3415 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for xvideos.com video pages.
# NOTE(review): this listing elides interleaved lines; comments describe only
# the visible code.
3418 class XVideosIE(InfoExtractor):
3419 """Information extractor for xvideos.com"""
3421 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3422 IE_NAME = u'xvideos'
3424 def report_webpage(self, video_id):
3425 """Report information extraction."""
3426 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3428 def report_extraction(self, video_id):
3429 """Report information extraction."""
3430 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3432 def _real_extract(self, url):
# htmlParser is created but not used in the visible lines.
3433 htmlParser = HTMLParser.HTMLParser()
3435 mobj = re.match(self._VALID_URL, url)
3437 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3439 video_id = mobj.group(1).decode('utf-8')
3441 self.report_webpage(video_id)
# Canonicalize the fetch URL from the numeric id.
3443 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3445 webpage = urllib2.urlopen(request).read()
3446 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3447 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3450 self.report_extraction(video_id)
# Media URL is the percent-encoded flv_url query parameter in the page.
3454 mobj = re.search(r'flv_url=(.+?)&', webpage)
3456 self._downloader.trouble(u'ERROR: unable to extract video url')
3458 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> tag up to the " - XVID" suffix.
3462 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3464 self._downloader.trouble(u'ERROR: unable to extract video title')
3466 video_title = mobj.group(1).decode('utf-8')
3469 # Extract video thumbnail
3470 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3472 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3474 video_thumbnail = mobj.group(1).decode('utf-8')
3478 self._downloader.increment_downloads()
# Info dict (remaining keys in elided lines).
3483 'upload_date': None,
3484 'title': video_title,
3485 'stitle': _simplify_title(video_title),
3488 'thumbnail': video_thumbnail,
3489 'description': None,
3494 self._downloader.process_info(info)
3495 except UnavailableVideoError, err:
3496 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# Extractor for soundcloud.com tracks.
# NOTE(review): this listing elides interleaved lines; comments describe only
# the visible code.
3499 class SoundcloudIE(InfoExtractor):
3500 """Information extractor for soundcloud.com
3501 To access the media, the uid of the song and a stream token
3502 must be extracted from the page source and the script must make
3503 a request to media.soundcloud.com/crossdomain.xml. Then
3504 the media can be grabbed by requesting from an url composed
3505 of the stream token and uid
# Groups: (1) uploader slug, (2) track slug.
3508 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3509 IE_NAME = u'soundcloud'
3511 def __init__(self, downloader=None):
3512 InfoExtractor.__init__(self, downloader)
3514 def report_webpage(self, video_id):
3515 """Report information extraction."""
3516 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3518 def report_extraction(self, video_id):
3519 """Report information extraction."""
3520 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3522 def _real_extract(self, url):
# htmlParser is created but not used in the visible lines.
3523 htmlParser = HTMLParser.HTMLParser()
3525 mobj = re.match(self._VALID_URL, url)
3527 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3530 # extract uploader (which is in the url)
3531 uploader = mobj.group(1).decode('utf-8')
3532 # extract simple title (uploader + slug of song title)
3533 slug_title = mobj.group(2).decode('utf-8')
3534 simple_title = uploader + '-' + slug_title
3536 self.report_webpage('%s/%s' % (uploader, slug_title))
3538 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3540 webpage = urllib2.urlopen(request).read()
3541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3542 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3545 self.report_extraction('%s/%s' % (uploader, slug_title))
3547 # extract uid and stream token that soundcloud hands out for access
3548 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3550 video_id = mobj.group(1)
3551 stream_token = mobj.group(2)
3553 # extract unsimplified title
# NOTE(review): `title` is captured here but the info dict below uses
# simple_title for the 'title' field — `title` appears unused in the visible
# lines; possibly consumed in elided lines, verify before relying on it.
3554 mobj = re.search('"title":"(.*?)",', webpage)
3556 title = mobj.group(1)
3558 # construct media url (with uid/token)
3559 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3560 mediaURL = mediaURL % (video_id, stream_token)
# Description is optional; keep a default when the markup is absent.
3563 description = u'No description available'
3564 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3566 description = mobj.group(1)
# Upload date parsed from the "pretty date" markup, e.g. "on March 1, 2012 12:00";
# parse failures are swallowed in the except branch below.
3570 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3573 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3574 except Exception, e:
3577 # for soundcloud, a request to a cross domain is required for cookies
3578 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3581 self._downloader.process_info({
3582 'id': video_id.decode('utf-8'),
3584 'uploader': uploader.decode('utf-8'),
3585 'upload_date': upload_date,
3586 'title': simple_title.decode('utf-8'),
3587 'stitle': simple_title.decode('utf-8'),
3591 'description': description.decode('utf-8')
3593 except UnavailableVideoError:
3594 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for infoq.com presentation pages (RTMPE streams).
# NOTE(review): this listing elides interleaved lines; comments describe only
# the visible code.
3597 class InfoQIE(InfoExtractor):
3598 """Information extractor for infoq.com"""
3600 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3603 def report_webpage(self, video_id):
3604 """Report information extraction."""
3605 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3607 def report_extraction(self, video_id):
3608 """Report information extraction."""
3609 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3611 def _real_extract(self, url):
# htmlParser is created but not used in the visible lines.
3612 htmlParser = HTMLParser.HTMLParser()
3614 mobj = re.match(self._VALID_URL, url)
3616 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3619 self.report_webpage(url)
3621 request = urllib2.Request(url)
3623 webpage = urllib2.urlopen(request).read()
3624 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3625 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3628 self.report_extraction(url)
# The stream path is base64-encoded in the jsclassref attribute; decode it
# and append to the fixed RTMPE server prefix.
3632 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3634 self._downloader.trouble(u'ERROR: unable to extract video url')
3636 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
# Title comes from an inline JS assignment, not the <title> tag.
3640 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3642 self._downloader.trouble(u'ERROR: unable to extract video title')
3644 video_title = mobj.group(1).decode('utf-8')
3646 # Extract description
3647 video_description = u'No description available.'
3648 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3649 if mobj is not None:
3650 video_description = mobj.group(1).decode('utf-8')
# Derive id + extension from the final path component of the stream URL;
# a filename with multiple dots would make this split raise.
3652 video_filename = video_url.split('/')[-1]
3653 video_id, extension = video_filename.split('.')
3655 self._downloader.increment_downloads()
# Info dict (remaining keys in elided lines).
3660 'upload_date': None,
3661 'title': video_title,
3662 'stitle': _simplify_title(video_title),
3664 'format': extension, # Extension is always(?) mp4, but seems to be flv
3666 'description': video_description,
3671 self._downloader.process_info(info)
3672 except UnavailableVideoError, err:
3673 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
# Extractor for mixcloud.com cloudcasts via their JSON API.
# NOTE(review): this listing elides interleaved lines; comments describe only
# the visible code.
3675 class MixcloudIE(InfoExtractor):
3676 """Information extractor for www.mixcloud.com"""
3677 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3678 IE_NAME = u'mixcloud'
3680 def __init__(self, downloader=None):
3681 InfoExtractor.__init__(self, downloader)
3683 def report_download_json(self, file_id):
3684 """Report JSON download."""
3685 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3687 def report_extraction(self, file_id):
3688 """Report information extraction."""
3689 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Return the url list for `fmt` at the requested bitrate; formats without
# per-bitrate sub-dicts raise TypeError on max() and fall through to the
# flat list.
3691 def get_urls(self, jsonData, fmt, bitrate='best'):
3692 """Get urls from 'audio_formats' section in json"""
3695 bitrate_list = jsonData[fmt]
3696 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3697 bitrate = max(bitrate_list) # select highest
3699 url_list = jsonData[fmt][bitrate]
3700 except TypeError: # we have no bitrate info.
3701 url_list = jsonData[fmt]
# Probe each candidate URL; first one that opens wins (return is in an
# elided line).
3705 def check_urls(self, url_list):
3706 """Returns 1st active url from list"""
3707 for url in url_list:
3709 urllib2.urlopen(url)
3711 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# Human-readable format table for --list-formats.
3716 def _print_formats(self, formats):
3717 print 'Available formats:'
3718 for fmt in formats.keys():
3719 for b in formats[fmt]:
3721 ext = formats[fmt][b][0]
3722 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3723 except TypeError: # we have no bitrate info
3724 ext = formats[fmt][0]
3725 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3728 def _real_extract(self, url):
3729 mobj = re.match(self._VALID_URL, url)
3731 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3733 # extract uploader & filename from url
3734 uploader = mobj.group(1).decode('utf-8')
3735 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3737 # construct API request
# API path reuses the last two URL components (uploader/slug).
3738 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3739 # retrieve .json file with links to files
3740 request = urllib2.Request(file_url)
3742 self.report_download_json(file_url)
3743 jsonData = urllib2.urlopen(request).read()
3744 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3745 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3749 json_data = json.loads(jsonData)
3750 player_url = json_data['player_swf_url']
3751 formats = dict(json_data['audio_formats'])
3753 req_format = self._downloader.params.get('format', None)
# --list-formats: print the table and stop (return in elided line).
3756 if self._downloader.params.get('listformats', None):
3757 self._print_formats(formats)
# No/best format requested: scan formats for the first working URL;
# otherwise honor the explicit request.
3760 if req_format is None or req_format == 'best':
3761 for format_param in formats.keys():
3762 url_list = self.get_urls(formats, format_param)
3764 file_url = self.check_urls(url_list)
3765 if file_url is not None:
3768 if req_format not in formats.keys():
3769 self._downloader.trouble(u'ERROR: format is not available')
3772 url_list = self.get_urls(formats, req_format)
3773 file_url = self.check_urls(url_list)
3774 format_param = req_format
3777 self._downloader.increment_downloads()
3779 # Process file information
3780 self._downloader.process_info({
3781 'id': file_id.decode('utf-8'),
3782 'url': file_url.decode('utf-8'),
3783 'uploader': uploader.decode('utf-8'),
3784 'upload_date': u'NA',
3785 'title': json_data['name'],
3786 'stitle': _simplify_title(json_data['name']),
3787 'ext': file_url.split('.')[-1].decode('utf-8'),
3788 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3789 'thumbnail': json_data['thumbnail_url'],
3790 'description': json_data['description'],
3791 'player_url': player_url.decode('utf-8'),
3793 except UnavailableVideoError, err:
3794 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for Stanford Open Classroom. Dispatches on the URL: a specific
# video, a course page (recurses into its videos), or the site root
# (recurses into all courses).
# NOTE(review): this listing elides interleaved lines; comments describe only
# the visible code.
3796 class StanfordOpenClassroomIE(InfoExtractor):
3797 """Information extractor for Stanford's Open ClassRoom"""
3799 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3800 IE_NAME = u'stanfordoc'
3802 def report_download_webpage(self, objid):
3803 """Report information extraction."""
3804 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3806 def report_extraction(self, video_id):
3807 """Report information extraction."""
3808 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3810 def _real_extract(self, url):
3811 mobj = re.match(self._VALID_URL, url)
3813 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Mode 1: course + video -> download that single lecture.
3816 if mobj.group('course') and mobj.group('video'): # A specific video
3817 course = mobj.group('course')
3818 video = mobj.group('video')
3820 'id': _simplify_title(course + '_' + video),
3823 self.report_extraction(info['id'])
# Per-video metadata XML lives next to the course's videos directory.
3824 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3825 xmlUrl = baseUrl + video + '.xml'
3827 metaXml = urllib2.urlopen(xmlUrl).read()
3828 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3829 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3831 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3833 info['title'] = mdoc.findall('./title')[0].text
3834 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3836 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3838 info['stitle'] = _simplify_title(info['title'])
3839 info['ext'] = info['url'].rpartition('.')[2]
3840 info['format'] = info['ext']
3841 self._downloader.increment_downloads()
3843 self._downloader.process_info(info)
3844 except UnavailableVideoError, err:
3845 self._downloader.trouble(u'\nERROR: unable to download video')
# Mode 2: course only -> scrape the course page and recurse into each
# VideoPage link via self.extract().
3846 elif mobj.group('course'): # A course page
3847 unescapeHTML = HTMLParser.HTMLParser().unescape
3849 course = mobj.group('course')
3851 'id': _simplify_title(course),
3855 self.report_download_webpage(info['id'])
3857 coursepage = urllib2.urlopen(url).read()
3858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3859 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
# Course title from <h1>, falling back to the id.
3862 m = re.search('<h1>([^<]+)</h1>', coursepage)
3864 info['title'] = unescapeHTML(m.group(1))
3866 info['title'] = info['id']
3867 info['stitle'] = _simplify_title(info['title'])
3869 m = re.search('<description>([^<]+)</description>', coursepage)
3871 info['description'] = unescapeHTML(m.group(1))
# Deduplicated, order-preserving list of lecture links.
3873 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3876 'type': 'reference',
3877 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3881 for entry in info['list']:
3882 assert entry['type'] == 'reference'
3883 self.extract(entry['url'])
# Mode 3: neither course nor video -> the site root; recurse into every
# CoursePage link.
3885 unescapeHTML = HTMLParser.HTMLParser().unescape
3888 'id': 'Stanford OpenClassroom',
3892 self.report_download_webpage(info['id'])
3893 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3895 rootpage = urllib2.urlopen(rootURL).read()
3896 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3897 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3900 info['title'] = info['id']
3901 info['stitle'] = _simplify_title(info['title'])
3903 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3906 'type': 'reference',
3907 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3911 for entry in info['list']:
3912 assert entry['type'] == 'reference'
3913 self.extract(entry['url'])
3915 class MTVIE(InfoExtractor):
3916 """Information extractor for MTV.com"""
3918 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3921 def report_webpage(self, video_id):
3922 """Report information extraction."""
3923 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3925 def report_extraction(self, video_id):
3926 """Report information extraction."""
3927 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3929 def _real_extract(self, url):
3930 mobj = re.match(self._VALID_URL, url)
3932 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3934 if not mobj.group('proto'):
3935 url = 'http://' + url
3936 video_id = mobj.group('videoid')
3937 self.report_webpage(video_id)
3939 request = urllib2.Request(url)
3941 webpage = urllib2.urlopen(request).read()
3942 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3943 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3946 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3948 self._downloader.trouble(u'ERROR: unable to extract song name')
3950 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3951 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3953 self._downloader.trouble(u'ERROR: unable to extract performer')
3955 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3956 video_title = performer + ' - ' + song_name
3958 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3960 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3962 mtvn_uri = mobj.group(1)
3964 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3966 self._downloader.trouble(u'ERROR: unable to extract content id')
3968 content_id = mobj.group(1)
3970 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3971 self.report_extraction(video_id)
3972 request = urllib2.Request(videogen_url)
3974 metadataXml = urllib2.urlopen(request).read()
3975 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3976 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3979 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3980 renditions = mdoc.findall('.//rendition')
3982 # For now, always pick the highest quality.
3983 rendition = renditions[-1]
3986 _,_,ext = rendition.attrib['type'].partition('/')
3987 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3988 video_url = rendition.find('./src').text
3990 self._downloader.trouble('Invalid rendition field.')
3993 self._downloader.increment_downloads()
3997 'uploader': performer,
3998 'title': video_title,
3999 'stitle': _simplify_title(video_title),
4005 self._downloader.process_info(info)
4006 except UnavailableVideoError, err:
4007 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
    """Base class for download post-processors.

    Instances register with a downloader via its add_post_processor()
    method; after a successful download the downloader invokes run() on
    each registered processor in turn, feeding each one the dictionary
    returned by the previous. A processor that returns None stops the
    chain. This mirrors the "mutual registration" scheme used by
    InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        """Remember the owning downloader (may be attached later)."""
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded item and return the (possibly updated) dict.

        `information` is an InfoExtractor-style dictionary extended with a
        "filepath" key naming the downloaded file. Returning None halts the
        post-processing chain; returning a dictionary passes it to the next
        processor. May raise PostProcessingError to signal failure to the
        downloader. The base implementation is a no-op pass-through.
        """
        return information
class AudioConversionError(Exception):
    """Raised when ffmpeg/ffprobe audio extraction or conversion fails.

    FIX: previously derived from BaseException; per PEP 352 user-defined
    exceptions must derive from Exception, otherwise generic
    `except Exception:` boundaries (and tools that rely on them) fail to
    catch it. Callers that catch AudioConversionError directly are
    unaffected.
    """
    def __init__(self, message):
        # Keep the historical `.message` attribute read by callers.
        self.message = message
# Post-processor that extracts the audio track from a downloaded video
# using ffmpeg/ffprobe. (Class body continues past this listing.)
4060 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: target audio codec name, or 'best' to keep/copy the
#   source codec when possible (defaulted below).
# preferredquality: bitrate string passed to ffmpeg's -ab (visible in run()).
# keepvideo: whether to retain the original video file after extraction.
4062 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
4063 PostProcessor.__init__(self, downloader)
4064 if preferredcodec is None:
4065 preferredcodec = 'best'
4066 self._preferredcodec = preferredcodec
4067 self._preferredquality = preferredquality
4068 self._keepvideo = keepvideo
# Probe `path` with ffprobe and return the name of its audio codec.
# NOTE(review): the return statements are in lines elided from this listing;
# visible code shows the probe and the codec_name/codec_type scan only.
4071 def get_audio_codec(path):
# Run ffprobe with stderr discarded; IOError/OSError (e.g. ffprobe missing)
# is handled in the except branch below.
4073 cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
4074 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
4075 output = handle.communicate()[0]
4076 if handle.wait() != 0:
4078 except (IOError, OSError):
# Scan the stream dump: remember the last codec_name seen, and accept it
# once a matching codec_type=audio line confirms it belongs to an audio
# stream.
4081 for line in output.split('\n'):
4082 if line.startswith('codec_name='):
4083 audio_codec = line.split('=')[1].strip()
4084 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Transcode `path` to `out_path` with ffmpeg using `codec` and extra
# options; raises AudioConversionError on failure.
# NOTE(review): lines handling the codec=None case appear to be elided from
# this listing (the numbering jumps before acodec_opts) — verify upstream.
4089 def run_ffmpeg(path, out_path, codec, more_opts):
4093 acodec_opts = ['-acodec', codec]
# '-vn' drops the video stream; '--' guards against option-like filenames.
4094 cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
4096 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4097 stdout,stderr = p.communicate()
4098 except (IOError, OSError):
# errno 2 == ENOENT: the ffmpeg binary itself is missing.
4099 e = sys.exc_info()[1]
4100 if isinstance(e, OSError) and e.errno == 2:
4101 raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
# Non-zero exit: surface ffmpeg's last stderr line as the error message.
4104 if p.returncode != 0:
4105 msg = stderr.strip().split('\n')[-1]
4106 raise AudioConversionError(msg)
4108 def run(self, information):
4109 path = information['filepath']
4111 filecodec = self.get_audio_codec(path)
4112 if filecodec is None:
4113 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
4117 if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
4118 if self._preferredcodec == 'm4a' and filecodec == 'aac':
4119 # Lossless, but in another container
4121 extension = self._preferredcodec
4122 more_opts = ['-absf', 'aac_adtstoasc']
4123 elif filecodec in ['aac', 'mp3', 'vorbis']:
4124 # Lossless if possible
4126 extension = filecodec
4127 if filecodec == 'aac':
4128 more_opts = ['-f', 'adts']
4129 if filecodec == 'vorbis':
4133 acodec = 'libmp3lame'
4136 if self._preferredquality is not None:
4137 more_opts += ['-ab', self._preferredquality]
4139 # We convert the audio (lossy)
4140 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
4141 extension = self._preferredcodec
4143 if self._preferredquality is not None:
4144 more_opts += ['-ab', self._preferredquality]
4145 if self._preferredcodec == 'aac':
4146 more_opts += ['-f', 'adts']
4147 if self._preferredcodec == 'm4a':
4148 more_opts += ['-absf', 'aac_adtstoasc']
4149 if self._preferredcodec == 'vorbis':
4151 if self._preferredcodec == 'wav':
4153 more_opts += ['-f', 'wav']
4155 prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
4156 new_path = prefix + sep + extension
4157 self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
4159 self.run_ffmpeg(path, new_path, acodec, more_opts)
4161 etype,e,tb = sys.exc_info()
4162 if isinstance(e, AudioConversionError):
4163 self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
4165 self._downloader.to_stderr(u'ERROR: error running ffmpeg')
4168 # Try to update the date time for extracted audio file.
4169 if information.get('filetime') is not None:
4171 os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
4173 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
4175 if not self._keepvideo:
4177 os.remove(_encodeFilename(path))
4178 except (IOError, OSError):
4179 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
4182 information['filepath'] = new_path
4186 def updateSelf(downloader, filename):
4187 ''' Update the program file with the latest version from the repository '''
4188 # Note: downloader only used for options
4189 if not os.access(filename, os.W_OK):
4190 sys.exit('ERROR: no write permissions on %s' % filename)
4192 downloader.to_screen(u'Updating to latest version...')
4196 urlh = urllib.urlopen(UPDATE_URL)
4197 newcontent = urlh.read()
4199 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4200 if vmatch is not None and vmatch.group(1) == __version__:
4201 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4205 except (IOError, OSError), err:
4206 sys.exit('ERROR: unable to download latest version')
4209 outf = open(filename, 'wb')
4211 outf.write(newcontent)
4214 except (IOError, OSError), err:
4215 sys.exit('ERROR: unable to overwrite current version')
4217 downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4220 def _readOptions(filename_bytes):
4222 optionf = open(filename_bytes)
4224 return [] # silently skip if file is not present
4228 res += shlex.split(l, comments=True)
4233 def _format_option_string(option):
4234 ''' ('-o', '--option') -> -o, --format METAVAR'''
4238 if option._short_opts: opts.append(option._short_opts[0])
4239 if option._long_opts: opts.append(option._long_opts[0])
4240 if len(opts) > 1: opts.insert(1, ', ')
4242 if option.takes_value(): opts.append(' %s' % option.metavar)
4244 return "".join(opts)
4246 def _find_term_columns():
4247 columns = os.environ.get('COLUMNS', None)
4252 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4253 out,err = sp.communicate()
4254 return int(out.split()[1])
def parseOpts():
    """Build the option parser, read config files and parse arguments.

    Returns the tuple (parser, opts, args).  Option sources are, in
    order: /etc/youtube-dl.conf, the per-user youtube-dl.conf (under
    $XDG_CONFIG_HOME or ~/.config), then sys.argv[1:].

    NOTE(review): reconstructed from a line-numbered dump whose `def`
    line and a few short lines were missing -- verify against the
    upstream 2012.02.26 source.
    """
    max_width = 80
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns: max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    kw = {
        'version': __version__,
        'formatter': fmt,
        'usage': '%prog [options] url [url...]',
        'conflict_handler': 'resolve',
    }

    parser = optparse.OptionParser(**kw)

    # Option groups; filled in below, attached to the parser at the end.
    general = optparse.OptionGroup(parser, 'General Options')
    selection = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
    selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--prefer-free-formats',
            action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')

    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)
    verbosity.add_option('-v', '--verbose',
            action='store_true', dest='verbose', help='print various debugging information', default=False)

    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)

    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')

    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    # System-wide config, then the user's config, then the command line
    # (later sources override earlier ones).
    xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
    if xdg_config_home:
        userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
    else:
        userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
    argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
    opts, args = parser.parse_args(argv)

    return parser, opts, args
# NOTE(review): this region is a line-numbered dump and is NOT contiguous --
# the `return [` line, many extractor entries and the list's closing bracket
# are missing. Code lines kept byte-identical; only comments added.
4426 def gen_extractors():
4427 """ Return a list of an instance of every supported extractor.
4428 The order does matter; the first extractor matched is the one handling the URL.
# Shared single-site extractor instances; the search/playlist/user
# extractors below take one of them as a constructor argument.
4430 youtube_ie = YoutubeIE()
4431 google_ie = GoogleIE()
4432 yahoo_ie = YahooIE()
# Entries of the returned list (truncated in this dump). Per the docstring,
# earlier entries get first chance at matching a URL.
4434 YoutubePlaylistIE(youtube_ie),
4435 YoutubeUserIE(youtube_ie),
4436 YoutubeSearchIE(youtube_ie),
4438 MetacafeIE(youtube_ie),
4441 GoogleSearchIE(google_ie),
4444 YahooSearchIE(yahoo_ie),
4457 StanfordOpenClassroomIE(),
# NOTE(review): interior of the script's main routine(s). This is a
# line-numbered dump with gaps: the `def` line(s), several `try:`/`else:`
# lines and other statements are missing. Code lines kept byte-identical;
# only comments added.
4464 parser, opts, args = parseOpts()
4466 # Open appropriate CookieJar
4467 if opts.cookiefile is None:
4468 jar = cookielib.CookieJar()
# (dump gap: the else-branch header before the MozillaCookieJar line and the
# jar.load() call after the isfile/access check are missing)
4471 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4472 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4474 except (IOError, OSError), err:
4475 sys.exit(u'ERROR: unable to open cookie file')
4478 if opts.dump_user_agent:
4479 print std_headers['User-Agent']
4482 # Batch file verification
4484 if opts.batchfile is not None:
4486 if opts.batchfile == '-':
4489 batchfd = open(opts.batchfile, 'r')
4490 batchurls = batchfd.readlines()
4491 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and lines starting with '#', '/' or ';' (comments).
4492 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4494 sys.exit(u'ERROR: batch file could not be read')
4495 all_urls = batchurls + args
4497 # General configuration
4498 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4499 proxy_handler = urllib2.ProxyHandler()
4500 opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4501 urllib2.install_opener(opener)
4502 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4505 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
# One instance of each supported extractor, in matching-priority order.
4507 extractors = gen_extractors()
4509 if opts.list_extractors:
4510 for ie in extractors:
4512 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4513 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4514 for mu in matchedUrls:
4518 # Conflicting, missing and erroneous options
4519 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4520 parser.error(u'using .netrc conflicts with giving username/password')
4521 if opts.password is not None and opts.username is None:
4522 parser.error(u'account username missing')
4523 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4524 parser.error(u'using output template conflicts with using title, literal title or auto number')
4525 if opts.usetitle and opts.useliteral:
4526 parser.error(u'using title conflicts with using literal title')
4527 if opts.username is not None and opts.password is None:
4528 opts.password = getpass.getpass(u'Type account password and press return:')
4529 if opts.ratelimit is not None:
4530 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4531 if numeric_limit is None:
4532 parser.error(u'invalid rate limit specified')
4533 opts.ratelimit = numeric_limit
# (dump gap: the try: lines guarding the numeric conversions below are missing)
4534 if opts.retries is not None:
4536 opts.retries = long(opts.retries)
4537 except (TypeError, ValueError), err:
4538 parser.error(u'invalid retry count specified')
4540 opts.playliststart = int(opts.playliststart)
4541 if opts.playliststart <= 0:
4542 raise ValueError(u'Playlist start must be positive')
4543 except (TypeError, ValueError), err:
4544 parser.error(u'invalid playlist start number specified')
4546 opts.playlistend = int(opts.playlistend)
4547 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4548 raise ValueError(u'Playlist end must be greater than playlist start')
4549 except (TypeError, ValueError), err:
4550 parser.error(u'invalid playlist end number specified')
4551 if opts.extractaudio:
4552 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4553 parser.error(u'invalid audio format specified')
# File downloader: its options dict is assembled from the parsed opts.
4556 fd = FileDownloader({
4557 'usenetrc': opts.usenetrc,
4558 'username': opts.username,
4559 'password': opts.password,
# Any of the "print and exit" modes implies quiet mode.
4560 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4561 'forceurl': opts.geturl,
4562 'forcetitle': opts.gettitle,
4563 'forcethumbnail': opts.getthumbnail,
4564 'forcedescription': opts.getdescription,
4565 'forcefilename': opts.getfilename,
4566 'forceformat': opts.getformat,
4567 'simulate': opts.simulate,
4568 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4569 'format': opts.format,
4570 'format_limit': opts.format_limit,
4571 'listformats': opts.listformats,
# outtmpl: the first truthy expression in this and/or chain wins; an
# explicit -o template takes precedence, then format/title/number defaults.
4572 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4573 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4574 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4575 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4576 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4577 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4578 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4579 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4580 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4581 or u'%(id)s.%(ext)s'),
4582 'ignoreerrors': opts.ignoreerrors,
4583 'ratelimit': opts.ratelimit,
4584 'nooverwrites': opts.nooverwrites,
4585 'retries': opts.retries,
4586 'continuedl': opts.continue_dl,
4587 'noprogress': opts.noprogress,
4588 'playliststart': opts.playliststart,
4589 'playlistend': opts.playlistend,
4590 'logtostderr': opts.outtmpl == '-',
4591 'consoletitle': opts.consoletitle,
4592 'nopart': opts.nopart,
4593 'updatetime': opts.updatetime,
4594 'writedescription': opts.writedescription,
4595 'writeinfojson': opts.writeinfojson,
4596 'matchtitle': opts.matchtitle,
4597 'rejecttitle': opts.rejecttitle,
4598 'max_downloads': opts.max_downloads,
4599 'prefer_free_formats': opts.prefer_free_formats,
4600 'verbose': opts.verbose,
4602 for extractor in extractors:
4603 fd.add_info_extractor(extractor)
# Post-processor wiring: --extract-audio adds the ffmpeg audio extractor.
4606 if opts.extractaudio:
4607 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# --update: replace this script file with the latest released version.
4610 if opts.update_self:
4611 updateSelf(fd, sys.argv[0])
4614 if len(all_urls) < 1:
4615 if not opts.update_self:
4616 parser.error(u'you must provide at least one URL')
4621 retcode = fd.download(all_urls)
4622 except MaxDownloadsReached:
4623 fd.to_screen(u'--max-download limit reached, aborting.')
4626 # Dump cookie jar if requested
4627 if opts.cookiefile is not None:
4630 except (IOError, OSError), err:
4631 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level error handling of the wrapper entry point.
4638 except DownloadError:
4640 except SameFileError:
4641 sys.exit(u'ERROR: fixed output name but more than one file to download')
4642 except KeyboardInterrupt:
4643 sys.exit(u'\nERROR: Interrupted by user')
# Script entry guard (dump gap: the call under this guard is missing).
4645 if __name__ == '__main__':
4648 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: