2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.18'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually works before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		# Unknown/broken locale: fall back to a safe default.
		pref = 'UTF-8'
	return pref
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: decimal (&#233;) or hex with an 'x' prefix (&#x..).
# NOTE(review): \d only matches decimal digits, so hex entities containing
# a-f (e.g. &#xe9;) do not match and fall through to the literal branch —
# confirm whether that is intended.
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
# Prefix with '0' so int/long parses '0x...' as hexadecimal.
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
# First decode HTML entities, then replace the path separator so the
# title cannot point outside the target directory.
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
# Special case: on Windows force stdout into binary mode so video data
# is not corrupted by newline translation.
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns the POSIX timestamp as an int/float, or None when the
	string cannot be parsed (email.utils.parsedate_tz returns None).
	"""
	# Initialize explicitly so unparsable input returns None instead of
	# raising NameError.
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to indicate an error in the
	postprocessing task.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that is not available
	for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when the amount of downloaded data
	is smaller than what the server announced, indicating the connection
	was probably interrupted.
	"""

	def __init__(self, downloaded, expected):
		# Keep both byte counts so the caller can report the mismatch.
		self.downloaded = downloaded
		self.expected = expected
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
# The two returns below handle raw deflate (-MAX_WBITS, no zlib header)
# and zlib-wrapped deflate streams; presumably the first is attempted and
# the second is the except-branch fallback — confirm against full source.
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
# Builds an addinfourl response object, passing the status code only on
# Python versions whose addinfourl supports getcode().
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
# Add the default headers without overwriting ones the caller set.
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
# The marker header is internal only; strip it (and Accept-encoding)
# before the request goes on the wire.
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
# Transparently decompress gzip and deflate bodies, rebuilding the
# response object so callers see plain data.
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
# Return code of the last batch (0 = success, 1 = at least one failure).
454 _download_retcode = None
# Ordinal used by the %(autonumber)s output-template field.
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
# Bool-indexed pair: False -> stdout, True -> stderr (logtostderr option).
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
@staticmethod
def format_bytes(bytes):
	"""Return a human-readable size string (e.g. '1.00k') for a byte count.

	Accepts None (returns 'N/A') and numeric strings; suffixes are
	b, k, M, G, T, P, E, Z, Y in powers of 1024.
	"""
	if bytes is None:
		return 'N/A'
	if type(bytes) is str:
		bytes = float(bytes)
	if bytes == 0.0:
		# log(0) is undefined; zero bytes formats as '0.00b'.
		exponent = 0
	else:
		# int() instead of the Python-2-only long(); identical result.
		exponent = int(math.log(bytes, 1024.0))
	suffix = 'bkMGTPEZY'[exponent]
	converted = float(bytes) / float(1024 ** exponent)
	return '%.2f%s' % (converted, suffix)
@staticmethod
def calc_percent(byte_counter, data_len):
	"""Format download progress as a fixed-width (6-char) percentage.

	Returns '---.-%' when the total length is unknown.
	"""
	if data_len is None:
		return '---.-%'
	return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
@staticmethod
def calc_eta(start, now, total, current):
	"""Estimate remaining time as 'MM:SS' from bytes done vs. total.

	Returns '--:--' when the ETA cannot be computed (unknown total,
	nothing downloaded yet, elapsed time too small, or > 99 minutes).
	"""
	if total is None:
		return '--:--'
	dif = now - start
	if current == 0 or dif < 0.001: # One millisecond
		return '--:--'
	rate = float(current) / dif
	# int() instead of the Python-2-only long(); identical result.
	eta = int((float(total) - float(current)) / rate)
	(eta_mins, eta_secs) = divmod(eta, 60)
	if eta_mins > 99:
		return '--:--'
	return '%02d:%02d' % (eta_mins, eta_secs)
@staticmethod
def calc_speed(start, now, bytes):
	"""Format the average speed since *start* as a fixed-width string.

	Returns the placeholder '---b/s' when nothing was transferred or
	the elapsed time is below one millisecond.
	"""
	dif = now - start
	if bytes == 0 or dif < 0.001: # One millisecond
		return '%10s' % '---b/s'
	return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
@staticmethod
def best_block_size(elapsed_time, bytes):
	"""Choose the next read size from the last block's observed rate.

	The result is clamped between half and double the previous block
	size, and never exceeds 4 MB.
	"""
	new_min = max(bytes / 2.0, 1.0)
	new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
	if elapsed_time < 0.001:
		# Timer resolution too coarse: be optimistic and double.
		return int(new_max)
	rate = bytes / elapsed_time
	if rate > new_max:
		return int(new_max)
	if rate < new_min:
		return int(new_min)
	# int() instead of the Python-2-only long(); identical result.
	return int(rate)
@staticmethod
def parse_bytes(bytestr):
	"""Parse a string indicating a byte quantity into a long integer.

	Accepts an optional decimal part and an optional 1024-based suffix
	(k/M/G/T/P/E/Z/Y, case-insensitive); returns None on bad input.
	"""
	matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
	if matchobj is None:
		return None
	number = float(matchobj.group(1))
	# An empty suffix indexes to 0, i.e. multiplier 1024**0 == 1.
	multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
	# int() instead of the Python-2-only long(); identical result.
	return int(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
# Mutual registration: the IE keeps a reference back to this downloader.
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
# Mutual registration, same as for info extractors.
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
# skip_eol picks the terminator: False -> newline, True -> nothing
# (used for the in-place '\r' progress line).
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
# Encoding failures are fatal unless the caller opted out.
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
552 def to_stderr(self, message):
553 """Print message to stderr."""
554 print >>sys.stderr, message.encode(preferredencoding())
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
# xterm OSC escape sequence: ESC ] 0 ; <title> BEL sets the window title.
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
# "Fixed" means no %(field)s placeholders, so every download would
# write to the same file name.
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
571 def trouble(self, message=None):
572 """Determine action to take when a download problem appears.
574 Depending on if the downloader has been configured to ignore
575 download errors or not, this method may throw an exception or
576 not when errors are found, after printing the message.
578 if message is not None:
579 self.to_stderr(message)
580 if not self.params.get('ignoreerrors', False):
581 raise DownloadError(message)
# When errors are ignored we only record a non-zero exit code.
582 self._download_retcode = 1
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
# Sleep just long enough that the average rate falls back to the limit.
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
# No .part file when disabled, when writing to stdout ('-'), or when the
# target exists but is not a regular file (e.g. a device or pipe).
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
604 def undo_temp_name(self, filename):
"""Strip the '.part' suffix added by temp_name, if present."""
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
609 def try_rename(self, old_filename, new_filename):
"""Rename the temporary file to its final name, reporting failures."""
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
# atime := now, mtime := the server-reported Last-Modified value.
630 os.utime(filename, (time.time(), filetime))
# Thin user-feedback helpers; all output funnels through to_screen().
635 def report_writedescription(self, descfn):
636 """ Report that the description file is being written """
637 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
639 def report_writeinfojson(self, infofn):
640 """ Report that the metadata file has been written """
641 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
643 def report_destination(self, filename):
644 """Report destination filename."""
645 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
647 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648 """Report download progress."""
649 if self.params.get('noprogress', False):
# '\r' + skip_eol redraws the progress line in place.
651 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
656 def report_resuming_byte(self, resume_len):
657 """Report attempt to resume at given byte."""
658 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
660 def report_retry(self, count, retries):
661 """Report retry in case of HTTP error 5xx"""
662 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
664 def report_file_already_downloaded(self, file_name):
665 """Report file has already been fully downloaded."""
667 self.to_screen(u'[download] %s has already been downloaded' % file_name)
# Fall back to a generic message if the filename cannot be encoded.
668 except (UnicodeEncodeError), err:
669 self.to_screen(u'[download] The file has already been downloaded')
671 def report_unable_to_resume(self):
672 """Report it was impossible to resume download."""
673 self.to_screen(u'[download] Unable to resume')
675 def report_finish(self):
676 """Report download finished."""
677 if self.params.get('noprogress', False):
678 self.to_screen(u'[download] Download completed')
682 def increment_downloads(self):
683 """Increment the ordinal that assigns a number to each file."""
684 self._num_downloads += 1
686 def prepare_filename(self, info_dict):
687 """Generate the output filename."""
# Work on a copy so the extra template fields don't leak into info_dict.
689 template_dict = dict(info_dict)
690 template_dict['epoch'] = unicode(long(time.time()))
# Zero-padded to five digits for stable lexical sorting.
691 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692 filename = self.params['outtmpl'] % template_dict
694 except (ValueError, KeyError), err:
695 self.trouble(u'ERROR: invalid system charset or erroneous output template')
698 def process_info(self, info_dict):
699 """Process a single dictionary returned by an InfoExtractor."""
700 filename = self.prepare_filename(info_dict)
# --force-* options print the requested metadata field to stdout.
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
713 if self.params.get('forceformat', False):
714 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
716 # Do nothing else if in simulate mode
717 if self.params.get('simulate', False):
# Title match/reject filtering happens before any file is touched.
723 matchtitle=self.params.get('matchtitle',False)
724 rejecttitle=self.params.get('rejecttitle',False)
725 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
726 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
727 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
729 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
730 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
733 if self.params.get('nooverwrites', False) and os.path.exists(filename):
734 self.to_stderr(u'WARNING: file exists and will be skipped')
# Create the destination directory if needed.
738 dn = os.path.dirname(filename)
739 if dn != '' and not os.path.exists(dn):
741 except (OSError, IOError), err:
742 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
745 if self.params.get('writedescription', False):
747 descfn = filename + '.description'
748 self.report_writedescription(descfn)
749 descfile = open(descfn, 'wb')
751 descfile.write(info_dict['description'].encode('utf-8'))
754 except (OSError, IOError):
755 self.trouble(u'ERROR: Cannot write description file ' + descfn)
758 if self.params.get('writeinfojson', False):
759 infofn = filename + '.info.json'
760 self.report_writeinfojson(infofn)
# On Python < 2.6 the json name may be missing (trivialjson fallback).
763 except (NameError,AttributeError):
764 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
767 infof = open(infofn, 'wb')
769 json.dump(info_dict, infof)
772 except (OSError, IOError):
773 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
776 if not self.params.get('skip_download', False):
778 success = self._do_download(filename, info_dict)
779 except (OSError, IOError), err:
780 raise UnavailableVideoError
781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
782 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
784 except (ContentTooShortError, ), err:
785 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
# Hand the finished file to the postprocessing chain.
790 self.post_process(filename, info_dict)
791 except (PostProcessingError), err:
792 self.trouble(u'ERROR: postprocessing: %s' % str(err))
795 def download(self, url_list):
796 """Download a given list of URLs."""
# A fixed template (no placeholders) would force every URL into the
# same output file.
797 if len(url_list) > 1 and self.fixed_template():
798 raise SameFileError(self.params['outtmpl'])
801 suitable_found = False
803 # Go to next InfoExtractor if not suitable
804 if not ie.suitable(url):
807 # Suitable InfoExtractor found
808 suitable_found = True
810 # Extract information from URL and process it
813 # Suitable InfoExtractor had been found; go to next URL
816 if not suitable_found:
817 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819 return self._download_retcode
821 def post_process(self, filename, ie_info):
822 """Run the postprocessing chain on the given file."""
# NOTE(review): presumably `info` is a copy of ie_info built just above
# so postprocessors can mutate it safely — confirm against full source.
824 info['filepath'] = filename
830 def _download_with_rtmpdump(self, filename, url, player_url):
"""Download an rtmp:// URL by shelling out to the external rtmpdump tool."""
831 self.report_destination(filename)
832 tmpfilename = self.temp_name(filename)
834 # Check for rtmpdump first
836 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
837 except (OSError, IOError):
838 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
841 # Download using rtmpdump. rtmpdump returns exit code 2 when
842 # the connection was interrumpted and resuming appears to be
843 # possible. This is part of rtmpdump's normal usage, AFAIK.
# Bool-indexed pairs pick optional args: [] when False, the flags when True.
844 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
845 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
846 while retval == 2 or retval == 1:
847 prevsize = os.path.getsize(tmpfilename)
848 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
849 time.sleep(5.0) # This seems to be needed
850 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
851 cursize = os.path.getsize(tmpfilename)
# No progress between retries: stop looping.
852 if prevsize == cursize and retval == 1:
854 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
855 if prevsize == cursize and retval == 2 and cursize > 1024:
856 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
860 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
861 self.try_rename(tmpfilename, filename)
864 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
867 def _do_download(self, filename, info_dict):
"""Download info_dict['url'] to *filename* over HTTP (or via rtmpdump), with resume and retry support."""
868 url = info_dict['url']
869 player_url = info_dict.get('player_url', None)
871 # Check file already present
872 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
873 self.report_file_already_downloaded(filename)
876 # Attempt to download using rtmpdump
877 if url.startswith('rtmp'):
878 return self._download_with_rtmpdump(filename, url, player_url)
880 tmpfilename = self.temp_name(filename)
883 # Do not include the Accept-Encoding header
# basic_request is kept without the Range header so it can be reused
# when a resume attempt gets HTTP 416.
884 headers = {'Youtubedl-no-compression': 'True'}
885 basic_request = urllib2.Request(url, None, headers)
886 request = urllib2.Request(url, None, headers)
888 # Establish possible resume length
889 if os.path.isfile(tmpfilename):
890 resume_len = os.path.getsize(tmpfilename)
896 if self.params.get('continuedl', False):
897 self.report_resuming_byte(resume_len)
898 request.add_header('Range','bytes=%d-' % resume_len)
904 retries = self.params.get('retries', 0)
905 while count <= retries:
906 # Establish connection
908 data = urllib2.urlopen(request)
910 except (urllib2.HTTPError, ), err:
# Only 5xx errors are retried; 416 means the range was unsatisfiable.
911 if (err.code < 500 or err.code >= 600) and err.code != 416:
912 # Unexpected HTTP error
914 elif err.code == 416:
915 # Unable to resume (requested range not satisfiable)
917 # Open the connection again without the range header
918 data = urllib2.urlopen(basic_request)
919 content_length = data.info()['Content-Length']
920 except (urllib2.HTTPError, ), err:
921 if err.code < 500 or err.code >= 600:
924 # Examine the reported length
925 if (content_length is not None and
926 (resume_len - 100 < long(content_length) < resume_len + 100)):
927 # The file had already been fully downloaded.
928 # Explanation to the above condition: in issue #175 it was revealed that
929 # YouTube sometimes adds or removes a few bytes from the end of the file,
930 # changing the file size slightly and causing problems for some users. So
931 # I decided to implement a suggested change and consider the file
932 # completely downloaded if the file size differs less than 100 bytes from
933 # the one in the hard drive.
934 self.report_file_already_downloaded(filename)
935 self.try_rename(tmpfilename, filename)
938 # The length does not match, we start the download over
939 self.report_unable_to_resume()
945 self.report_retry(count, retries)
948 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Total expected size = remaining bytes reported + what we already have.
951 data_len = data.info().get('Content-length', None)
952 if data_len is not None:
953 data_len = long(data_len) + resume_len
954 data_len_str = self.format_bytes(data_len)
955 byte_counter = 0 + resume_len
961 data_block = data.read(block_size)
963 if len(data_block) == 0:
965 byte_counter += len(data_block)
967 # Open file just in time
970 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
971 assert stream is not None
972 filename = self.undo_temp_name(tmpfilename)
973 self.report_destination(filename)
974 except (OSError, IOError), err:
975 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
978 stream.write(data_block)
979 except (IOError, OSError), err:
980 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
# Adapt the next read size to the observed throughput.
982 block_size = self.best_block_size(after - before, len(data_block))
985 percent_str = self.calc_percent(byte_counter, data_len)
986 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
987 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
988 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
991 self.slow_down(start, byte_counter - resume_len)
994 self.trouble(u'\nERROR: Did not get any data blocks')
998 if data_len is not None and byte_counter != data_len:
999 raise ContentTooShortError(byte_counter, long(data_len))
1000 self.try_rename(tmpfilename, filename)
1002 # Update file modification time
1003 if self.params.get('updatetime', True):
1004 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1009 class InfoExtractor(object):
1010 """Information Extractor class.
1012 Information extractors are the classes that, given a URL, extract
1013 information from the video (or videos) the URL refers to. This
1014 information includes the real video URL, the video title and simplified
1015 title, author and others. The information is stored in a dictionary
1016 which is then passed to the FileDownloader. The FileDownloader
1017 processes this information possibly downloading the video to the file
1018 system, among other possible outcomes. The dictionaries must include
1019 the following fields:
1021 id: Video identifier.
1022 url: Final video URL.
1023 uploader: Nickname of the video uploader.
1024 title: Literal title.
1025 stitle: Simplified title.
1026 ext: Video filename extension.
1027 format: Video format.
1028 player_url: SWF Player URL (may be None).
1030 The following fields are optional. Their primary purpose is to allow
1031 youtube-dl to serve as the backend for a video search function, such
1032 as the one in youtube2mp3. They are only used when their respective
1033 forced printing functions are called:
1035 thumbnail: Full URL to a video thumbnail image.
1036 description: One-line video description.
1038 Subclasses of this one should re-define the _real_initialize() and
1039 _real_extract() methods and define a _VALID_URL regexp.
1040 Probably, they should also be added to the list of extractors.
1046 def __init__(self, downloader=None):
1047 """Constructor. Receives an optional downloader."""
1049 self.set_downloader(downloader)
1051 def suitable(self, url):
1052 """Receives a URL and returns True if suitable for this IE."""
1053 return re.match(self._VALID_URL, url) is not None
1055 def initialize(self):
1056 """Initializes an instance (authentication, etc)."""
# NOTE(review): presumably guarded by a "ready" flag on lines not shown
# here so _real_initialize runs only once — confirm against full source.
1058 self._real_initialize()
1061 def extract(self, url):
1062 """Extracts URL information and returns it in list of dicts."""
1064 return self._real_extract(url)
1066 def set_downloader(self, downloader):
1067 """Sets the downloader for this IE."""
1068 self._downloader = downloader
1070 def _real_initialize(self):
1071 """Real initialization process. Redefine in subclasses."""
1074 def _real_extract(self, url):
1075 """Real extraction process. Redefine in subclasses."""
1079 class YoutubeIE(InfoExtractor):
1080 """Information extractor for youtube.com."""
1082 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1083 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1084 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1085 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1086 _NETRC_MACHINE = 'youtube'
1087 # Listed in order of quality
1088 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1089 _video_extensions = {
1095 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1099 IE_NAME = u'youtube'
# Thin user-feedback helpers; all output goes through the downloader.
1101 def report_lang(self):
1102 """Report attempt to set language."""
1103 self._downloader.to_screen(u'[youtube] Setting language')
1105 def report_login(self):
1106 """Report attempt to log in."""
1107 self._downloader.to_screen(u'[youtube] Logging in')
1109 def report_age_confirmation(self):
1110 """Report attempt to confirm age."""
1111 self._downloader.to_screen(u'[youtube] Confirming age')
1113 def report_video_webpage_download(self, video_id):
1114 """Report attempt to download video webpage."""
1115 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1117 def report_video_info_webpage_download(self, video_id):
1118 """Report attempt to download video info webpage."""
1119 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1121 def report_information_extraction(self, video_id):
1122 """Report attempt to extract video information."""
1123 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1125 def report_unavailable_format(self, video_id, format):
"""Report that the requested format is not available for this video."""
1127 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1129 def report_rtmp_download(self):
1130 """Indicate the download will use the RTMP protocol."""
1131 self._downloader.to_screen(u'[youtube] RTMP download detected')
1133 def _real_initialize(self):
1134 if self._downloader is None:
1139 downloader_params = self._downloader.params
1141 # Attempt to use provided username and password or .netrc data
1142 if downloader_params.get('username', None) is not None:
1143 username = downloader_params['username']
1144 password = downloader_params['password']
1145 elif downloader_params.get('usenetrc', False):
1147 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1148 if info is not None:
1152 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1153 except (IOError, netrc.NetrcParseError), err:
1154 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1158 request = urllib2.Request(self._LANG_URL)
1161 urllib2.urlopen(request).read()
1162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1163 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1166 # No authentication to be performed
1167 if username is None:
1172 'current_form': 'loginForm',
1174 'action_login': 'Log In',
1175 'username': username,
1176 'password': password,
1178 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1181 login_results = urllib2.urlopen(request).read()
1182 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1183 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1192 'action_confirm': 'Confirm',
1194 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1196 self.report_age_confirmation()
1197 age_results = urllib2.urlopen(request).read()
1198 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1199 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1202 def _real_extract(self, url):
1203 # Extract video id from URL
1204 mobj = re.match(self._VALID_URL, url)
1206 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1208 video_id = mobj.group(2)
1211 self.report_video_webpage_download(video_id)
1212 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1214 video_webpage = urllib2.urlopen(request).read()
1215 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1216 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1219 # Attempt to extract SWF player URL
1220 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1221 if mobj is not None:
1222 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1227 self.report_video_info_webpage_download(video_id)
1228 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1229 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1230 % (video_id, el_type))
1231 request = urllib2.Request(video_info_url)
1233 video_info_webpage = urllib2.urlopen(request).read()
1234 video_info = parse_qs(video_info_webpage)
1235 if 'token' in video_info:
1237 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1238 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1240 if 'token' not in video_info:
1241 if 'reason' in video_info:
1242 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1244 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1247 # Start extracting information
1248 self.report_information_extraction(video_id)
1251 if 'author' not in video_info:
1252 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1254 video_uploader = urllib.unquote_plus(video_info['author'][0])
1257 if 'title' not in video_info:
1258 self._downloader.trouble(u'ERROR: unable to extract video title')
1260 video_title = urllib.unquote_plus(video_info['title'][0])
1261 video_title = video_title.decode('utf-8')
1262 video_title = sanitize_title(video_title)
1265 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1266 simple_title = simple_title.strip(ur'_')
1269 if 'thumbnail_url' not in video_info:
1270 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1271 video_thumbnail = ''
1272 else: # don't panic if we can't find it
1273 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1277 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1278 if mobj is not None:
1279 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1280 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1281 for expression in format_expressions:
1283 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1291 video_description = u'No description available.'
1292 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1293 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1294 if mobj is not None:
1295 video_description = mobj.group(1).decode('utf-8')
1297 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1298 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1299 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1300 # TODO use another parser
1303 video_token = urllib.unquote_plus(video_info['token'][0])
1305 # Decide which formats to download
1306 req_format = self._downloader.params.get('format', None)
1308 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1309 self.report_rtmp_download()
1310 video_url_list = [(None, video_info['conn'][0])]
1311 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1312 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1313 url_data = [parse_qs(uds) for uds in url_data_strs]
1314 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1315 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1317 format_limit = self._downloader.params.get('format_limit', None)
1318 if format_limit is not None and format_limit in self._available_formats:
1319 format_list = self._available_formats[self._available_formats.index(format_limit):]
1321 format_list = self._available_formats
1322 existing_formats = [x for x in format_list if x in url_map]
1323 if len(existing_formats) == 0:
1324 self._downloader.trouble(u'ERROR: no known formats available for video')
1326 if req_format is None or req_format == 'best':
1327 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1328 elif req_format == 'worst':
1329 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1330 elif req_format in ('-1', 'all'):
1331 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1333 # Specific formats. We pick the first in a slash-delimeted sequence.
1334 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1335 req_formats = req_format.split('/')
1336 video_url_list = None
1337 for rf in req_formats:
1339 video_url_list = [(rf, url_map[rf])]
1341 if video_url_list is None:
1342 self._downloader.trouble(u'ERROR: requested format not available')
1345 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1348 for format_param, video_real_url in video_url_list:
1349 # At this point we have a new video
1350 self._downloader.increment_downloads()
1353 video_extension = self._video_extensions.get(format_param, 'flv')
1356 # Process video information
1357 self._downloader.process_info({
1358 'id': video_id.decode('utf-8'),
1359 'url': video_real_url.decode('utf-8'),
1360 'uploader': video_uploader.decode('utf-8'),
1361 'upload_date': upload_date,
1362 'title': video_title,
1363 'stitle': simple_title,
1364 'ext': video_extension.decode('utf-8'),
1365 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1366 'thumbnail': video_thumbnail.decode('utf-8'),
1367 'description': video_description,
1368 'player_url': player_url,
1370 except UnavailableVideoError, err:
1371 self._downloader.trouble(u'\nERROR: unable to download video')
1374 class MetacafeIE(InfoExtractor):
1375 """Information Extractor for metacafe.com."""
# NOTE(review): this listing embeds original source line numbers; gaps in the
# numbering mean lines ('try:'/'return'/'else:' etc.) are elided from this view.
# _VALID_URL: group(1) is the video id, group(2) the simplified title slug.
1377 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1378 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1379 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1381 IE_NAME = u'metacafe'
# Takes a YoutubeIE instance so 'yt-' prefixed ids can be delegated to it.
1383 def __init__(self, youtube_ie, downloader=None):
1384 InfoExtractor.__init__(self, downloader)
1385 self._youtube_ie = youtube_ie
1387 def report_disclaimer(self):
1388 """Report disclaimer retrieval."""
1389 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1391 def report_age_confirmation(self):
1392 """Report attempt to confirm age."""
1393 self._downloader.to_screen(u'[metacafe] Confirming age')
1395 def report_download_webpage(self, video_id):
1396 """Report webpage download."""
1397 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1399 def report_extraction(self, video_id):
1400 """Report information extraction."""
1401 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# One-time setup: fetch the family-filter disclaimer page, then POST the
# age-confirmation form so filtered videos become accessible.
1403 def _real_initialize(self):
1404 # Retrieve disclaimer
1405 request = urllib2.Request(self._DISCLAIMER)
1407 self.report_disclaimer()
1408 disclaimer = urllib2.urlopen(request).read()
1409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1410 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1416 'submit': "Continue - I'm over 18",
1418 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1420 self.report_age_confirmation()
1421 disclaimer = urllib2.urlopen(request).read()
1422 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1423 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1426 def _real_extract(self, url):
1427 # Extract id and simplified title from URL
1428 mobj = re.match(self._VALID_URL, url)
1430 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1433 video_id = mobj.group(1)
1435 # Check if video comes from YouTube
# Metacafe mirrors some YouTube videos under ids like 'yt-<youtube id>';
# those are delegated to the YouTube extractor.
1436 mobj2 = re.match(r'^yt-(.*)$', video_id)
1437 if mobj2 is not None:
1438 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1441 # At this point we have a new video
1442 self._downloader.increment_downloads()
1444 simple_title = mobj.group(2).decode('utf-8')
1446 # Retrieve video webpage to extract further information
1447 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1449 self.report_download_webpage(video_id)
1450 webpage = urllib2.urlopen(request).read()
1451 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1452 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1455 # Extract URL, uploader and title from webpage
1456 self.report_extraction(video_id)
# Primary path: direct &mediaURL= parameter (optionally signed with gdaKey);
# fallback path below parses the flashvars 'mediaData' JSON-ish blob.
1457 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1458 if mobj is not None:
1459 mediaURL = urllib.unquote(mobj.group(1))
1460 video_extension = mediaURL[-3:]
1462 # Extract gdaKey if available
1463 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1465 video_url = mediaURL
1467 gdaKey = mobj.group(1)
1468 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1470 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1472 self._downloader.trouble(u'ERROR: unable to extract media URL')
1474 vardict = parse_qs(mobj.group(1))
1475 if 'mediaData' not in vardict:
1476 self._downloader.trouble(u'ERROR: unable to extract media URL')
1478 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1480 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape JS-style '\/' and append the signing key as __gda__.
1482 mediaURL = mobj.group(1).replace('\\/', '/')
1483 video_extension = mediaURL[-3:]
1484 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1486 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1488 self._downloader.trouble(u'ERROR: unable to extract title')
1490 video_title = mobj.group(1).decode('utf-8')
1491 video_title = sanitize_title(video_title)
1493 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1495 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1497 video_uploader = mobj.group(1)
1500 # Process video information
1501 self._downloader.process_info({
1502 'id': video_id.decode('utf-8'),
1503 'url': video_url.decode('utf-8'),
1504 'uploader': video_uploader.decode('utf-8'),
1505 'upload_date': u'NA',
1506 'title': video_title,
1507 'stitle': simple_title,
1508 'ext': video_extension.decode('utf-8'),
1512 except UnavailableVideoError:
1513 self._downloader.trouble(u'\nERROR: unable to download video')
1516 class DailymotionIE(InfoExtractor):
1517 """Information Extractor for Dailymotion"""
# NOTE(review): this listing embeds original source line numbers; gaps in the
# numbering mean lines ('try:'/'return'/'else:' etc.) are elided from this view.
# _VALID_URL: group(1) is the video id (before '_'), group(2) the title slug.
1519 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1520 IE_NAME = u'dailymotion'
1522 def __init__(self, downloader=None):
1523 InfoExtractor.__init__(self, downloader)
1525 def report_download_webpage(self, video_id):
1526 """Report webpage download."""
1527 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1529 def report_extraction(self, video_id):
1530 """Report information extraction."""
1531 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided in this excerpt).
1533 def _real_initialize(self):
1536 def _real_extract(self, url):
1537 # Extract id and simplified title from URL
1538 mobj = re.match(self._VALID_URL, url)
1540 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1543 # At this point we have a new video
1544 self._downloader.increment_downloads()
1545 video_id = mobj.group(1)
1547 simple_title = mobj.group(2).decode('utf-8')
1548 video_extension = 'flv'
1550 # Retrieve video webpage to extract further information
1551 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages are served in full.
1552 request.add_header('Cookie', 'family_filter=off')
1554 self.report_download_webpage(video_id)
1555 webpage = urllib2.urlopen(request).read()
1556 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1557 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1560 # Extract URL, uploader and title from webpage
1561 self.report_extraction(video_id)
# The player's 'sequence' flashvar is a URL-encoded blob containing the
# standard-definition stream URL under "sdURL".
1562 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1564 self._downloader.trouble(u'ERROR: unable to extract media URL')
1566 sequence = urllib.unquote(mobj.group(1))
1567 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1569 self._downloader.trouble(u'ERROR: unable to extract media URL')
1571 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1573 # if needed add http://www.dailymotion.com/ if relative URL
1575 video_url = mediaURL
1577 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1579 self._downloader.trouble(u'ERROR: unable to extract title')
1581 video_title = mobj.group(1).decode('utf-8')
1582 video_title = sanitize_title(video_title)
1584 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1586 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1588 video_uploader = mobj.group(1)
1591 # Process video information
1592 self._downloader.process_info({
1593 'id': video_id.decode('utf-8'),
1594 'url': video_url.decode('utf-8'),
1595 'uploader': video_uploader.decode('utf-8'),
1596 'upload_date': u'NA',
1597 'title': video_title,
1598 'stitle': simple_title,
1599 'ext': video_extension.decode('utf-8'),
1603 except UnavailableVideoError:
1604 self._downloader.trouble(u'\nERROR: unable to download video')
1607 class GoogleIE(InfoExtractor):
1608 """Information extractor for video.google.com."""
# NOTE(review): this listing embeds original source line numbers; gaps in the
# numbering mean lines ('try:'/'return'/'else:' etc.) are elided from this view.
1610 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1611 IE_NAME = u'video.google'
1613 def __init__(self, downloader=None):
1614 InfoExtractor.__init__(self, downloader)
1616 def report_download_webpage(self, video_id):
1617 """Report webpage download."""
1618 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1620 def report_extraction(self, video_id):
1621 """Report information extraction."""
1622 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided in this excerpt).
1624 def _real_initialize(self):
1627 def _real_extract(self, url):
1628 # Extract id from URL
1629 mobj = re.match(self._VALID_URL, url)
1631 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1634 # At this point we have a new video
1635 self._downloader.increment_downloads()
1636 video_id = mobj.group(1)
1638 video_extension = 'mp4'
1640 # Retrieve video webpage to extract further information
1641 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1643 self.report_download_webpage(video_id)
1644 webpage = urllib2.urlopen(request).read()
1645 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1646 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1649 # Extract URL, uploader, and title from webpage
1650 self.report_extraction(video_id)
# Preferred path: the mp4 'download_url'. Fallback below switches to the
# flv 'videoUrl', which is JS-hex-escaped (\x3d/\x26) and must be unescaped.
1651 mobj = re.search(r"download_url:'([^']+)'", webpage)
1653 video_extension = 'flv'
1654 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1656 self._downloader.trouble(u'ERROR: unable to extract media URL')
1658 mediaURL = urllib.unquote(mobj.group(1))
1659 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1660 mediaURL = mediaURL.replace('\\x26', '\x26')
1662 video_url = mediaURL
1664 mobj = re.search(r'<title>(.*)</title>', webpage)
1666 self._downloader.trouble(u'ERROR: unable to extract title')
1668 video_title = mobj.group(1).decode('utf-8')
1669 video_title = sanitize_title(video_title)
1670 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1672 # Extract video description
1673 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1675 self._downloader.trouble(u'ERROR: unable to extract video description')
1677 video_description = mobj.group(1).decode('utf-8')
1678 if not video_description:
1679 video_description = 'No description available.'
1681 # Extract video thumbnail
# Thumbnail requires an extra search-page request, so it is only fetched
# when the user explicitly asked for it via 'forcethumbnail'.
1682 if self._downloader.params.get('forcethumbnail', False):
1683 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1685 webpage = urllib2.urlopen(request).read()
1686 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1687 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1689 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1691 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1693 video_thumbnail = mobj.group(1)
1694 else: # we need something to pass to process_info
1695 video_thumbnail = ''
1698 # Process video information
1699 self._downloader.process_info({
1700 'id': video_id.decode('utf-8'),
1701 'url': video_url.decode('utf-8'),
1703 'upload_date': u'NA',
1704 'title': video_title,
1705 'stitle': simple_title,
1706 'ext': video_extension.decode('utf-8'),
1710 except UnavailableVideoError:
1711 self._downloader.trouble(u'\nERROR: unable to download video')
1714 class PhotobucketIE(InfoExtractor):
1715 """Information extractor for photobucket.com."""
# NOTE(review): this listing embeds original source line numbers; gaps in the
# numbering mean lines ('try:'/'return'/'else:' etc.) are elided from this view.
# _VALID_URL: group(1) is the .flv filename from the 'current=' query param.
1717 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1718 IE_NAME = u'photobucket'
1720 def __init__(self, downloader=None):
1721 InfoExtractor.__init__(self, downloader)
1723 def report_download_webpage(self, video_id):
1724 """Report webpage download."""
1725 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1727 def report_extraction(self, video_id):
1728 """Report information extraction."""
1729 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided in this excerpt).
1731 def _real_initialize(self):
1734 def _real_extract(self, url):
1735 # Extract id from URL
1736 mobj = re.match(self._VALID_URL, url)
1738 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1741 # At this point we have a new video
1742 self._downloader.increment_downloads()
1743 video_id = mobj.group(1)
1745 video_extension = 'flv'
1747 # Retrieve video webpage to extract further information
1748 request = urllib2.Request(url)
1750 self.report_download_webpage(video_id)
1751 webpage = urllib2.urlopen(request).read()
1752 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1753 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1756 # Extract URL, uploader, and title from webpage
1757 self.report_extraction(video_id)
# The media URL sits in the 'file=' parameter of the video_src link tag.
1758 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1760 self._downloader.trouble(u'ERROR: unable to extract media URL')
1762 mediaURL = urllib.unquote(mobj.group(1))
1764 video_url = mediaURL
# The <title> carries both the title (group 1) and the uploader (group 2).
1766 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1768 self._downloader.trouble(u'ERROR: unable to extract title')
1770 video_title = mobj.group(1).decode('utf-8')
1771 video_title = sanitize_title(video_title)
1772 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1774 video_uploader = mobj.group(2).decode('utf-8')
1777 # Process video information
1778 self._downloader.process_info({
1779 'id': video_id.decode('utf-8'),
1780 'url': video_url.decode('utf-8'),
1781 'uploader': video_uploader,
1782 'upload_date': u'NA',
1783 'title': video_title,
1784 'stitle': simple_title,
1785 'ext': video_extension.decode('utf-8'),
1789 except UnavailableVideoError:
1790 self._downloader.trouble(u'\nERROR: unable to download video')
1793 class YahooIE(InfoExtractor):
1794 """Information extractor for video.yahoo.com."""
# NOTE(review): this listing embeds original source line numbers; gaps in the
# numbering mean lines ('try:'/'return'/'else:' etc.) are elided from this view.
1796 # _VALID_URL matches all Yahoo! Video URLs
1797 # _VPAGE_URL matches only the extractable '/watch/' URLs
1798 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1799 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1800 IE_NAME = u'video.yahoo'
1802 def __init__(self, downloader=None):
1803 InfoExtractor.__init__(self, downloader)
1805 def report_download_webpage(self, video_id):
1806 """Report webpage download."""
1807 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1809 def report_extraction(self, video_id):
1810 """Report information extraction."""
1811 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided in this excerpt).
1813 def _real_initialize(self):
# new_video=False marks the recursive second pass after URL rewriting below.
1816 def _real_extract(self, url, new_video=True):
1817 # Extract ID from URL
1818 mobj = re.match(self._VALID_URL, url)
1820 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1823 # At this point we have a new video
1824 self._downloader.increment_downloads()
1825 video_id = mobj.group(2)
1826 video_extension = 'flv'
1828 # Rewrite valid but non-extractable URLs as
1829 # extractable English language /watch/ URLs
1830 if re.match(self._VPAGE_URL, url) is None:
1831 request = urllib2.Request(url)
1833 webpage = urllib2.urlopen(request).read()
1834 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1835 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1838 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1840 self._downloader.trouble(u'ERROR: Unable to extract id field')
1842 yahoo_id = mobj.group(1)
1844 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1846 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1848 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL built from the page ids.
1850 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1851 return self._real_extract(url, new_video=False)
1853 # Retrieve video webpage to extract further information
1854 request = urllib2.Request(url)
1856 self.report_download_webpage(video_id)
1857 webpage = urllib2.urlopen(request).read()
1858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1859 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1862 # Extract uploader and title from webpage
1863 self.report_extraction(video_id)
1864 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1866 self._downloader.trouble(u'ERROR: unable to extract video title')
1868 video_title = mobj.group(1).decode('utf-8')
1869 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1871 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1873 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the '(people|profile)' path segment, not the
# uploader name captured by group(2) — looks like a bug; verify intended group.
1875 video_uploader = mobj.group(1).decode('utf-8')
1877 # Extract video thumbnail
1878 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1880 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1882 video_thumbnail = mobj.group(1).decode('utf-8')
1884 # Extract video description
1885 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1887 self._downloader.trouble(u'ERROR: unable to extract video description')
1889 video_description = mobj.group(1).decode('utf-8')
1890 if not video_description:
1891 video_description = 'No description available.'
1893 # Extract video height and width
1894 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1896 self._downloader.trouble(u'ERROR: unable to extract video height')
1898 yv_video_height = mobj.group(1)
1900 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1902 self._downloader.trouble(u'ERROR: unable to extract video width')
1904 yv_video_width = mobj.group(1)
1906 # Retrieve video playlist to extract media URL
1907 # I'm not completely sure what all these options are, but we
1908 # seem to need most of them, otherwise the server sends a 401.
1909 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1910 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1911 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1912 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1913 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1915 self.report_download_webpage(video_id)
1916 webpage = urllib2.urlopen(request).read()
1917 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1918 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1921 # Extract media URL from playlist XML
1922 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1924 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1926 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1927 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1930 # Process video information
1931 self._downloader.process_info({
1932 'id': video_id.decode('utf-8'),
1934 'uploader': video_uploader,
1935 'upload_date': u'NA',
1936 'title': video_title,
1937 'stitle': simple_title,
1938 'ext': video_extension.decode('utf-8'),
# NOTE(review): duplicate 'thumbnail' key in this dict literal — the later,
# undecoded entry (line 1941) silently overrides the decoded one here.
1939 'thumbnail': video_thumbnail.decode('utf-8'),
1940 'description': video_description,
1941 'thumbnail': video_thumbnail,
1944 except UnavailableVideoError:
1945 self._downloader.trouble(u'\nERROR: unable to download video')
1948 class VimeoIE(InfoExtractor):
1949 """Information extractor for vimeo.com."""
# NOTE(review): this listing embeds original source line numbers; gaps in the
# numbering mean lines ('try:'/'return'/'else:' etc.) are elided from this view.
1951 # _VALID_URL matches Vimeo URLs
1952 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1955 def __init__(self, downloader=None):
1956 InfoExtractor.__init__(self, downloader)
1958 def report_download_webpage(self, video_id):
1959 """Report webpage download."""
1960 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1962 def report_extraction(self, video_id):
1963 """Report information extraction."""
1964 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided in this excerpt).
1966 def _real_initialize(self):
1969 def _real_extract(self, url, new_video=True):
1970 # Extract ID from URL
1971 mobj = re.match(self._VALID_URL, url)
1973 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1976 # At this point we have a new video
1977 self._downloader.increment_downloads()
1978 video_id = mobj.group(1)
1980 # Retrieve video webpage to extract further information
# The moogaloop endpoint returns XML-like metadata for the clip, including
# the request signature needed to build the final play URL.
1981 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1983 self.report_download_webpage(video_id)
1984 webpage = urllib2.urlopen(request).read()
1985 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1986 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1989 # Now we begin extracting as much information as we can from what we
1990 # retrieved. First we extract the information common to all extractors,
1991 # and latter we extract those that are Vimeo specific.
1992 self.report_extraction(video_id)
1995 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1997 self._downloader.trouble(u'ERROR: unable to extract video title')
1999 video_title = mobj.group(1).decode('utf-8')
2000 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2003 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2005 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2007 video_uploader = mobj.group(1).decode('utf-8')
2009 # Extract video thumbnail
2010 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2012 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2014 video_thumbnail = mobj.group(1).decode('utf-8')
# NOTE(review): real description extraction is commented out below and the
# value is hard-coded to the placeholder 'Foo.' — likely leftover debug code.
2016 # # Extract video description
2017 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2019 # self._downloader.trouble(u'ERROR: unable to extract video description')
2021 # video_description = mobj.group(1).decode('utf-8')
2022 # if not video_description: video_description = 'No description available.'
2023 video_description = 'Foo.'
2025 # Vimeo specific: extract request signature
2026 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2028 self._downloader.trouble(u'ERROR: unable to extract request signature')
2030 sig = mobj.group(1).decode('utf-8')
2032 # Vimeo specific: Extract request signature expiration
2033 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2035 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2037 sig_exp = mobj.group(1).decode('utf-8')
2039 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2042 # Process video information
2043 self._downloader.process_info({
2044 'id': video_id.decode('utf-8'),
2046 'uploader': video_uploader,
2047 'upload_date': u'NA',
2048 'title': video_title,
2049 'stitle': simple_title,
# NOTE(review): duplicate 'thumbnail' and 'description' keys in this dict
# literal — the later entries (lines 2053-2054) silently override these.
2051 'thumbnail': video_thumbnail.decode('utf-8'),
2052 'description': video_description,
2053 'thumbnail': video_thumbnail,
2054 'description': video_description,
2057 except UnavailableVideoError:
2058 self._downloader.trouble(u'ERROR: unable to download video')
2061 class GenericIE(InfoExtractor):
2062 """Generic last-resort information extractor."""
2065 IE_NAME = u'generic'
2067 def __init__(self, downloader=None):
2068 InfoExtractor.__init__(self, downloader)
2070 def report_download_webpage(self, video_id):
2071 """Report webpage download."""
2072 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2073 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2075 def report_extraction(self, video_id):
2076 """Report information extraction."""
2077 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2079 def _real_initialize(self):
2082 def _real_extract(self, url):
2083 # At this point we have a new video
2084 self._downloader.increment_downloads()
2086 video_id = url.split('/')[-1]
2087 request = urllib2.Request(url)
2089 self.report_download_webpage(video_id)
2090 webpage = urllib2.urlopen(request).read()
2091 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2092 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2094 except ValueError, err:
2095 # since this is the last-resort InfoExtractor, if
2096 # this error is thrown, it'll be thrown here
2097 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2100 self.report_extraction(video_id)
2101 # Start with something easy: JW Player in SWFObject
2102 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2104 # Broaden the search a little bit
2105 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2107 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2110 # It's possible that one of the regexes
2111 # matched, but returned an empty group:
2112 if mobj.group(1) is None:
2113 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2116 video_url = urllib.unquote(mobj.group(1))
2117 video_id = os.path.basename(video_url)
2119 # here's a fun little line of code for you:
2120 video_extension = os.path.splitext(video_id)[1][1:]
2121 video_id = os.path.splitext(video_id)[0]
2123 # it's tempting to parse this further, but you would
2124 # have to take into account all the variations like
2125 # Video Title - Site Name
2126 # Site Name | Video Title
2127 # Video Title - Tagline | Site Name
2128 # and so on and so forth; it's just not practical
2129 mobj = re.search(r'<title>(.*)</title>', webpage)
2131 self._downloader.trouble(u'ERROR: unable to extract title')
2133 video_title = mobj.group(1).decode('utf-8')
2134 video_title = sanitize_title(video_title)
2135 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2137 # video uploader is domain name
2138 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2140 self._downloader.trouble(u'ERROR: unable to extract title')
2142 video_uploader = mobj.group(1).decode('utf-8')
2145 # Process video information
2146 self._downloader.process_info({
2147 'id': video_id.decode('utf-8'),
2148 'url': video_url.decode('utf-8'),
2149 'uploader': video_uploader,
2150 'upload_date': u'NA',
2151 'title': video_title,
2152 'stitle': simple_title,
2153 'ext': video_extension.decode('utf-8'),
2157 except UnavailableVideoError, err:
2158 self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries ("ytsearchN:..." pseudo-URLs)."""
    # NOTE(review): this copy of the file appears to have lines elided
    # (missing "if mobj is None:", "try:", "return", loop headers, etc.);
    # the statements below are reproduced as found, comments only added.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard cap on how many results a single search may request.
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to the plain YouTube IE.
        self._youtube_ie = youtube_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix (number or 'all') and dispatch."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            # Clamp oversized requests to the service maximum.
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Pull the v= value out of the matched href attribute.
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No "Next" link: this was the last results page.
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # NOTE(review): this copy of the file appears to have lines elided
    # (missing guards, "try:", "return", loop headers); statements are
    # reproduced as found, comments only added.
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    # Hard cap on how many results a single search may request.
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to the Google Video IE.
        self._google_ie = google_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the gvsearch prefix (number or 'all') and dispatch."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # Clamp oversized requests to the service maximum.
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No "Next" link: this was the last results page.
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    # NOTE(review): this copy of the file appears to have lines elided
    # (missing guards, "try:", "return", loop headers); statements are
    # reproduced as found, comments only added.
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    # Hard cap on how many results a single search may request.
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to the Yahoo Video IE.
        self._yahoo_ie = yahoo_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the yvsearch prefix (number or 'all') and dispatch."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # Clamp oversized requests to the service maximum.
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No "Next" link: this was the last results page.
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        pagenum = pagenum + 1
2428 class YoutubePlaylistIE(InfoExtractor):
2429 """Information Extractor for YouTube playlists."""
2431 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2432 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2433 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2434 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2436 IE_NAME = u'youtube:playlist'
2438 def __init__(self, youtube_ie, downloader=None):
2439 InfoExtractor.__init__(self, downloader)
2440 self._youtube_ie = youtube_ie
2442 def report_download_page(self, playlist_id, pagenum):
2443 """Report attempt to download playlist page with given number."""
2444 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2446 def _real_initialize(self):
2447 self._youtube_ie.initialize()
2449 def _real_extract(self, url):
2450 # Extract playlist id
2451 mobj = re.match(self._VALID_URL, url)
2453 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2457 if mobj.group(3) is not None:
2458 self._youtube_ie.extract(mobj.group(3))
2461 # Download playlist pages
2462 # prefix is 'p' as default for playlists but there are other types that need extra care
2463 playlist_prefix = mobj.group(1)
2464 if playlist_prefix == 'a':
2465 playlist_access = 'artist'
2467 playlist_prefix = 'p'
2468 playlist_access = 'view_play_list'
2469 playlist_id = mobj.group(2)
2474 self.report_download_page(playlist_id, pagenum)
2475 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2477 page = urllib2.urlopen(request).read()
2478 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2479 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2482 # Extract video identifiers
2484 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2485 if mobj.group(1) not in ids_in_page:
2486 ids_in_page.append(mobj.group(1))
2487 video_ids.extend(ids_in_page)
2489 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2491 pagenum = pagenum + 1
2493 playliststart = self._downloader.params.get('playliststart', 1) - 1
2494 playlistend = self._downloader.params.get('playlistend', -1)
2495 video_ids = video_ids[playliststart:playlistend]
2497 for id in video_ids:
2498 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""
    # NOTE(review): this copy of the file appears to have control-flow
    # lines elided (loop headers, guards, "break"/"return", "else:");
    # statements are reproduced as found, comments only added.

    _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Max results per GData query; drives the paging loop below.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Actual per-video extraction is delegated to the plain YouTube IE.
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all upload ids for a user, then hand them to the YouTube IE."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)
        request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        # playlistend == -1 means "no upper bound", so it must not be used
        # directly as a slice end (that would drop the last id).
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""
    # NOTE(review): this copy of the file appears to have lines elided
    # ("try:", guards, "return"s, method bodies); statements are
    # reproduced as found, comments only added.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # The site explains why the download is blocked; surface it.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""
    # NOTE(review): this copy of the file appears to have many lines elided
    # ("try:", guards, "return"s, dict/literal continuations); statements
    # are reproduced as found, comments only added.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; used both for format selection and URL scraping.
    _available_formats = ['highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General fields: one regex per piece of metadata we want.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values sit in a JS string literal: undo the unicode
                # escaping first, then the URL quoting.
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Per-format video URLs.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook using option-supplied or .netrc credentials."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are non-fatal: continue without login.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        if useremail is None:
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # A login <form> in the response means authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = urllib2.urlopen(request)
        video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Build a filesystem-safe variant of the title.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image (missing thumbnail is only a warning)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        # upload date
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                # Restrict to formats at or below the requested quality cap.
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""
    # NOTE(review): this copy of the file appears to have lines elided
    # ("try:", guards, "return"s, "info = {" literal opener); statements
    # are reproduced as found, comments only added.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

    def _simplify_title(self, title):
        """Collapse filename-unsafe characters in *title* into underscores."""
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask blip.tv for machine-readable metadata via its JSON skin.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
        json_code = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

        json_data = json.loads(json_code)
        if 'Post' in json_data:
            data = json_data['Post']
        # NOTE(review): '%H:%M%p' mixes 24-hour %H with the AM/PM marker %p,
        # so %p is effectively ignored -- %I was probably intended.  Verify
        # against real blip.tv 'datestamp' values before changing.
        upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
        video_url = data['media']['url']
        umobj = re.match(self._URL_EXT, video_url)
        raise ValueError('Can not determine filename extension')
        ext = umobj.group(1)

        self._downloader.increment_downloads()
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'stitle': self._simplify_title(data['title']),
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl']
        except (ValueError,KeyError), err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
2958 class MyVideoIE(InfoExtractor):
2959 """Information Extractor for myvideo.de."""
2961 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2962 IE_NAME = u'myvideo'
2964 def __init__(self, downloader=None):
2965 InfoExtractor.__init__(self, downloader)
2967 def report_download_webpage(self, video_id):
2968 """Report webpage download."""
2969 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2971 def report_extraction(self, video_id):
2972 """Report information extraction."""
2973 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2975 def _real_initialize(self):
2978 def _real_extract(self,url):
2979 mobj = re.match(self._VALID_URL, url)
2981 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2984 video_id = mobj.group(1)
2985 simple_title = mobj.group(2).decode('utf-8')
2986 # should actually not be necessary
2987 simple_title = sanitize_title(simple_title)
2988 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2991 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2993 self.report_download_webpage(video_id)
2994 webpage = urllib2.urlopen(request).read()
2995 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2996 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2999 self.report_extraction(video_id)
3000 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3003 self._downloader.trouble(u'ERROR: unable to extract media URL')
3005 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3007 mobj = re.search('<title>([^<]+)</title>', webpage)
3009 self._downloader.trouble(u'ERROR: unable to extract title')
3012 video_title = mobj.group(1)
3013 video_title = sanitize_title(video_title)
3017 self._downloader.process_info({
3021 'upload_date': u'NA',
3022 'title': video_title,
3023 'stitle': simple_title,
3028 except UnavailableVideoError:
3029 self._downloader.trouble(u'\nERROR: Unable to download video')
3031 class ComedyCentralIE(InfoExtractor):
3032 """Information extractor for The Daily Show and Colbert Report """
3034 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3035 IE_NAME = u'comedycentral'
3037 def report_extraction(self, episode_id):
3038 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3040 def report_config_download(self, episode_id):
3041 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3043 def report_index_download(self, episode_id):
3044 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3046 def report_player_url(self, episode_id):
3047 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
    def _simplify_title(self, title):
        """Collapse filename-unsafe characters in *title* into underscores."""
        # NOTE(review): the trailing "return res" appears to be elided in
        # this copy of the file.
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')
3054 def _real_extract(self, url):
3055 mobj = re.match(self._VALID_URL, url)
3057 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3060 if mobj.group('shortname'):
3061 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3062 url = 'http://www.thedailyshow.com/full-episodes/'
3064 url = 'http://www.colbertnation.com/full-episodes/'
3065 mobj = re.match(self._VALID_URL, url)
3066 assert mobj is not None
3068 dlNewest = not mobj.group('episode')
3070 epTitle = mobj.group('showname')
3072 epTitle = mobj.group('episode')
3074 req = urllib2.Request(url)
3075 self.report_extraction(epTitle)
3077 htmlHandle = urllib2.urlopen(req)
3078 html = htmlHandle.read()
3079 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3080 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3083 url = htmlHandle.geturl()
3084 mobj = re.match(self._VALID_URL, url)
3086 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3088 if mobj.group('episode') == '':
3089 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3091 epTitle = mobj.group('episode')
3093 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3094 if len(mMovieParams) == 0:
3095 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3098 playerUrl_raw = mMovieParams[0][0]
3099 self.report_player_url(epTitle)
3101 urlHandle = urllib2.urlopen(playerUrl_raw)
3102 playerUrl = urlHandle.geturl()
3103 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3104 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3107 uri = mMovieParams[0][1]
3108 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3109 self.report_index_download(epTitle)
3111 indexXml = urllib2.urlopen(indexUrl).read()
3112 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3113 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3116 idoc = xml.etree.ElementTree.fromstring(indexXml)
3117 itemEls = idoc.findall('.//item')
3118 for itemEl in itemEls:
3119 mediaId = itemEl.findall('./guid')[0].text
3120 shortMediaId = mediaId.split(':')[-1]
3121 showId = mediaId.split(':')[-2].replace('.com', '')
3122 officialTitle = itemEl.findall('./title')[0].text
3123 officialDate = itemEl.findall('./pubDate')[0].text
3125 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3126 urllib.urlencode({'uri': mediaId}))
3127 configReq = urllib2.Request(configUrl)
3128 self.report_config_download(epTitle)
3130 configXml = urllib2.urlopen(configReq).read()
3131 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3132 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3135 cdoc = xml.etree.ElementTree.fromstring(configXml)
3137 for rendition in cdoc.findall('.//rendition'):
3138 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3142 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3145 # For now, just pick the highest bitrate
3146 format,video_url = turls[-1]
3148 self._downloader.increment_downloads()
3150 effTitle = showId + '-' + epTitle
3155 'upload_date': officialDate,
3157 'stitle': self._simplify_title(effTitle),
3161 'description': officialTitle,
3162 'player_url': playerUrl
3166 self._downloader.process_info(info)
3167 except UnavailableVideoError, err:
3168 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Extractor for escapistmagazine.com video pages. Scrapes the page's
# <meta> tags for description/thumbnail/player URL, then fetches the player
# configuration to find the actual video URL.
# NOTE(review): this listing is sparse -- some original lines (blank lines,
# 'if mobj is None:'-style guards, returns) are missing between the numbered
# lines; confirm control flow against the upstream file.
3172 class EscapistIE(InfoExtractor):
3173 """Information extractor for The Escapist """
3175 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3176 IE_NAME = u'escapist'
# --- status-reporting helpers ---
3178 def report_extraction(self, showName):
3179 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3181 def report_config_download(self, showName):
3182 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Same simplification rule as ComedyCentralIE._simplify_title: non-simple
# characters collapse to '_', underscores trimmed at both ends.
3184 def _simplify_title(self, title):
3185 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3186 res = res.strip(ur'_')
3189 def _real_extract(self, url):
# HTMLParser instance is used only for its unescape() of HTML entities in
# the scraped <meta> attribute values.
3190 htmlParser = HTMLParser.HTMLParser()
3192 mobj = re.match(self._VALID_URL, url)
3194 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3196 showName = mobj.group('showname')
3197 videoId = mobj.group('episode')
3199 self.report_extraction(showName)
3201 webPage = urllib2.urlopen(url).read()
3202 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3203 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape metadata out of the page's meta tags (description, og:image,
# og:video). NOTE(review): the .group(1) calls assume every regex matched;
# a page without these tags would raise AttributeError here.
3206 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3207 description = htmlParser.unescape(descMatch.group(1))
3208 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3209 imgUrl = htmlParser.unescape(imgMatch.group(1))
3210 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3211 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL embeds its config location as a percent-encoded
# 'config=' query parameter.
3212 configUrlMatch = re.search('config=(.*)$', playerUrl)
3213 configUrl = urllib2.unquote(configUrlMatch.group(1))
3215 self.report_config_download(showName)
3217 configJSON = urllib2.urlopen(configUrl).read()
3218 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3219 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3222 # Technically, it's JavaScript, not JSON
# Crude quote swap to coerce the JS object literal into parseable JSON;
# breaks if the payload contains apostrophes inside string values.
3223 configJSON = configJSON.replace("'", '"')
3226 config = json.loads(configJSON)
3227 except (ValueError,), err:
3228 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The actual media URL is the second playlist entry.
# NOTE(review): index 1 is hard-coded -- presumably entry 0 is an ad or
# intro clip; verify against a live config payload.
3231 playlist = config['playlist']
3232 videoUrl = playlist[1]['url']
3234 self._downloader.increment_downloads()
# Build the info dict and hand it off; UnavailableVideoError is reported
# via trouble() rather than propagated.
3238 'uploader': showName,
3239 'upload_date': None,
3241 'stitle': self._simplify_title(showName),
3244 'thumbnail': imgUrl,
3245 'description': description,
3246 'player_url': playerUrl,
3250 self._downloader.process_info(info)
3251 except UnavailableVideoError, err:
3252 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# Base class for post-download processing steps (see FFmpegExtractAudioPP
# below for a concrete subclass). Mirrors the "mutual registration" pattern
# used by InfoExtractor: the downloader calls set_downloader() when the PP
# is added, and later invokes run() on each registered PP in turn.
# NOTE(review): a few docstring/blank lines are missing from this listing.
3256 class PostProcessor(object):
3257 """Post Processor class.
3259 PostProcessor objects can be added to downloaders with their
3260 add_post_processor() method. When the downloader has finished a
3261 successful download, it will take its internal chain of PostProcessors
3262 and start calling the run() method on each one of them, first with
3263 an initial argument and then with the returned value of the previous
3266 The chain will be stopped if one of them ever returns None or the end
3267 of the chain is reached.
3269 PostProcessor objects follow a "mutual registration" process similar
3270 to InfoExtractor objects.
# _downloader: the FileDownloader this PP is attached to (set either at
# construction time or later via set_downloader()).
3275 def __init__(self, downloader=None):
3276 self._downloader = downloader
3278 def set_downloader(self, downloader):
3279 """Sets the downloader for this PP."""
3280 self._downloader = downloader
3282 def run(self, information):
3283 """Run the PostProcessor.
3285 The "information" argument is a dictionary like the ones
3286 composed by InfoExtractors. The only difference is that this
3287 one has an extra field called "filepath" that points to the
3290 When this method returns None, the postprocessing chain is
3291 stopped. However, this method may return an information
3292 dictionary that will be passed to the next postprocessing
3293 object in the chain. It can be the one it received after
3294 changing some fields.
3296 In addition, this method may raise a PostProcessingError
3297 exception that will be taken into account by the downloader
# Default behavior: pass the info dict through unchanged so the chain
# continues.
3300 return information # by default, do nothing
# Post-processor that extracts the audio track from a downloaded video
# using the external ffprobe/ffmpeg binaries. Optionally keeps or deletes
# the source video afterwards.
# NOTE(review): this listing is sparse -- decorator lines (get_audio_codec /
# run_ffmpeg are presumably @staticmethod: they take no self), several
# 'return' lines, and 'try:' lines are missing; confirm against upstream.
3303 class FFmpegExtractAudioPP(PostProcessor):
3305 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
# preferredcodec: 'best' (copy when source is aac/mp3), 'aac' or 'mp3';
# preferredquality: ffmpeg '-ab' bitrate string (e.g. '128K');
# keepvideo: when False the source video is removed after extraction.
3306 PostProcessor.__init__(self, downloader)
3307 if preferredcodec is None:
3308 preferredcodec = 'best'
3309 self._preferredcodec = preferredcodec
3310 self._preferredquality = preferredquality
3311 self._keepvideo = keepvideo
# Probe the file with ffprobe and return the audio stream's codec_name,
# or (on missing lines) a failure indicator when ffprobe is absent/fails.
3314 def get_audio_codec(path):
3316 cmd = ['ffprobe', '-show_streams', '--', path]
3317 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3318 output = handle.communicate()[0]
3319 if handle.wait() != 0:
3321 except (IOError, OSError):
# ffprobe prints key=value lines per stream; remember the last codec_name
# seen and accept it once a 'codec_type=audio' line confirms it belongs
# to an audio stream.
3324 for line in output.split('\n'):
3325 if line.startswith('codec_name='):
3326 audio_codec = line.split('=')[1].strip()
3327 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to transcode/remux 'path' into 'out_path' with the given audio
# codec and extra options; video is dropped ('-vn').
3332 def run_ffmpeg(path, out_path, codec, more_opts):
3334 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3335 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3337 except (IOError, OSError):
3340 def run(self, information):
3341 path = information['filepath']
3343 filecodec = self.get_audio_codec(path)
3344 if filecodec is None:
3345 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Choose target codec/extension. When the source already is aac/mp3 and
# matches the preference (or preference is 'best'), copy losslessly;
# otherwise transcode with libmp3lame/aac.
3349 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3350 if filecodec == 'aac' or filecodec == 'mp3':
3351 # Lossless if possible
3353 extension = filecodec
3354 if filecodec == 'aac':
3355 more_opts = ['-f', 'adts']  # raw AAC needs the ADTS container format
3358 acodec = 'libmp3lame'
3361 if self._preferredquality is not None:
3362 more_opts += ['-ab', self._preferredquality]
3364 # We convert the audio (lossy)
3365 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3366 extension = self._preferredcodec
3368 if self._preferredquality is not None:
3369 more_opts += ['-ab', self._preferredquality]
3370 if self._preferredcodec == 'aac':
3371 more_opts += ['-f', 'adts']
# Output path: same prefix as the video, new audio extension.
3373 (prefix, ext) = os.path.splitext(path)
3374 new_path = prefix + '.' + extension
3375 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3376 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3379 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3382 # Try to update the date time for extracted audio file.
3383 if information.get('filetime') is not None:
3385 os.utime(new_path, (time.time(), information['filetime']))
3387 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Remove the source video unless the user asked to keep it; removal
# failure is a warning, not an error.
3389 if not self._keepvideo:
3392 except (IOError, OSError):
3393 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Point the info dict at the extracted audio so later PPs see the new file.
3396 information['filepath'] = new_path
# Self-update: overwrite this script file in place with the latest version
# fetched from UPDATE_URL, after checking write permission and comparing
# the remote __version__ string.
# NOTE(review): 'try:' lines and sys.exit()-on-success lines are missing
# from this listing; confirm flow against the upstream file.
3400 def updateSelf(downloader, filename):
3401 ''' Update the program file with the latest version from the repository '''
3402 # Note: downloader only used for options
3403 if not os.access(filename, os.W_OK):
3404 sys.exit('ERROR: no write permissions on %s' % filename)
3406 downloader.to_screen('Updating to latest version...')
3410 urlh = urllib.urlopen(UPDATE_URL)
3411 newcontent = urlh.read()
# Parse the remote __version__ assignment; if it equals ours, report
# up-to-date instead of rewriting the file.
3413 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3414 if vmatch is not None and vmatch.group(1) == __version__:
3415 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3419 except (IOError, OSError), err:
3420 sys.exit('ERROR: unable to download latest version')
# Write the new content in binary mode so line endings are preserved
# verbatim. NOTE(review): outf.close() is on a line missing from this
# listing -- presumably in a finally: block.
3423 outf = open(filename, 'wb')
3425 outf.write(newcontent)
3428 except (IOError, OSError), err:
3429 sys.exit('ERROR: unable to overwrite current version')
3431 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
# Custom optparse option formatter: renders an option's invocation column as
# "-s, --long METAVAR" (short first, long second, metavar only for
# value-taking options). Nested inside parseOpts (def line not in this view).
# NOTE(review): the initialization of 'opts' (presumably 'opts = []') is on
# a line missing from this listing.
3438 def _format_option_string(option):
3439 ''' ('-o', '--option') -> -o, --format METAVAR'''
3443 if option._short_opts: opts.append(option._short_opts[0])
3444 if option._long_opts: opts.append(option._long_opts[0])
# Insert the ', ' separator only when both a short and a long form exist.
3445 if len(opts) > 1: opts.insert(1, ', ')
3447 if option.takes_value(): opts.append(' %s' % option.metavar)
3449 return "".join(opts)
# Best-effort detection of the terminal width: honor the COLUMNS environment
# variable first, then fall back to parsing `stty size` output
# ("rows cols" -> second field). Nested inside parseOpts.
# NOTE(review): the COLUMNS return path, the 'try:' around the subprocess
# call, and the failure fallback are on lines missing from this listing.
3451 def _find_term_columns():
3452 columns = os.environ.get('COLUMNS', None)
3457 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3458 out,err = sp.communicate()
3459 return int(out.split()[1])
# Body of parseOpts (the enclosing 'def parseOpts():' line is outside this
# view): builds the optparse parser, declares all option groups and options,
# parses sys.argv, and returns (parser, opts, args).
3465 max_help_position = 80
3467 # No need to wrap help messages if we're on a wide console
3468 columns = _find_term_columns()
3469 if columns: max_width = columns
# Use the custom invocation formatter defined above so help output shows
# "-s, --long METAVAR" style columns.
3471 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3472 fmt.format_option_strings = _format_option_string
# Keyword arguments for OptionParser; 'resolve' lets later options silently
# override earlier conflicting ones (needed because -v is redefined below).
3475 'version' : __version__,
3477 'usage' : '%prog [options] url [url...]',
3478 'conflict_handler' : 'resolve',
3481 parser = optparse.OptionParser(**kw)
# One OptionGroup per help-output section.
3484 general = optparse.OptionGroup(parser, 'General Options')
3485 selection = optparse.OptionGroup(parser, 'Video Selection')
3486 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3487 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3488 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3489 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3490 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3492 general.add_option('-h', '--help',
3493 action='help', help='print this help text and exit')
3494 general.add_option('-v', '--version',
3495 action='version', help='print program version and exit')
3496 general.add_option('-U', '--update',
3497 action='store_true', dest='update_self', help='update this program to latest version')
3498 general.add_option('-i', '--ignore-errors',
3499 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3500 general.add_option('-r', '--rate-limit',
3501 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3502 general.add_option('-R', '--retries',
3503 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3504 general.add_option('--dump-user-agent',
3505 action='store_true', dest='dump_user_agent',
3506 help='display the current browser identification', default=False)
3507 general.add_option('--list-extractors',
3508 action='store_true', dest='list_extractors',
3509 help='List all supported extractors and the URLs they would handle', default=False)
3511 selection.add_option('--playlist-start',
3512 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3513 selection.add_option('--playlist-end',
3514 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3515 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3516 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3518 authentication.add_option('-u', '--username',
3519 dest='username', metavar='USERNAME', help='account username')
3520 authentication.add_option('-p', '--password',
3521 dest='password', metavar='PASSWORD', help='account password')
3522 authentication.add_option('-n', '--netrc',
3523 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3526 video_format.add_option('-f', '--format',
3527 action='store', dest='format', metavar='FORMAT', help='video format code')
3528 video_format.add_option('--all-formats',
3529 action='store_const', dest='format', help='download all available video formats', const='all')
3530 video_format.add_option('--max-quality',
3531 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3534 verbosity.add_option('-q', '--quiet',
3535 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3536 verbosity.add_option('-s', '--simulate',
3537 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3538 verbosity.add_option('--skip-download',
3539 action='store_true', dest='skip_download', help='do not download the video', default=False)
3540 verbosity.add_option('-g', '--get-url',
3541 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3542 verbosity.add_option('-e', '--get-title',
3543 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3544 verbosity.add_option('--get-thumbnail',
3545 action='store_true', dest='getthumbnail',
3546 help='simulate, quiet but print thumbnail URL', default=False)
3547 verbosity.add_option('--get-description',
3548 action='store_true', dest='getdescription',
3549 help='simulate, quiet but print video description', default=False)
3550 verbosity.add_option('--get-filename',
3551 action='store_true', dest='getfilename',
3552 help='simulate, quiet but print output filename', default=False)
3553 verbosity.add_option('--get-format',
3554 action='store_true', dest='getformat',
3555 help='simulate, quiet but print output format', default=False)
3556 verbosity.add_option('--no-progress',
3557 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3558 verbosity.add_option('--console-title',
3559 action='store_true', dest='consoletitle',
3560 help='display progress in console titlebar', default=False)
3563 filesystem.add_option('-t', '--title',
3564 action='store_true', dest='usetitle', help='use title in file name', default=False)
3565 filesystem.add_option('-l', '--literal',
3566 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3567 filesystem.add_option('-A', '--auto-number',
3568 action='store_true', dest='autonumber',
3569 help='number downloaded files starting from 00000', default=False)
3570 filesystem.add_option('-o', '--output',
3571 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3572 filesystem.add_option('-a', '--batch-file',
3573 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3574 filesystem.add_option('-w', '--no-overwrites',
3575 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3576 filesystem.add_option('-c', '--continue',
3577 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3578 filesystem.add_option('--no-continue',
3579 action='store_false', dest='continue_dl',
3580 help='do not resume partially downloaded files (restart from beginning)')
3581 filesystem.add_option('--cookies',
3582 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3583 filesystem.add_option('--no-part',
3584 action='store_true', dest='nopart', help='do not use .part files', default=False)
3585 filesystem.add_option('--no-mtime',
3586 action='store_false', dest='updatetime',
3587 help='do not use the Last-modified header to set the file modification time', default=True)
3588 filesystem.add_option('--write-description',
3589 action='store_true', dest='writedescription',
3590 help='write video description to a .description file', default=False)
3591 filesystem.add_option('--write-info-json',
3592 action='store_true', dest='writeinfojson',
3593 help='write video metadata to a .info.json file', default=False)
3596 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3597 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3598 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3599 help='"best", "aac" or "mp3"; best by default')
3600 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3601 help='ffmpeg audio bitrate specification, 128k by default')
3602 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3603 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Registration order fixes the order of sections in --help output.
3606 parser.add_option_group(general)
3607 parser.add_option_group(selection)
3608 parser.add_option_group(filesystem)
3609 parser.add_option_group(verbosity)
3610 parser.add_option_group(video_format)
3611 parser.add_option_group(authentication)
3612 parser.add_option_group(postproc)
3614 opts, args = parser.parse_args()
3616 return parser, opts, args
# Factory for the ordered list of info extractors; the first extractor whose
# suitable() accepts a URL handles it, so more specific extractors must come
# before generic ones. Some extractors take another as a fallback delegate
# (e.g. MetacafeIE(youtube_ie)).
# NOTE(review): this listing is sparse -- several list entries and the
# closing of the returned list are on lines missing from this view.
3618 def gen_extractors():
3619 """ Return a list of an instance of every supported extractor.
3620 The order does matter; the first extractor matched is the one handling the URL.
# Shared instances passed into the dependent playlist/user/search extractors
# below.
3622 youtube_ie = YoutubeIE()
3623 google_ie = GoogleIE()
3624 yahoo_ie = YahooIE()
3627 MetacafeIE(youtube_ie),
3629 YoutubePlaylistIE(youtube_ie),
3630 YoutubeUserIE(youtube_ie),
3631 YoutubeSearchIE(youtube_ie),
3633 GoogleSearchIE(google_ie),
3636 YahooSearchIE(yahoo_ie),
# Body of the main entry routine (its 'def' line is outside this view):
# parses options, validates them, builds the FileDownloader with its
# extractors and post-processors, and runs the downloads.
3649 parser, opts, args = parseOpts()
3651 # Open appropriate CookieJar
# Without --cookies use an in-memory jar; with it, a Mozilla-format jar that
# is loaded up-front if the file already exists and is readable.
3652 if opts.cookiefile is None:
3653 jar = cookielib.CookieJar()
3656 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3657 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3659 except (IOError, OSError), err:
3660 sys.exit(u'ERROR: unable to open cookie file')
3663 if opts.dump_user_agent:
3664 print std_headers['User-Agent']
3667 # Batch file verification
# URLs come from --batch-file ('-' means stdin) plus positional args; blank
# lines and lines starting with '#', '/' or ';' are treated as comments.
3669 if opts.batchfile is not None:
3671 if opts.batchfile == '-':
3674 batchfd = open(opts.batchfile, 'r')
3675 batchurls = batchfd.readlines()
3676 batchurls = [x.strip() for x in batchurls]
3677 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3679 sys.exit(u'ERROR: batch file could not be read')
3680 all_urls = batchurls + args
3682 # General configuration
# Install a global urllib2 opener with proxy support, the cookie jar, and
# the project's gzip/deflate-aware YoutubeDLHandler.
3683 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3684 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3685 urllib2.install_opener(opener)
3686 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3688 extractors = gen_extractors()
# --list-extractors: print each IE and which of the given URLs it would
# claim (each URL is attributed to the first matching extractor only).
3690 if opts.list_extractors:
3691 for ie in extractors:
3693 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3694 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3695 for mu in matchedUrls:
3699 # Conflicting, missing and erroneous options
3700 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3701 parser.error(u'using .netrc conflicts with giving username/password')
3702 if opts.password is not None and opts.username is None:
3703 parser.error(u'account username missing')
3704 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3705 parser.error(u'using output template conflicts with using title, literal title or auto number')
3706 if opts.usetitle and opts.useliteral:
3707 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively instead of failing.
3708 if opts.username is not None and opts.password is None:
3709 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize numeric option strings; parser.error() exits on bad values.
3710 if opts.ratelimit is not None:
3711 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3712 if numeric_limit is None:
3713 parser.error(u'invalid rate limit specified')
3714 opts.ratelimit = numeric_limit
3715 if opts.retries is not None:
3717 opts.retries = long(opts.retries)
3718 except (TypeError, ValueError), err:
3719 parser.error(u'invalid retry count specified')
3721 opts.playliststart = int(opts.playliststart)
3722 if opts.playliststart <= 0:
3723 raise ValueError(u'Playlist start must be positive')
3724 except (TypeError, ValueError), err:
3725 parser.error(u'invalid playlist start number specified')
# playlistend of -1 means "until the last video"; otherwise it must be a
# positive bound not below playliststart.
3727 opts.playlistend = int(opts.playlistend)
3728 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3729 raise ValueError(u'Playlist end must be greater than playlist start')
3730 except (TypeError, ValueError), err:
3731 parser.error(u'invalid playlist end number specified')
3732 if opts.extractaudio:
3733 if opts.audioformat not in ['best', 'aac', 'mp3']:
3734 parser.error(u'invalid audio format specified')
# Assemble the downloader. Any --get-* option implies quiet + skip_download
# (pure simulation).
3737 fd = FileDownloader({
3738 'usenetrc': opts.usenetrc,
3739 'username': opts.username,
3740 'password': opts.password,
3741 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3742 'forceurl': opts.geturl,
3743 'forcetitle': opts.gettitle,
3744 'forcethumbnail': opts.getthumbnail,
3745 'forcedescription': opts.getdescription,
3746 'forcefilename': opts.getfilename,
3747 'forceformat': opts.getformat,
3748 'simulate': opts.simulate,
3749 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3750 'format': opts.format,
3751 'format_limit': opts.format_limit,
# Output template: explicit -o wins; otherwise pick a default pattern based
# on the -t/-l/-A/--format flags; the final fallback is id.ext.
3752 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3753 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3754 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3755 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3756 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3757 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3758 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3759 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3760 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3761 or u'%(id)s.%(ext)s'),
3762 'ignoreerrors': opts.ignoreerrors,
3763 'ratelimit': opts.ratelimit,
3764 'nooverwrites': opts.nooverwrites,
3765 'retries': opts.retries,
3766 'continuedl': opts.continue_dl,
3767 'noprogress': opts.noprogress,
3768 'playliststart': opts.playliststart,
3769 'playlistend': opts.playlistend,
# '-o -' streams the video to stdout, so logging must go to stderr.
3770 'logtostderr': opts.outtmpl == '-',
3771 'consoletitle': opts.consoletitle,
3772 'nopart': opts.nopart,
3773 'updatetime': opts.updatetime,
3774 'writedescription': opts.writedescription,
3775 'writeinfojson': opts.writeinfojson,
3776 'matchtitle': opts.matchtitle,
3777 'rejecttitle': opts.rejecttitle,
3779 for extractor in extractors:
3780 fd.add_info_extractor(extractor)
# PostProcessors (only --extract-audio for now).
3783 if opts.extractaudio:
3784 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3787 if opts.update_self:
3788 updateSelf(fd, sys.argv[0])
# URLs are only mandatory when not doing a self-update.
3791 if len(all_urls) < 1:
3792 if not opts.update_self:
3793 parser.error(u'you must provide at least one URL')
3796 retcode = fd.download(all_urls)
3798 # Dump cookie jar if requested
3799 if opts.cookiefile is not None:
3802 except (IOError, OSError), err:
3803 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run the main routine and translate the project's
# top-level exceptions into clean exit messages. NOTE(review): the main()
# call inside the guard and the DownloadError handler body are on lines
# missing from this listing.
3808 if __name__ == '__main__':
3811 except DownloadError:
3813 except SameFileError:
3814 sys.exit(u'ERROR: fixed output name but more than one file to download')
3815 except KeyboardInterrupt:
3816 sys.exit(u'\nERROR: Interrupted by user')
3818 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: