2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.15'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
195 def preferredencoding():
196 """Get preferred encoding.
198 Returns the best encoding scheme for the system, based on
199 locale.getpreferredencoding() and some further tweaks.
201 def yield_preferredencoding():
203 pref = locale.getpreferredencoding()
209 return yield_preferredencoding().next()
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
272 def timeconvert(timestr):
273 """Convert RFC 2822 defined time string into system timestamp"""
275 timetuple = email.utils.parsedate_tz(timestr)
276 if timetuple is not None:
277 timestamp = email.utils.mktime_tz(timetuple)
281 class DownloadError(Exception):
282 """Download Error exception.
284 This exception may be thrown by FileDownloader objects if they are not
285 configured to continue on errors. They will contain the appropriate
291 class SameFileError(Exception):
292 """Same File exception.
294 This exception will be thrown by FileDownloader objects if they detect
295 multiple files would have to be downloaded to the same file on disk.
300 class PostProcessingError(Exception):
301 """Post Processing exception.
303 This exception may be raised by PostProcessor's .run() method to
304 indicate an error in the postprocessing task.
309 class UnavailableVideoError(Exception):
310 """Unavailable Format exception.
312 This exception will be thrown when a video is requested
313 in a format that is not available for that video.
318 class ContentTooShortError(Exception):
319 """Content Too Short exception.
321 This exception may be raised by FileDownloader objects when a file they
322 download is too small for what the server announced first, indicating
323 the connection was probably interrupted.
329 def __init__(self, downloaded, expected):
330 self.downloaded = downloaded
331 self.expected = expected
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
468 def format_bytes(bytes):
471 if type(bytes) is str:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488 def calc_eta(start, now, total, current):
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
552 def to_stderr(self, message):
553 """Print message to stderr."""
554 print >>sys.stderr, message.encode(preferredencoding())
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
571 def trouble(self, message=None):
572 """Determine action to take when a download problem appears.
574 Depending on if the downloader has been configured to ignore
575 download errors or not, this method may throw an exception or
576 not when errors are found, after printing the message.
578 if message is not None:
579 self.to_stderr(message)
580 if not self.params.get('ignoreerrors', False):
581 raise DownloadError(message)
582 self._download_retcode = 1
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
604 def undo_temp_name(self, filename):
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
609 def try_rename(self, old_filename, new_filename):
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
630 os.utime(filename, (time.time(), filetime))
635 def report_writedescription(self, descfn):
636 """ Report that the description file is being written """
637 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
639 def report_writeinfojson(self, infofn):
640 """ Report that the metadata file has been written """
641 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
643 def report_destination(self, filename):
644 """Report destination filename."""
645 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
647 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648 """Report download progress."""
649 if self.params.get('noprogress', False):
651 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
656 def report_resuming_byte(self, resume_len):
657 """Report attempt to resume at given byte."""
658 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
660 def report_retry(self, count, retries):
661 """Report retry in case of HTTP error 5xx"""
662 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
664 def report_file_already_downloaded(self, file_name):
665 """Report file has already been fully downloaded."""
667 self.to_screen(u'[download] %s has already been downloaded' % file_name)
668 except (UnicodeEncodeError), err:
669 self.to_screen(u'[download] The file has already been downloaded')
671 def report_unable_to_resume(self):
672 """Report it was impossible to resume download."""
673 self.to_screen(u'[download] Unable to resume')
675 def report_finish(self):
676 """Report download finished."""
677 if self.params.get('noprogress', False):
678 self.to_screen(u'[download] Download completed')
682 def increment_downloads(self):
683 """Increment the ordinal that assigns a number to each file."""
684 self._num_downloads += 1
686 def prepare_filename(self, info_dict):
687 """Generate the output filename."""
689 template_dict = dict(info_dict)
690 template_dict['epoch'] = unicode(long(time.time()))
691 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692 filename = self.params['outtmpl'] % template_dict
694 except (ValueError, KeyError), err:
695 self.trouble(u'ERROR: invalid system charset or erroneous output template')
698 def process_info(self, info_dict):
699 """Process a single dictionary returned by an InfoExtractor."""
700 filename = self.prepare_filename(info_dict)
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
714 # Do nothing else if in simulate mode
715 if self.params.get('simulate', False):
721 matchtitle=self.params.get('matchtitle',False)
722 rejecttitle=self.params.get('rejecttitle',False)
723 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
724 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
725 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
727 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
728 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
731 if self.params.get('nooverwrites', False) and os.path.exists(filename):
732 self.to_stderr(u'WARNING: file exists and will be skipped')
736 dn = os.path.dirname(filename)
737 if dn != '' and not os.path.exists(dn):
739 except (OSError, IOError), err:
740 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
743 if self.params.get('writedescription', False):
745 descfn = filename + '.description'
746 self.report_writedescription(descfn)
747 descfile = open(descfn, 'wb')
749 descfile.write(info_dict['description'].encode('utf-8'))
752 except (OSError, IOError):
753 self.trouble(u'ERROR: Cannot write description file ' + descfn)
756 if self.params.get('writeinfojson', False):
757 infofn = filename + '.info.json'
758 self.report_writeinfojson(infofn)
761 except (NameError,AttributeError):
762 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
765 infof = open(infofn, 'wb')
767 json.dump(info_dict, infof)
770 except (OSError, IOError):
771 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
774 if not self.params.get('skip_download', False):
776 success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
777 info_dict.update(add_data)
778 except (OSError, IOError), err:
779 raise UnavailableVideoError
780 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
781 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
783 except (ContentTooShortError, ), err:
784 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
789 self.post_process(filename, info_dict)
790 except (PostProcessingError), err:
791 self.trouble(u'ERROR: postprocessing: %s' % str(err))
794 def download(self, url_list):
795 """Download a given list of URLs."""
796 if len(url_list) > 1 and self.fixed_template():
797 raise SameFileError(self.params['outtmpl'])
800 suitable_found = False
802 # Go to next InfoExtractor if not suitable
803 if not ie.suitable(url):
806 # Suitable InfoExtractor found
807 suitable_found = True
809 # Extract information from URL and process it
812 # Suitable InfoExtractor had been found; go to next URL
815 if not suitable_found:
816 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
818 return self._download_retcode
820 def post_process(self, filename, ie_info):
821 """Run the postprocessing chain on the given file."""
823 info['filepath'] = filename
829 def _download_with_rtmpdump(self, filename, url, player_url):
830 self.report_destination(filename)
831 tmpfilename = self.temp_name(filename)
833 # Check for rtmpdump first
835 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
836 except (OSError, IOError):
837 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
840 # Download using rtmpdump. rtmpdump returns exit code 2 when
841 # the connection was interrumpted and resuming appears to be
842 # possible. This is part of rtmpdump's normal usage, AFAIK.
843 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
844 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
845 while retval == 2 or retval == 1:
846 prevsize = os.path.getsize(tmpfilename)
847 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
848 time.sleep(5.0) # This seems to be needed
849 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
850 cursize = os.path.getsize(tmpfilename)
851 if prevsize == cursize and retval == 1:
853 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
854 if prevsize == cursize and retval == 2 and cursize > 1024:
855 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
859 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
860 self.try_rename(tmpfilename, filename)
863 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
866 def _do_download(self, filename, url, player_url):
867 # Check file already present
868 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
869 self.report_file_already_downloaded(filename)
872 # Attempt to download using rtmpdump
873 if url.startswith('rtmp'):
874 return self._download_with_rtmpdump(filename, url, player_url)
876 tmpfilename = self.temp_name(filename)
880 # Do not include the Accept-Encoding header
881 headers = {'Youtubedl-no-compression': 'True'}
882 basic_request = urllib2.Request(url, None, headers)
883 request = urllib2.Request(url, None, headers)
885 # Establish possible resume length
886 if os.path.isfile(tmpfilename):
887 resume_len = os.path.getsize(tmpfilename)
891 # Request parameters in case of being able to resume
892 if self.params.get('continuedl', False) and resume_len != 0:
893 self.report_resuming_byte(resume_len)
894 request.add_header('Range', 'bytes=%d-' % resume_len)
898 retries = self.params.get('retries', 0)
899 while count <= retries:
900 # Establish connection
902 data = urllib2.urlopen(request)
904 except (urllib2.HTTPError, ), err:
905 if (err.code < 500 or err.code >= 600) and err.code != 416:
906 # Unexpected HTTP error
908 elif err.code == 416:
909 # Unable to resume (requested range not satisfiable)
911 # Open the connection again without the range header
912 data = urllib2.urlopen(basic_request)
913 content_length = data.info()['Content-Length']
914 except (urllib2.HTTPError, ), err:
915 if err.code < 500 or err.code >= 600:
918 # Examine the reported length
919 if (content_length is not None and
920 (resume_len - 100 < long(content_length) < resume_len + 100)):
921 # The file had already been fully downloaded.
922 # Explanation to the above condition: in issue #175 it was revealed that
923 # YouTube sometimes adds or removes a few bytes from the end of the file,
924 # changing the file size slightly and causing problems for some users. So
925 # I decided to implement a suggested change and consider the file
926 # completely downloaded if the file size differs less than 100 bytes from
927 # the one in the hard drive.
928 self.report_file_already_downloaded(filename)
929 self.try_rename(tmpfilename, filename)
932 # The length does not match, we start the download over
933 self.report_unable_to_resume()
939 self.report_retry(count, retries)
942 self.trouble(u'ERROR: giving up after %s retries' % retries)
945 data_len = data.info().get('Content-length', None)
946 if data_len is not None:
947 data_len = long(data_len) + resume_len
948 data_len_str = self.format_bytes(data_len)
949 byte_counter = 0 + resume_len
955 data_block = data.read(block_size)
957 if len(data_block) == 0:
959 byte_counter += len(data_block)
961 # Open file just in time
964 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
965 assert stream is not None
966 filename = self.undo_temp_name(tmpfilename)
967 self.report_destination(filename)
968 except (OSError, IOError), err:
969 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
972 stream.write(data_block)
973 except (IOError, OSError), err:
974 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
976 block_size = self.best_block_size(after - before, len(data_block))
979 percent_str = self.calc_percent(byte_counter, data_len)
980 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
981 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
982 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
985 self.slow_down(start, byte_counter - resume_len)
988 self.trouble(u'\nERROR: Did not get any data blocks')
992 if data_len is not None and byte_counter != data_len:
993 raise ContentTooShortError(byte_counter, long(data_len))
994 self.try_rename(tmpfilename, filename)
996 # Update file modification time
998 if self.params.get('updatetime', True):
999 filetime = self.try_utime(filename, data.info().get('last-modified', None))
1001 return True, {'filetime': filetime}
1004 class InfoExtractor(object):
1005 """Information Extractor class.
1007 Information extractors are the classes that, given a URL, extract
1008 information from the video (or videos) the URL refers to. This
1009 information includes the real video URL, the video title and simplified
1010 title, author and others. The information is stored in a dictionary
1011 which is then passed to the FileDownloader. The FileDownloader
1012 processes this information possibly downloading the video to the file
1013 system, among other possible outcomes. The dictionaries must include
1014 the following fields:
1016 id: Video identifier.
1017 url: Final video URL.
1018 uploader: Nickname of the video uploader.
1019 title: Literal title.
1020 stitle: Simplified title.
1021 ext: Video filename extension.
1022 format: Video format.
1023 player_url: SWF Player URL (may be None).
1025 The following fields are optional. Their primary purpose is to allow
1026 youtube-dl to serve as the backend for a video search function, such
1027 as the one in youtube2mp3. They are only used when their respective
1028 forced printing functions are called:
1030 thumbnail: Full URL to a video thumbnail image.
1031 description: One-line video description.
1033 Subclasses of this one should re-define the _real_initialize() and
1034 _real_extract() methods and define a _VALID_URL regexp.
1035 Probably, they should also be added to the list of extractors.
1041 def __init__(self, downloader=None):
1042 """Constructor. Receives an optional downloader."""
1044 self.set_downloader(downloader)
1046 def suitable(self, url):
1047 """Receives a URL and returns True if suitable for this IE."""
1048 return re.match(self._VALID_URL, url) is not None
1050 def initialize(self):
1051 """Initializes an instance (authentication, etc)."""
1053 self._real_initialize()
1056 def extract(self, url):
1057 """Extracts URL information and returns it in list of dicts."""
1059 return self._real_extract(url)
1061 def set_downloader(self, downloader):
1062 """Sets the downloader for this IE."""
1063 self._downloader = downloader
1065 def _real_initialize(self):
1066 """Real initialization process. Redefine in subclasses."""
1069 def _real_extract(self, url):
1070 """Real extraction process. Redefine in subclasses."""
1074 class YoutubeIE(InfoExtractor):
1075 """Information extractor for youtube.com."""
# Group 1 of _VALID_URL captures the URL prefix (scheme/host/path form),
# group 2 the video id; the conditional group (?(1).+)? permits trailing
# characters only when a full URL prefix matched.
1077 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1078 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1079 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1080 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1081 _NETRC_MACHINE = 'youtube'
1082 # Listed in order of quality
1083 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1084 _video_extensions = {
1090 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1094 IE_NAME = u'youtube'
1096 def report_lang(self):
1097 """Report attempt to set language."""
1098 self._downloader.to_screen(u'[youtube] Setting language')
1100 def report_login(self):
1101 """Report attempt to log in."""
1102 self._downloader.to_screen(u'[youtube] Logging in')
1104 def report_age_confirmation(self):
1105 """Report attempt to confirm age."""
1106 self._downloader.to_screen(u'[youtube] Confirming age')
1108 def report_video_webpage_download(self, video_id):
1109 """Report attempt to download video webpage."""
1110 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1112 def report_video_info_webpage_download(self, video_id):
1113 """Report attempt to download video info webpage."""
1114 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1116 def report_information_extraction(self, video_id):
1117 """Report attempt to extract video information."""
1118 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1120 def report_unavailable_format(self, video_id, format):
1121 """Report extracted video URL."""
1122 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1124 def report_rtmp_download(self):
1125 """Indicate the download will use the RTMP protocol."""
1126 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Network-side setup: gather credentials (explicit params or .netrc),
# force the site language to English so the scraping regexes match,
# log in when credentials exist, then confirm age for restricted videos.
1128 def _real_initialize(self):
1129 if self._downloader is None:
1134 downloader_params = self._downloader.params
1136 # Attempt to use provided username and password or .netrc data
1137 if downloader_params.get('username', None) is not None:
1138 username = downloader_params['username']
1139 password = downloader_params['password']
1140 elif downloader_params.get('usenetrc', False):
1142 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1143 if info is not None:
1147 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1148 except (IOError, netrc.NetrcParseError), err:
1149 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Language-set failures are warnings only; extraction can still proceed.
1153 request = urllib2.Request(self._LANG_URL)
1156 urllib2.urlopen(request).read()
1157 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1158 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1161 # No authentication to be performed
1162 if username is None:
1167 'current_form': 'loginForm',
1169 'action_login': 'Log In',
1170 'username': username,
1171 'password': password,
1173 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1176 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
1177 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1178 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1187 'action_confirm': 'Confirm',
1189 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1191 self.report_age_confirmation()
1192 age_results = urllib2.urlopen(request).read()
1193 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1194 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Full pipeline: watch page -> get_video_info -> choose format(s) ->
# hand one info dict per chosen format to the downloader.
1197 def _real_extract(self, url):
1198 # Extract video id from URL
1199 mobj = re.match(self._VALID_URL, url)
1201 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1203 video_id = mobj.group(2)
1206 self.report_video_webpage_download(video_id)
1207 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1209 video_webpage = urllib2.urlopen(request).read()
1210 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1214 # Attempt to extract SWF player URL
1215 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1216 if mobj is not None:
# Undo the JSON-style backslash escaping in the SWF URL.
1217 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1222 self.report_video_info_webpage_download(video_id)
# Try several 'el=' variants; some videos only return a 'token'
# for a specific one (embedded / detailpage / vevo / default).
1223 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1224 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1225 % (video_id, el_type))
1226 request = urllib2.Request(video_info_url)
1228 video_info_webpage = urllib2.urlopen(request).read()
1229 video_info = parse_qs(video_info_webpage)
1230 if 'token' in video_info:
1232 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1233 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1235 if 'token' not in video_info:
1236 if 'reason' in video_info:
1237 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1239 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1242 # Start extracting information
1243 self.report_information_extraction(video_id)
1246 if 'author' not in video_info:
1247 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1249 video_uploader = urllib.unquote_plus(video_info['author'][0])
1252 if 'title' not in video_info:
1253 self._downloader.trouble(u'ERROR: unable to extract video title')
1255 video_title = urllib.unquote_plus(video_info['title'][0])
1256 video_title = video_title.decode('utf-8')
1257 video_title = sanitize_title(video_title)
# Build a filesystem-safe title: collapse runs of characters outside
# the ASCII letter/digit set into single underscores, then trim them.
1260 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1261 simple_title = simple_title.strip(ur'_')
1264 if 'thumbnail_url' not in video_info:
1265 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1266 video_thumbnail = ''
1267 else: # don't panic if we can't find it
1268 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the watch page and normalised to YYYYMMDD,
# trying several textual date layouts.
1272 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1273 if mobj is not None:
1274 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1275 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1276 for expression in format_expressions:
1278 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1286 video_description = u'No description available.'
# Description extraction is only attempted when the user asked for it
# (forcedescription / writedescription options).
1287 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1288 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1289 if mobj is not None:
1290 video_description = mobj.group(1).decode('utf-8')
1292 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1293 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1294 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1295 # TODO use another parser
1298 video_token = urllib.unquote_plus(video_info['token'][0])
1300 # Decide which formats to download
1301 req_format = self._downloader.params.get('format', None)
1303 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1304 self.report_rtmp_download()
1305 video_url_list = [(None, video_info['conn'][0])]
1306 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# The stream map is a comma-separated list of querystring-encoded
# entries; keep only entries carrying both an itag and a url.
1307 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1308 url_data = [parse_qs(uds) for uds in url_data_strs]
1309 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1310 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# format_limit caps quality: only formats at or below it are eligible.
1312 format_limit = self._downloader.params.get('format_limit', None)
1313 if format_limit is not None and format_limit in self._available_formats:
1314 format_list = self._available_formats[self._available_formats.index(format_limit):]
1316 format_list = self._available_formats
1317 existing_formats = [x for x in format_list if x in url_map]
1318 if len(existing_formats) == 0:
1319 self._downloader.trouble(u'ERROR: no known formats available for video')
1321 if req_format is None:
1322 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1323 elif req_format == '-1':
1324 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1327 if req_format not in url_map:
1328 self._downloader.trouble(u'ERROR: requested format not available')
1330 video_url_list = [(req_format, url_map[req_format])] # Specific format
1332 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1335 for format_param, video_real_url in video_url_list:
1336 # At this point we have a new video
1337 self._downloader.increment_downloads()
1340 video_extension = self._video_extensions.get(format_param, 'flv')
1343 # Process video information
1344 self._downloader.process_info({
1345 'id': video_id.decode('utf-8'),
1346 'url': video_real_url.decode('utf-8'),
1347 'uploader': video_uploader.decode('utf-8'),
1348 'upload_date': upload_date,
1349 'title': video_title,
1350 'stitle': simple_title,
1351 'ext': video_extension.decode('utf-8'),
# 'x and a or b' pre-ternary idiom: u'NA' when format_param is None.
1352 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1353 'thumbnail': video_thumbnail.decode('utf-8'),
1354 'description': video_description,
1355 'player_url': player_url,
1357 except UnavailableVideoError, err:
1358 self._downloader.trouble(u'\nERROR: unable to download video')
1361 class MetacafeIE(InfoExtractor):
1362 """Information Extractor for metacafe.com."""
# _VALID_URL group 1 is the numeric/"yt-..." video id, group 2 the
# URL slug used as the simplified title.
1364 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1365 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1366 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1368 IE_NAME = u'metacafe'
# Keeps a YoutubeIE instance so that 'yt-' prefixed Metacafe ids can be
# delegated to the YouTube extractor (see _real_extract).
1370 def __init__(self, youtube_ie, downloader=None):
1371 InfoExtractor.__init__(self, downloader)
1372 self._youtube_ie = youtube_ie
1374 def report_disclaimer(self):
1375 """Report disclaimer retrieval."""
1376 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1378 def report_age_confirmation(self):
1379 """Report attempt to confirm age."""
1380 self._downloader.to_screen(u'[metacafe] Confirming age')
1382 def report_download_webpage(self, video_id):
1383 """Report webpage download."""
1384 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1386 def report_extraction(self, video_id):
1387 """Report information extraction."""
1388 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the family-filter disclaimer page and posts the "over 18"
# confirmation so mature videos become accessible.
1390 def _real_initialize(self):
1391 # Retrieve disclaimer
1392 request = urllib2.Request(self._DISCLAIMER)
1394 self.report_disclaimer()
1395 disclaimer = urllib2.urlopen(request).read()
1396 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1397 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1403 'submit': "Continue - I'm over 18",
1405 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1407 self.report_age_confirmation()
1408 disclaimer = urllib2.urlopen(request).read()
1409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1410 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1413 def _real_extract(self, url):
1414 # Extract id and simplified title from URL
1415 mobj = re.match(self._VALID_URL, url)
1417 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1420 video_id = mobj.group(1)
1422 # Check if video comes from YouTube
1423 mobj2 = re.match(r'^yt-(.*)$', video_id)
1424 if mobj2 is not None:
# Delegate 'yt-<id>' Metacafe ids straight to the YouTube extractor.
1425 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1428 # At this point we have a new video
1429 self._downloader.increment_downloads()
1431 simple_title = mobj.group(2).decode('utf-8')
1433 # Retrieve video webpage to extract further information
1434 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1436 self.report_download_webpage(video_id)
1437 webpage = urllib2.urlopen(request).read()
1438 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1439 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1442 # Extract URL, uploader and title from webpage
1443 self.report_extraction(video_id)
# Two page layouts: direct mediaURL (optionally gdaKey-signed), or a
# flashvars blob whose mediaData JSON carries URL + key.
1444 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1445 if mobj is not None:
1446 mediaURL = urllib.unquote(mobj.group(1))
1447 video_extension = mediaURL[-3:]
1449 # Extract gdaKey if available
1450 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1452 video_url = mediaURL
1454 gdaKey = mobj.group(1)
1455 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1457 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1459 self._downloader.trouble(u'ERROR: unable to extract media URL')
1461 vardict = parse_qs(mobj.group(1))
1462 if 'mediaData' not in vardict:
1463 self._downloader.trouble(u'ERROR: unable to extract media URL')
1465 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1467 self._downloader.trouble(u'ERROR: unable to extract media URL')
1469 mediaURL = mobj.group(1).replace('\\/', '/')
1470 video_extension = mediaURL[-3:]
1471 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1473 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1475 self._downloader.trouble(u'ERROR: unable to extract title')
1477 video_title = mobj.group(1).decode('utf-8')
1478 video_title = sanitize_title(video_title)
1480 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1482 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1484 video_uploader = mobj.group(1)
1487 # Process video information
1488 self._downloader.process_info({
1489 'id': video_id.decode('utf-8'),
1490 'url': video_url.decode('utf-8'),
1491 'uploader': video_uploader.decode('utf-8'),
1492 'upload_date': u'NA',
1493 'title': video_title,
1494 'stitle': simple_title,
1495 'ext': video_extension.decode('utf-8'),
1499 except UnavailableVideoError:
1500 self._downloader.trouble(u'\nERROR: unable to download video')
1503 class DailymotionIE(InfoExtractor):
1504 """Information Extractor for Dailymotion"""
# Group 1 of _VALID_URL is the short video id, group 2 the slug used
# as the simplified title.
1506 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1507 IE_NAME = u'dailymotion'
1509 def __init__(self, downloader=None):
1510 InfoExtractor.__init__(self, downloader)
1512 def report_download_webpage(self, video_id):
1513 """Report webpage download."""
1514 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1516 def report_extraction(self, video_id):
1517 """Report information extraction."""
1518 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No site-wide setup required for Dailymotion.
1520 def _real_initialize(self):
1523 def _real_extract(self, url):
1524 # Extract id and simplified title from URL
1525 mobj = re.match(self._VALID_URL, url)
1527 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1530 # At this point we have a new video
1531 self._downloader.increment_downloads()
1532 video_id = mobj.group(1)
1534 simple_title = mobj.group(2).decode('utf-8')
1535 video_extension = 'flv'
1537 # Retrieve video webpage to extract further information
1538 request = urllib2.Request(url)
# Disable the family filter so mature videos are served.
1539 request.add_header('Cookie', 'family_filter=off')
1541 self.report_download_webpage(video_id)
1542 webpage = urllib2.urlopen(request).read()
1543 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1544 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1547 # Extract URL, uploader and title from webpage
1548 self.report_extraction(video_id)
# The player's "sequence" flashvar embeds the SD stream URL (sdURL).
1549 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1551 self._downloader.trouble(u'ERROR: unable to extract media URL')
1553 sequence = urllib.unquote(mobj.group(1))
1554 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1556 self._downloader.trouble(u'ERROR: unable to extract media URL')
1558 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1560 # if needed add http://www.dailymotion.com/ if relative URL
1562 video_url = mediaURL
1564 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1566 self._downloader.trouble(u'ERROR: unable to extract title')
1568 video_title = mobj.group(1).decode('utf-8')
1569 video_title = sanitize_title(video_title)
1571 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1573 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1575 video_uploader = mobj.group(1)
1578 # Process video information
1579 self._downloader.process_info({
1580 'id': video_id.decode('utf-8'),
1581 'url': video_url.decode('utf-8'),
1582 'uploader': video_uploader.decode('utf-8'),
1583 'upload_date': u'NA',
1584 'title': video_title,
1585 'stitle': simple_title,
1586 'ext': video_extension.decode('utf-8'),
1590 except UnavailableVideoError:
1591 self._downloader.trouble(u'\nERROR: unable to download video')
1594 class GoogleIE(InfoExtractor):
1595 """Information extractor for video.google.com."""
1597 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1598 IE_NAME = u'video.google'
1600 def __init__(self, downloader=None):
1601 InfoExtractor.__init__(self, downloader)
1603 def report_download_webpage(self, video_id):
1604 """Report webpage download."""
1605 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1607 def report_extraction(self, video_id):
1608 """Report information extraction."""
1609 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No site-wide setup required for Google Video.
1611 def _real_initialize(self):
1614 def _real_extract(self, url):
1615 # Extract id from URL
1616 mobj = re.match(self._VALID_URL, url)
1618 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1621 # At this point we have a new video
1622 self._downloader.increment_downloads()
1623 video_id = mobj.group(1)
1625 video_extension = 'mp4'
1627 # Retrieve video webpage to extract further information
1628 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1630 self.report_download_webpage(video_id)
1631 webpage = urllib2.urlopen(request).read()
1632 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1633 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1636 # Extract URL, uploader, and title from webpage
1637 self.report_extraction(video_id)
# Prefer the mp4 download_url; otherwise fall back to the hex-escaped
# flv videoUrl embedded in the player config.
1638 mobj = re.search(r"download_url:'([^']+)'", webpage)
1640 video_extension = 'flv'
1641 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1643 self._downloader.trouble(u'ERROR: unable to extract media URL')
1645 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JS hex escapes: \x3d -> '=' and \x26 -> '&'.
1646 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1647 mediaURL = mediaURL.replace('\\x26', '\x26')
1649 video_url = mediaURL
1651 mobj = re.search(r'<title>(.*)</title>', webpage)
1653 self._downloader.trouble(u'ERROR: unable to extract title')
1655 video_title = mobj.group(1).decode('utf-8')
1656 video_title = sanitize_title(video_title)
1657 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1659 # Extract video description
1660 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1662 self._downloader.trouble(u'ERROR: unable to extract video description')
1664 video_description = mobj.group(1).decode('utf-8')
1665 if not video_description:
1666 video_description = 'No description available.'
1668 # Extract video thumbnail
1669 if self._downloader.params.get('forcethumbnail', False):
# NOTE(review): abs(int(video_id)) suggests docids can be negative
# when parsed as int -- confirm this search-by-id trick still holds.
1670 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1672 webpage = urllib2.urlopen(request).read()
1673 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1674 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1676 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1678 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1680 video_thumbnail = mobj.group(1)
1681 else: # we need something to pass to process_info
1682 video_thumbnail = ''
1685 # Process video information
1686 self._downloader.process_info({
1687 'id': video_id.decode('utf-8'),
1688 'url': video_url.decode('utf-8'),
1690 'upload_date': u'NA',
1691 'title': video_title,
1692 'stitle': simple_title,
1693 'ext': video_extension.decode('utf-8'),
1697 except UnavailableVideoError:
1698 self._downloader.trouble(u'\nERROR: unable to download video')
1701 class PhotobucketIE(InfoExtractor):
1702 """Information extractor for photobucket.com."""
# Group 1 of _VALID_URL is the .flv filename taken from the 'current'
# query parameter.
1704 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1705 IE_NAME = u'photobucket'
1707 def __init__(self, downloader=None):
1708 InfoExtractor.__init__(self, downloader)
1710 def report_download_webpage(self, video_id):
1711 """Report webpage download."""
1712 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1714 def report_extraction(self, video_id):
1715 """Report information extraction."""
1716 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No site-wide setup required for Photobucket.
1718 def _real_initialize(self):
1721 def _real_extract(self, url):
1722 # Extract id from URL
1723 mobj = re.match(self._VALID_URL, url)
1725 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1728 # At this point we have a new video
1729 self._downloader.increment_downloads()
1730 video_id = mobj.group(1)
1732 video_extension = 'flv'
1734 # Retrieve video webpage to extract further information
1735 request = urllib2.Request(url)
1737 self.report_download_webpage(video_id)
1738 webpage = urllib2.urlopen(request).read()
1739 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1740 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1743 # Extract URL, uploader, and title from webpage
1744 self.report_extraction(video_id)
# The real media URL lives in the page's video_src <link> tag.
1745 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1747 self._downloader.trouble(u'ERROR: unable to extract media URL')
1749 mediaURL = urllib.unquote(mobj.group(1))
1751 video_url = mediaURL
# The <title> carries both the video title and the uploader name.
1753 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1755 self._downloader.trouble(u'ERROR: unable to extract title')
1757 video_title = mobj.group(1).decode('utf-8')
1758 video_title = sanitize_title(video_title)
1759 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1761 video_uploader = mobj.group(2).decode('utf-8')
1764 # Process video information
1765 self._downloader.process_info({
1766 'id': video_id.decode('utf-8'),
1767 'url': video_url.decode('utf-8'),
1768 'uploader': video_uploader,
1769 'upload_date': u'NA',
1770 'title': video_title,
1771 'stitle': simple_title,
1772 'ext': video_extension.decode('utf-8'),
1776 except UnavailableVideoError:
1777 self._downloader.trouble(u'\nERROR: unable to download video')
1780 class YahooIE(InfoExtractor):
1781 """Information extractor for video.yahoo.com."""
1783 # _VALID_URL matches all Yahoo! Video URLs
1784 # _VPAGE_URL matches only the extractable '/watch/' URLs
1785 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1786 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1787 IE_NAME = u'video.yahoo'
1789 def __init__(self, downloader=None):
1790 InfoExtractor.__init__(self, downloader)
1792 def report_download_webpage(self, video_id):
1793 """Report webpage download."""
1794 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1796 def report_extraction(self, video_id):
1797 """Report information extraction."""
1798 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No site-wide setup required for Yahoo! Video.
1800 def _real_initialize(self):
# new_video=False marks the single allowed recursion after rewriting a
# non-/watch/ URL into its canonical /watch/ form.
1803 def _real_extract(self, url, new_video=True):
1804 # Extract ID from URL
1805 mobj = re.match(self._VALID_URL, url)
1807 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1810 # At this point we have a new video
1811 self._downloader.increment_downloads()
1812 video_id = mobj.group(2)
1813 video_extension = 'flv'
1815 # Rewrite valid but non-extractable URLs as
1816 # extractable English language /watch/ URLs
1817 if re.match(self._VPAGE_URL, url) is None:
1818 request = urllib2.Request(url)
1820 webpage = urllib2.urlopen(request).read()
1821 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1822 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1825 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1827 self._downloader.trouble(u'ERROR: Unable to extract id field')
1829 yahoo_id = mobj.group(1)
1831 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1833 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1835 yahoo_vid = mobj.group(1)
1837 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1838 return self._real_extract(url, new_video=False)
1840 # Retrieve video webpage to extract further information
1841 request = urllib2.Request(url)
1843 self.report_download_webpage(video_id)
1844 webpage = urllib2.urlopen(request).read()
1845 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1846 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1849 # Extract uploader and title from webpage
1850 self.report_extraction(video_id)
1851 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1853 self._downloader.trouble(u'ERROR: unable to extract video title')
1855 video_title = mobj.group(1).decode('utf-8')
1856 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1858 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1860 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is 'people'/'profile' from the href, not
# the display name in group(2) -- verify which was intended.
1862 video_uploader = mobj.group(1).decode('utf-8')
1864 # Extract video thumbnail
1865 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1867 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1869 video_thumbnail = mobj.group(1).decode('utf-8')
1871 # Extract video description
1872 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1874 self._downloader.trouble(u'ERROR: unable to extract video description')
1876 video_description = mobj.group(1).decode('utf-8')
1877 if not video_description:
1878 video_description = 'No description available.'
1880 # Extract video height and width
1881 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1883 self._downloader.trouble(u'ERROR: unable to extract video height')
1885 yv_video_height = mobj.group(1)
1887 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1889 self._downloader.trouble(u'ERROR: unable to extract video width')
1891 yv_video_width = mobj.group(1)
1893 # Retrieve video playlist to extract media URL
1894 # I'm not completely sure what all these options are, but we
1895 # seem to need most of them, otherwise the server sends a 401.
1896 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1897 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1898 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1899 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1900 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1902 self.report_download_webpage(video_id)
1903 webpage = urllib2.urlopen(request).read()
1904 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1905 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1908 # Extract media URL from playlist XML
1909 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1911 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1913 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1914 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1917 # Process video information
1918 self._downloader.process_info({
1919 'id': video_id.decode('utf-8'),
1921 'uploader': video_uploader,
1922 'upload_date': u'NA',
1923 'title': video_title,
1924 'stitle': simple_title,
1925 'ext': video_extension.decode('utf-8'),
# NOTE(review): duplicate 'thumbnail' key in this dict literal -- the
# later (non-decoded) entry below wins; the .decode('utf-8') here is dead.
1926 'thumbnail': video_thumbnail.decode('utf-8'),
1927 'description': video_description,
1928 'thumbnail': video_thumbnail,
1931 except UnavailableVideoError:
1932 self._downloader.trouble(u'\nERROR: unable to download video')
1935 class VimeoIE(InfoExtractor):
1936 """Information extractor for vimeo.com."""
1938 # _VALID_URL matches Vimeo URLs
1939 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1942 def __init__(self, downloader=None):
1943 InfoExtractor.__init__(self, downloader)
1945 def report_download_webpage(self, video_id):
1946 """Report webpage download."""
1947 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1949 def report_extraction(self, video_id):
1950 """Report information extraction."""
1951 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# No site-wide setup required for Vimeo.
1953 def _real_initialize(self):
1956 def _real_extract(self, url, new_video=True):
1957 # Extract ID from URL
1958 mobj = re.match(self._VALID_URL, url)
1960 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1963 # At this point we have a new video
1964 self._downloader.increment_downloads()
1965 video_id = mobj.group(1)
1967 # Retrieve video webpage to extract further information
# The moogaloop "load" endpoint returns XML metadata for the clip.
1968 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1970 self.report_download_webpage(video_id)
1971 webpage = urllib2.urlopen(request).read()
1972 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1973 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1976 # Now we begin extracting as much information as we can from what we
1977 # retrieved. First we extract the information common to all extractors,
1978 # and latter we extract those that are Vimeo specific.
1979 self.report_extraction(video_id)
1982 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1984 self._downloader.trouble(u'ERROR: unable to extract video title')
1986 video_title = mobj.group(1).decode('utf-8')
1987 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1990 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1992 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1994 video_uploader = mobj.group(1).decode('utf-8')
1996 # Extract video thumbnail
1997 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1999 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2001 video_thumbnail = mobj.group(1).decode('utf-8')
2003 # # Extract video description
2004 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2006 # self._downloader.trouble(u'ERROR: unable to extract video description')
2008 # video_description = mobj.group(1).decode('utf-8')
2009 # if not video_description: video_description = 'No description available.'
# NOTE(review): placeholder description ('Foo.') is shipped while the
# real extraction above stays commented out -- confirm this is intended.
2010 video_description = 'Foo.'
2012 # Vimeo specific: extract request signature
2013 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2015 self._downloader.trouble(u'ERROR: unable to extract request signature')
2017 sig = mobj.group(1).decode('utf-8')
2019 # Vimeo specific: Extract request signature expiration
2020 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2022 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2024 sig_exp = mobj.group(1).decode('utf-8')
# The playable URL is the moogaloop "play" endpoint signed with the
# request signature and its expiry.
2026 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2029 # Process video information
2030 self._downloader.process_info({
2031 'id': video_id.decode('utf-8'),
2033 'uploader': video_uploader,
2034 'upload_date': u'NA',
2035 'title': video_title,
2036 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' each appear twice in this
# dict literal -- the later entries win, so the .decode('utf-8')
# thumbnail value here is dead. Probably a merge leftover.
2038 'thumbnail': video_thumbnail.decode('utf-8'),
2039 'description': video_description,
2040 'thumbnail': video_thumbnail,
2041 'description': video_description,
2044 except UnavailableVideoError:
2045 self._downloader.trouble(u'ERROR: unable to download video')
2048 class GenericIE(InfoExtractor):
2049 """Generic last-resort information extractor."""
2052 IE_NAME = u'generic'
2054 def __init__(self, downloader=None):
2055 InfoExtractor.__init__(self, downloader)
2057 def report_download_webpage(self, video_id):
2058 """Report webpage download."""
2059 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2060 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2062 def report_extraction(self, video_id):
2063 """Report information extraction."""
2064 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2066 def _real_initialize(self):
2069 def _real_extract(self, url):
2070 # At this point we have a new video
2071 self._downloader.increment_downloads()
2073 video_id = url.split('/')[-1]
2074 request = urllib2.Request(url)
2076 self.report_download_webpage(video_id)
2077 webpage = urllib2.urlopen(request).read()
2078 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2079 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2081 except ValueError, err:
2082 # since this is the last-resort InfoExtractor, if
2083 # this error is thrown, it'll be thrown here
2084 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2087 self.report_extraction(video_id)
2088 # Start with something easy: JW Player in SWFObject
2089 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2091 # Broaden the search a little bit
2092 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2094 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2097 # It's possible that one of the regexes
2098 # matched, but returned an empty group:
2099 if mobj.group(1) is None:
2100 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2103 video_url = urllib.unquote(mobj.group(1))
2104 video_id = os.path.basename(video_url)
2106 # here's a fun little line of code for you:
2107 video_extension = os.path.splitext(video_id)[1][1:]
2108 video_id = os.path.splitext(video_id)[0]
2110 # it's tempting to parse this further, but you would
2111 # have to take into account all the variations like
2112 # Video Title - Site Name
2113 # Site Name | Video Title
2114 # Video Title - Tagline | Site Name
2115 # and so on and so forth; it's just not practical
2116 mobj = re.search(r'<title>(.*)</title>', webpage)
2118 self._downloader.trouble(u'ERROR: unable to extract title')
2120 video_title = mobj.group(1).decode('utf-8')
2121 video_title = sanitize_title(video_title)
2122 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2124 # video uploader is domain name
2125 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2127 self._downloader.trouble(u'ERROR: unable to extract title')
2129 video_uploader = mobj.group(1).decode('utf-8')
2132 # Process video information
2133 self._downloader.process_info({
2134 'id': video_id.decode('utf-8'),
2135 'url': video_url.decode('utf-8'),
2136 'uploader': video_uploader,
2137 'upload_date': u'NA',
2138 'title': video_title,
2139 'stitle': simple_title,
2140 'ext': video_extension.decode('utf-8'),
2144 except UnavailableVideoError, err:
2145 self._downloader.trouble(u'\nERROR: unable to download video')
2148 class YoutubeSearchIE(InfoExtractor):
2149 """Information Extractor for YouTube search queries."""
2150 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2151 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2152 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2153 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2155 _max_youtube_results = 1000
2156 IE_NAME = u'youtube:search'
2158 def __init__(self, youtube_ie, downloader=None):
2159 InfoExtractor.__init__(self, downloader)
2160 self._youtube_ie = youtube_ie
2162 def report_download_page(self, query, pagenum):
2163 """Report attempt to download playlist page with given number."""
2164 query = query.decode(preferredencoding())
2165 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2167 def _real_initialize(self):
2168 self._youtube_ie.initialize()
2170 def _real_extract(self, query):
2171 mobj = re.match(self._VALID_URL, query)
2173 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2176 prefix, query = query.split(':')
2178 query = query.encode('utf-8')
2180 self._download_n_results(query, 1)
2182 elif prefix == 'all':
2183 self._download_n_results(query, self._max_youtube_results)
2189 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2191 elif n > self._max_youtube_results:
2192 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2193 n = self._max_youtube_results
2194 self._download_n_results(query, n)
2196 except ValueError: # parsing prefix as integer fails
2197 self._download_n_results(query, 1)
2200 def _download_n_results(self, query, n):
2201 """Downloads a specified number of results for a query"""
2204 already_seen = set()
2208 self.report_download_page(query, pagenum)
2209 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2210 request = urllib2.Request(result_url)
2212 page = urllib2.urlopen(request).read()
2213 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2214 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2217 # Extract video identifiers
2218 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2219 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2220 if video_id not in already_seen:
2221 video_ids.append(video_id)
2222 already_seen.add(video_id)
2223 if len(video_ids) == n:
2224 # Specified n videos reached
2225 for id in video_ids:
2226 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2229 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2230 for id in video_ids:
2231 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2234 pagenum = pagenum + 1
2237 class GoogleSearchIE(InfoExtractor):
2238 """Information Extractor for Google Video search queries."""
2239 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2240 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2241 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2242 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2244 _max_google_results = 1000
2245 IE_NAME = u'video.google:search'
2247 def __init__(self, google_ie, downloader=None):
2248 InfoExtractor.__init__(self, downloader)
2249 self._google_ie = google_ie
2251 def report_download_page(self, query, pagenum):
2252 """Report attempt to download playlist page with given number."""
2253 query = query.decode(preferredencoding())
2254 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2256 def _real_initialize(self):
2257 self._google_ie.initialize()
2259 def _real_extract(self, query):
2260 mobj = re.match(self._VALID_URL, query)
2262 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2265 prefix, query = query.split(':')
2267 query = query.encode('utf-8')
2269 self._download_n_results(query, 1)
2271 elif prefix == 'all':
2272 self._download_n_results(query, self._max_google_results)
2278 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2280 elif n > self._max_google_results:
2281 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2282 n = self._max_google_results
2283 self._download_n_results(query, n)
2285 except ValueError: # parsing prefix as integer fails
2286 self._download_n_results(query, 1)
2289 def _download_n_results(self, query, n):
2290 """Downloads a specified number of results for a query"""
2293 already_seen = set()
2297 self.report_download_page(query, pagenum)
2298 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2299 request = urllib2.Request(result_url)
2301 page = urllib2.urlopen(request).read()
2302 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2303 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2306 # Extract video identifiers
2307 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2308 video_id = mobj.group(1)
2309 if video_id not in already_seen:
2310 video_ids.append(video_id)
2311 already_seen.add(video_id)
2312 if len(video_ids) == n:
2313 # Specified n videos reached
2314 for id in video_ids:
2315 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2318 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2319 for id in video_ids:
2320 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2323 pagenum = pagenum + 1
2326 class YahooSearchIE(InfoExtractor):
2327 """Information Extractor for Yahoo! Video search queries."""
2328 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2329 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2330 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2331 _MORE_PAGES_INDICATOR = r'\s*Next'
2333 _max_yahoo_results = 1000
2334 IE_NAME = u'video.yahoo:search'
2336 def __init__(self, yahoo_ie, downloader=None):
2337 InfoExtractor.__init__(self, downloader)
2338 self._yahoo_ie = yahoo_ie
2340 def report_download_page(self, query, pagenum):
2341 """Report attempt to download playlist page with given number."""
2342 query = query.decode(preferredencoding())
2343 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2345 def _real_initialize(self):
2346 self._yahoo_ie.initialize()
2348 def _real_extract(self, query):
2349 mobj = re.match(self._VALID_URL, query)
2351 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2354 prefix, query = query.split(':')
2356 query = query.encode('utf-8')
2358 self._download_n_results(query, 1)
2360 elif prefix == 'all':
2361 self._download_n_results(query, self._max_yahoo_results)
2367 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2369 elif n > self._max_yahoo_results:
2370 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2371 n = self._max_yahoo_results
2372 self._download_n_results(query, n)
2374 except ValueError: # parsing prefix as integer fails
2375 self._download_n_results(query, 1)
2378 def _download_n_results(self, query, n):
2379 """Downloads a specified number of results for a query"""
2382 already_seen = set()
2386 self.report_download_page(query, pagenum)
2387 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2388 request = urllib2.Request(result_url)
2390 page = urllib2.urlopen(request).read()
2391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2392 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2395 # Extract video identifiers
2396 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2397 video_id = mobj.group(1)
2398 if video_id not in already_seen:
2399 video_ids.append(video_id)
2400 already_seen.add(video_id)
2401 if len(video_ids) == n:
2402 # Specified n videos reached
2403 for id in video_ids:
2404 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2407 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2408 for id in video_ids:
2409 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2412 pagenum = pagenum + 1
2415 class YoutubePlaylistIE(InfoExtractor):
2416 """Information Extractor for YouTube playlists."""
2418 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2419 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2420 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2421 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2423 IE_NAME = u'youtube:playlist'
2425 def __init__(self, youtube_ie, downloader=None):
2426 InfoExtractor.__init__(self, downloader)
2427 self._youtube_ie = youtube_ie
2429 def report_download_page(self, playlist_id, pagenum):
2430 """Report attempt to download playlist page with given number."""
2431 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2433 def _real_initialize(self):
2434 self._youtube_ie.initialize()
2436 def _real_extract(self, url):
2437 # Extract playlist id
2438 mobj = re.match(self._VALID_URL, url)
2440 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2444 if mobj.group(3) is not None:
2445 self._youtube_ie.extract(mobj.group(3))
2448 # Download playlist pages
2449 # prefix is 'p' as default for playlists but there are other types that need extra care
2450 playlist_prefix = mobj.group(1)
2451 if playlist_prefix == 'a':
2452 playlist_access = 'artist'
2454 playlist_prefix = 'p'
2455 playlist_access = 'view_play_list'
2456 playlist_id = mobj.group(2)
2461 self.report_download_page(playlist_id, pagenum)
2462 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2464 page = urllib2.urlopen(request).read()
2465 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2466 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2469 # Extract video identifiers
2471 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2472 if mobj.group(1) not in ids_in_page:
2473 ids_in_page.append(mobj.group(1))
2474 video_ids.extend(ids_in_page)
2476 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2478 pagenum = pagenum + 1
2480 playliststart = self._downloader.params.get('playliststart', 1) - 1
2481 playlistend = self._downloader.params.get('playlistend', -1)
2482 video_ids = video_ids[playliststart:playlistend]
2484 for id in video_ids:
2485 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2489 class YoutubeUserIE(InfoExtractor):
2490 """Information Extractor for YouTube users."""
2492 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2493 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2494 _GDATA_PAGE_SIZE = 50
2495 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2496 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2498 IE_NAME = u'youtube:user'
2500 def __init__(self, youtube_ie, downloader=None):
2501 InfoExtractor.__init__(self, downloader)
2502 self._youtube_ie = youtube_ie
2504 def report_download_page(self, username, start_index):
2505 """Report attempt to download user page."""
2506 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2507 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2509 def _real_initialize(self):
2510 self._youtube_ie.initialize()
2512 def _real_extract(self, url):
2514 mobj = re.match(self._VALID_URL, url)
2516 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2519 username = mobj.group(1)
2521 # Download video ids using YouTube Data API. Result size per
2522 # query is limited (currently to 50 videos) so we need to query
2523 # page by page until there are no video ids - it means we got
2530 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2531 self.report_download_page(username, start_index)
2533 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2536 page = urllib2.urlopen(request).read()
2537 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2538 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2541 # Extract video identifiers
2544 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2545 if mobj.group(1) not in ids_in_page:
2546 ids_in_page.append(mobj.group(1))
2548 video_ids.extend(ids_in_page)
2550 # A little optimization - if current page is not
2551 # "full", ie. does not contain PAGE_SIZE video ids then
2552 # we can assume that this page is the last one - there
2553 # are no more ids on further pages - no need to query
2556 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2561 all_ids_count = len(video_ids)
2562 playliststart = self._downloader.params.get('playliststart', 1) - 1
2563 playlistend = self._downloader.params.get('playlistend', -1)
2565 if playlistend == -1:
2566 video_ids = video_ids[playliststart:]
2568 video_ids = video_ids[playliststart:playlistend]
2570 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2571 (username, all_ids_count, len(video_ids)))
2573 for video_id in video_ids:
2574 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2577 class DepositFilesIE(InfoExtractor):
2578 """Information extractor for depositfiles.com"""
2580 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2581 IE_NAME = u'DepositFiles'
2583 def __init__(self, downloader=None):
2584 InfoExtractor.__init__(self, downloader)
2586 def report_download_webpage(self, file_id):
2587 """Report webpage download."""
2588 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2590 def report_extraction(self, file_id):
2591 """Report information extraction."""
2592 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2594 def _real_initialize(self):
2597 def _real_extract(self, url):
2598 # At this point we have a new file
2599 self._downloader.increment_downloads()
2601 file_id = url.split('/')[-1]
2602 # Rebuild url in english locale
2603 url = 'http://depositfiles.com/en/files/' + file_id
2605 # Retrieve file webpage with 'Free download' button pressed
2606 free_download_indication = { 'gateway_result' : '1' }
2607 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2609 self.report_download_webpage(file_id)
2610 webpage = urllib2.urlopen(request).read()
2611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2612 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2615 # Search for the real file URL
2616 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2617 if (mobj is None) or (mobj.group(1) is None):
2618 # Try to figure out reason of the error.
2619 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2620 if (mobj is not None) and (mobj.group(1) is not None):
2621 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2622 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2624 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2627 file_url = mobj.group(1)
2628 file_extension = os.path.splitext(file_url)[1][1:]
2630 # Search for file title
2631 mobj = re.search(r'<b title="(.*?)">', webpage)
2633 self._downloader.trouble(u'ERROR: unable to extract title')
2635 file_title = mobj.group(1).decode('utf-8')
2638 # Process file information
2639 self._downloader.process_info({
2640 'id': file_id.decode('utf-8'),
2641 'url': file_url.decode('utf-8'),
2643 'upload_date': u'NA',
2644 'title': file_title,
2645 'stitle': file_title,
2646 'ext': file_extension.decode('utf-8'),
2650 except UnavailableVideoError, err:
2651 self._downloader.trouble(u'ERROR: unable to download file')
2654 class FacebookIE(InfoExtractor):
2655 """Information Extractor for Facebook"""
2657 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2658 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2659 _NETRC_MACHINE = 'facebook'
2660 _available_formats = ['highqual', 'lowqual']
2661 _video_extensions = {
2665 IE_NAME = u'facebook'
2667 def __init__(self, downloader=None):
2668 InfoExtractor.__init__(self, downloader)
2670 def _reporter(self, message):
2671 """Add header and report message."""
2672 self._downloader.to_screen(u'[facebook] %s' % message)
2674 def report_login(self):
2675 """Report attempt to log in."""
2676 self._reporter(u'Logging in')
2678 def report_video_webpage_download(self, video_id):
2679 """Report attempt to download video webpage."""
2680 self._reporter(u'%s: Downloading video webpage' % video_id)
2682 def report_information_extraction(self, video_id):
2683 """Report attempt to extract video information."""
2684 self._reporter(u'%s: Extracting video information' % video_id)
2686 def _parse_page(self, video_webpage):
2687 """Extract video information from page"""
2689 data = {'title': r'class="video_title datawrap">(.*?)</',
2690 'description': r'<div class="datawrap">(.*?)</div>',
2691 'owner': r'\("video_owner_name", "(.*?)"\)',
2692 'upload_date': r'data-date="(.*?)"',
2693 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2696 for piece in data.keys():
2697 mobj = re.search(data[piece], video_webpage)
2698 if mobj is not None:
2699 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2703 for fmt in self._available_formats:
2704 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2705 if mobj is not None:
2706 # URL is in a Javascript segment inside an escaped Unicode format within
2707 # the generally utf-8 page
2708 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2709 video_info['video_urls'] = video_urls
2713 def _real_initialize(self):
2714 if self._downloader is None:
2719 downloader_params = self._downloader.params
2721 # Attempt to use provided username and password or .netrc data
2722 if downloader_params.get('username', None) is not None:
2723 useremail = downloader_params['username']
2724 password = downloader_params['password']
2725 elif downloader_params.get('usenetrc', False):
2727 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2728 if info is not None:
2732 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2733 except (IOError, netrc.NetrcParseError), err:
2734 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2737 if useremail is None:
2746 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2749 login_results = urllib2.urlopen(request).read()
2750 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2751 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2753 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2754 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2757 def _real_extract(self, url):
2758 mobj = re.match(self._VALID_URL, url)
2760 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2762 video_id = mobj.group('ID')
2765 self.report_video_webpage_download(video_id)
2766 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2768 page = urllib2.urlopen(request)
2769 video_webpage = page.read()
2770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2771 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2774 # Start extracting information
2775 self.report_information_extraction(video_id)
2777 # Extract information
2778 video_info = self._parse_page(video_webpage)
2781 if 'owner' not in video_info:
2782 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2784 video_uploader = video_info['owner']
2787 if 'title' not in video_info:
2788 self._downloader.trouble(u'ERROR: unable to extract video title')
2790 video_title = video_info['title']
2791 video_title = video_title.decode('utf-8')
2792 video_title = sanitize_title(video_title)
2795 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2796 simple_title = simple_title.strip(ur'_')
2799 if 'thumbnail' not in video_info:
2800 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2801 video_thumbnail = ''
2803 video_thumbnail = video_info['thumbnail']
2807 if 'upload_date' in video_info:
2808 upload_time = video_info['upload_date']
2809 timetuple = email.utils.parsedate_tz(upload_time)
2810 if timetuple is not None:
2812 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2817 video_description = video_info.get('description', 'No description available.')
2819 url_map = video_info['video_urls']
2820 if len(url_map.keys()) > 0:
2821 # Decide which formats to download
2822 req_format = self._downloader.params.get('format', None)
2823 format_limit = self._downloader.params.get('format_limit', None)
2825 if format_limit is not None and format_limit in self._available_formats:
2826 format_list = self._available_formats[self._available_formats.index(format_limit):]
2828 format_list = self._available_formats
2829 existing_formats = [x for x in format_list if x in url_map]
2830 if len(existing_formats) == 0:
2831 self._downloader.trouble(u'ERROR: no known formats available for video')
2833 if req_format is None:
2834 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2835 elif req_format == '-1':
2836 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2839 if req_format not in url_map:
2840 self._downloader.trouble(u'ERROR: requested format not available')
2842 video_url_list = [(req_format, url_map[req_format])] # Specific format
2844 for format_param, video_real_url in video_url_list:
2846 # At this point we have a new video
2847 self._downloader.increment_downloads()
2850 video_extension = self._video_extensions.get(format_param, 'mp4')
2853 # Process video information
2854 self._downloader.process_info({
2855 'id': video_id.decode('utf-8'),
2856 'url': video_real_url.decode('utf-8'),
2857 'uploader': video_uploader.decode('utf-8'),
2858 'upload_date': upload_date,
2859 'title': video_title,
2860 'stitle': simple_title,
2861 'ext': video_extension.decode('utf-8'),
2862 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2863 'thumbnail': video_thumbnail.decode('utf-8'),
2864 'description': video_description.decode('utf-8'),
2867 except UnavailableVideoError, err:
2868 self._downloader.trouble(u'\nERROR: unable to download video')
2870 class BlipTVIE(InfoExtractor):
2871 """Information extractor for blip.tv"""
2873 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2874 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2875 IE_NAME = u'blip.tv'
2877 def report_extraction(self, file_id):
2878 """Report information extraction."""
2879 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2881 def _simplify_title(self, title):
2882 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2883 res = res.strip(ur'_')
2886 def _real_extract(self, url):
2887 mobj = re.match(self._VALID_URL, url)
2889 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2896 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2897 request = urllib2.Request(json_url)
2898 self.report_extraction(mobj.group(1))
2900 json_code = urllib2.urlopen(request).read()
2901 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2902 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2905 json_data = json.loads(json_code)
2906 if 'Post' in json_data:
2907 data = json_data['Post']
2911 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2912 video_url = data['media']['url']
2913 umobj = re.match(self._URL_EXT, video_url)
2915 raise ValueError('Can not determine filename extension')
2916 ext = umobj.group(1)
2918 self._downloader.increment_downloads()
2921 'id': data['item_id'],
2923 'uploader': data['display_name'],
2924 'upload_date': upload_date,
2925 'title': data['title'],
2926 'stitle': self._simplify_title(data['title']),
2928 'format': data['media']['mimeType'],
2929 'thumbnail': data['thumbnailUrl'],
2930 'description': data['description'],
2931 'player_url': data['embedUrl']
2933 except (ValueError,KeyError), err:
2934 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2938 self._downloader.process_info(info)
2939 except UnavailableVideoError, err:
2940 self._downloader.trouble(u'\nERROR: unable to download video')
2943 class MyVideoIE(InfoExtractor):
2944 """Information Extractor for myvideo.de."""
2946 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2947 IE_NAME = u'myvideo'
2949 def __init__(self, downloader=None):
2950 InfoExtractor.__init__(self, downloader)
2952 def report_download_webpage(self, video_id):
2953 """Report webpage download."""
2954 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2956 def report_extraction(self, video_id):
2957 """Report information extraction."""
2958 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2960 def _real_initialize(self):
2963 def _real_extract(self,url):
2964 mobj = re.match(self._VALID_URL, url)
2966 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2969 video_id = mobj.group(1)
2970 simple_title = mobj.group(2).decode('utf-8')
2971 # should actually not be necessary
2972 simple_title = sanitize_title(simple_title)
2973 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2976 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2978 self.report_download_webpage(video_id)
2979 webpage = urllib2.urlopen(request).read()
2980 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2981 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2984 self.report_extraction(video_id)
2985 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2988 self._downloader.trouble(u'ERROR: unable to extract media URL')
2990 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2992 mobj = re.search('<title>([^<]+)</title>', webpage)
2994 self._downloader.trouble(u'ERROR: unable to extract title')
2997 video_title = mobj.group(1)
2998 video_title = sanitize_title(video_title)
3002 self._downloader.process_info({
3006 'upload_date': u'NA',
3007 'title': video_title,
3008 'stitle': simple_title,
3013 except UnavailableVideoError:
3014 self._downloader.trouble(u'\nERROR: Unable to download video')
3016 class ComedyCentralIE(InfoExtractor):
3017 """Information extractor for The Daily Show and Colbert Report """
3019 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3020 IE_NAME = u'comedycentral'
	def report_extraction(self, episode_id):
		"""Report that information extraction for an episode has started."""
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
	def report_config_download(self, episode_id):
		"""Report that the media configuration is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
	def report_index_download(self, episode_id):
		"""Report that the show index is being downloaded."""
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
	def report_player_url(self, episode_id):
		"""Report that the Flash player URL is being determined."""
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3034 def _simplify_title(self, title):
3035 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3036 res = res.strip(ur'_')
3039 def _real_extract(self, url):
3040 mobj = re.match(self._VALID_URL, url)
3042 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3045 if mobj.group('shortname'):
3046 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3047 url = 'http://www.thedailyshow.com/full-episodes/'
3049 url = 'http://www.colbertnation.com/full-episodes/'
3050 mobj = re.match(self._VALID_URL, url)
3051 assert mobj is not None
3053 dlNewest = not mobj.group('episode')
3055 epTitle = mobj.group('showname')
3057 epTitle = mobj.group('episode')
3059 req = urllib2.Request(url)
3060 self.report_extraction(epTitle)
3062 htmlHandle = urllib2.urlopen(req)
3063 html = htmlHandle.read()
3064 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3065 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3068 url = htmlHandle.geturl()
3069 mobj = re.match(self._VALID_URL, url)
3071 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3073 if mobj.group('episode') == '':
3074 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3076 epTitle = mobj.group('episode')
3078 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3079 if len(mMovieParams) == 0:
3080 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3083 playerUrl_raw = mMovieParams[0][0]
3084 self.report_player_url(epTitle)
3086 urlHandle = urllib2.urlopen(playerUrl_raw)
3087 playerUrl = urlHandle.geturl()
3088 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3089 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3092 uri = mMovieParams[0][1]
3093 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3094 self.report_index_download(epTitle)
3096 indexXml = urllib2.urlopen(indexUrl).read()
3097 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3098 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3101 idoc = xml.etree.ElementTree.fromstring(indexXml)
3102 itemEls = idoc.findall('.//item')
3103 for itemEl in itemEls:
3104 mediaId = itemEl.findall('./guid')[0].text
3105 shortMediaId = mediaId.split(':')[-1]
3106 showId = mediaId.split(':')[-2].replace('.com', '')
3107 officialTitle = itemEl.findall('./title')[0].text
3108 officialDate = itemEl.findall('./pubDate')[0].text
3110 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3111 urllib.urlencode({'uri': mediaId}))
3112 configReq = urllib2.Request(configUrl)
3113 self.report_config_download(epTitle)
3115 configXml = urllib2.urlopen(configReq).read()
3116 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3117 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3120 cdoc = xml.etree.ElementTree.fromstring(configXml)
3122 for rendition in cdoc.findall('.//rendition'):
3123 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3127 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3130 # For now, just pick the highest bitrate
3131 format,video_url = turls[-1]
3133 self._downloader.increment_downloads()
3135 effTitle = showId + '-' + epTitle
3140 'upload_date': officialDate,
3142 'stitle': self._simplify_title(effTitle),
3146 'description': officialTitle,
3147 'player_url': playerUrl
3151 self._downloader.process_info(info)
3152 except UnavailableVideoError, err:
3153 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# NOTE(review): numbered listing with elided lines (gaps in the original
# numbering); code left byte-identical, comments only.
3157 class EscapistIE(InfoExtractor):
3158 """Information extractor for The Escapist """
# NOTE(review): `(www\.)` has no trailing `?`, so URLs without the "www."
# prefix will not match — verify whether that is intentional.
3160 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3161 IE_NAME = u'escapist'
# --- one-line console progress reporters ---
3163 def report_extraction(self, showName):
3164 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3166 def report_config_download(self, showName):
3167 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Same simplification as ComedyCentralIE._simplify_title; the `return res`
# line (3172) is elided in this excerpt.
3169 def _simplify_title(self, title):
3170 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3171 res = res.strip(ur'_')
# Scrape the page's <meta> tags for description/thumbnail/player URL, pull
# the config URL out of the player URL, parse it as JSON-ish JavaScript,
# and take the second playlist entry as the video URL.
3174 def _real_extract(self, url):
3175 htmlParser = HTMLParser.HTMLParser()
3177 mobj = re.match(self._VALID_URL, url)
# (line 3178, presumably `if mobj is None:`, elided)
3179 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3181 showName = mobj.group('showname')
3182 videoId = mobj.group('episode')
3184 self.report_extraction(showName)
# (enclosing `try:` line elided)
3186 webPage = urllib2.urlopen(url).read()
3187 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3188 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Metadata comes from standard OpenGraph / description meta tags.
3191 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3192 description = htmlParser.unescape(descMatch.group(1))
3193 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3194 imgUrl = htmlParser.unescape(imgMatch.group(1))
3195 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3196 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries its config location in a `config=` query param.
3197 configUrlMatch = re.search('config=(.*)$', playerUrl)
3198 configUrl = urllib2.unquote(configUrlMatch.group(1))
3200 self.report_config_download(showName)
3202 configJSON = urllib2.urlopen(configUrl).read()
3203 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3204 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3207 # Technically, it's JavaScript, not JSON
# Naive quote swap to coerce the JS object literal into parseable JSON;
# would break on strings containing quotes, but matches upstream behavior.
3208 configJSON = configJSON.replace("'", '"')
3211 config = json.loads(configJSON)
3212 except (ValueError,), err:
3213 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is taken as the actual video entry (index 0 presumably a
# preroll/thumbnail — TODO confirm against the live config format).
3216 playlist = config['playlist']
3217 videoUrl = playlist[1]['url']
3219 self._downloader.increment_downloads()
# `info` dict literal partially elided (3220-3234); visible keys below.
3223 'uploader': showName,
3224 'upload_date': None,
3226 'stitle': self._simplify_title(showName),
3229 'thumbnail': imgUrl,
3230 'description': description,
3231 'player_url': playerUrl,
3235 self._downloader.process_info(info)
3236 except UnavailableVideoError, err:
3237 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# Base class for post-download processing steps.  Subclasses override
# run(); the docstrings below are partially elided in this listing
# (gaps at 3243, 3249-3250, etc.).  Code left byte-identical.
3241 class PostProcessor(object):
3242 """Post Processor class.
3244 PostProcessor objects can be added to downloaders with their
3245 add_post_processor() method. When the downloader has finished a
3246 successful download, it will take its internal chain of PostProcessors
3247 and start calling the run() method on each one of them, first with
3248 an initial argument and then with the returned value of the previous
3251 The chain will be stopped if one of them ever returns None or the end
3252 of the chain is reached.
3254 PostProcessor objects follow a "mutual registration" process similar
3255 to InfoExtractor objects.
# Downloader reference is optional at construction; set_downloader()
# completes the mutual registration later.
3260 def __init__(self, downloader=None):
3261 self._downloader = downloader
3263 def set_downloader(self, downloader):
3264 """Sets the downloader for this PP."""
3265 self._downloader = downloader
3267 def run(self, information):
3268 """Run the PostProcessor.
3270 The "information" argument is a dictionary like the ones
3271 composed by InfoExtractors. The only difference is that this
3272 one has an extra field called "filepath" that points to the
3275 When this method returns None, the postprocessing chain is
3276 stopped. However, this method may return an information
3277 dictionary that will be passed to the next postprocessing
3278 object in the chain. It can be the one it received after
3279 changing some fields.
3281 In addition, this method may raise a PostProcessingError
3282 exception that will be taken into account by the downloader
3285 return information # by default, do nothing
# Post processor that extracts the audio track of a downloaded video via
# ffmpeg/ffprobe.  NOTE(review): numbered listing with elided lines (e.g.
# 3296 @staticmethod decorators, try:/return lines); code byte-identical.
3288 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best' (keep source codec when aac/mp3), 'aac' or 'mp3'.
3290 def __init__(self, downloader=None, preferredcodec=None):
3291 PostProcessor.__init__(self, downloader)
3292 if preferredcodec is None:
3293 preferredcodec = 'best'
3294 self._preferredcodec = preferredcodec
# Probe the file with ffprobe and return its audio codec name (or None on
# failure).  The decorator line (3296) and try:/return lines are elided.
3297 def get_audio_codec(path):
3299 cmd = ['ffprobe', '-show_streams', '--', path]
3300 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3301 output = handle.communicate()[0]
3302 if handle.wait() != 0:
3304 except (IOError, OSError):
# Scan ffprobe's key=value stream output: remember the last codec_name
# seen, and accept it once a codec_type=audio line confirms it is audio.
3307 for line in output.split('\n'):
3308 if line.startswith('codec_name='):
3309 audio_codec = line.split('=')[1].strip()
3310 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Re-encode/remux `path` to `out_path` with the given codec and options;
# '-vn' drops the video stream.  Returns whether ffmpeg exited cleanly
# (the return line is elided here).
3315 def run_ffmpeg(path, out_path, codec, more_opts):
3317 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3318 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3320 except (IOError, OSError):
# Chain entry point: pick codec/extension, run ffmpeg, fix up mtime, and
# point information['filepath'] at the new audio file.
3323 def run(self, information):
3324 path = information['filepath']
3326 filecodec = self.get_audio_codec(path)
3327 if filecodec is None:
3328 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# If the source codec already matches the preference (or 'best'), copy the
# stream losslessly when it is aac/mp3; otherwise transcode to mp3.
3332 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3333 if filecodec == 'aac' or filecodec == 'mp3':
3334 # Lossless if possible
3336 extension = filecodec
3337 if filecodec == 'aac':
# Raw AAC needs the ADTS container to be playable standalone.
3338 more_opts = ['-f', 'adts']
3341 acodec = 'libmp3lame'
3343 more_opts = ['-ab', '128k']
3345 # We convert the audio (lossy)
3346 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3347 extension = self._preferredcodec
3348 more_opts = ['-ab', '128k']
3349 if self._preferredcodec == 'aac':
3350 more_opts += ['-f', 'adts']
3352 (prefix, ext) = os.path.splitext(path)
3353 new_path = prefix + '.' + extension
3354 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3355 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3358 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3361 # Try to update the date time for extracted audio file.
3362 if information.get('filetime') is not None:
3364 os.utime(new_path, (time.time(), information['filetime']))
3366 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Best-effort removal of the original video file (os.remove line elided).
3370 except (IOError, OSError):
3371 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3374 information['filepath'] = new_path
# Self-update: fetch UPDATE_URL and overwrite this script in place.
# NOTE(review): try:/finally lines are elided in this numbered listing;
# code left byte-identical, comments only.
3378 def updateSelf(downloader, filename):
3379 ''' Update the program file with the latest version from the repository '''
3380 # Note: downloader only used for options
# Bail out early if we cannot write the target file at all.
3381 if not os.access(filename, os.W_OK):
3382 sys.exit('ERROR: no write permissions on %s' % filename)
3384 downloader.to_screen('Updating to latest version...')
# Download the replacement script (UPDATE_URL points at the raw GitHub
# copy — see HEAD of file).
3388 urlh = urllib.urlopen(UPDATE_URL)
3389 newcontent = urlh.read()
3392 except (IOError, OSError), err:
3393 sys.exit('ERROR: unable to download latest version')
# Overwrite in binary mode to avoid newline translation on Windows.
3396 outf = open(filename, 'wb')
3398 outf.write(newcontent)
3401 except (IOError, OSError), err:
3402 sys.exit('ERROR: unable to overwrite current version')
3404 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Helper nested in parseOpts (the enclosing `def parseOpts` line is outside
# this excerpt).  Renders an optparse Option's flags for --help, e.g.
# ('-o', '--option') -> "-o, --option METAVAR".
3411 def _format_option_string(option):
3412 ''' ('-o', '--option') -> -o, --format METAVAR'''
# `opts` is initialised on an elided line (3413-3415) — presumably `opts = []`.
3416 if option._short_opts: opts.append(option._short_opts[0])
3417 if option._long_opts: opts.append(option._long_opts[0])
# Insert the separator only when both a short and a long form are present.
3418 if len(opts) > 1: opts.insert(1, ', ')
3420 if option.takes_value(): opts.append(' %s' % option.metavar)
3422 return "".join(opts)
# Helper nested in parseOpts: best-effort terminal width detection used to
# size the --help output.  The COLUMNS branch and the failure fallback
# (lines 3426-3429, 3433+) are elided in this excerpt.
3424 def _find_term_columns():
3425 columns = os.environ.get('COLUMNS', None)
# Fallback: ask the terminal via `stty size`, whose output is "rows cols";
# field [1] is the column count.
3430 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3431 out,err = sp.communicate()
3432 return int(out.split()[1])
# Remainder of the parseOpts body (enclosing `def` is outside this
# excerpt).  Builds the optparse parser, all option groups, and parses
# sys.argv.  Numbered listing with elided lines; code byte-identical.
3438 max_help_position = 80
3440 # No need to wrap help messages if we're on a wide console
3441 columns = _find_term_columns()
3442 if columns: max_width = columns
# Custom formatter so option strings render via _format_option_string.
3444 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3445 fmt.format_option_strings = _format_option_string
# Parser keyword arguments (dict literal partially elided around 3447-3453).
3448 'version' : __version__,
3450 'usage' : '%prog [options] url [url...]',
3451 'conflict_handler' : 'resolve',
3454 parser = optparse.OptionParser(**kw)
# One OptionGroup per help section.
3457 general = optparse.OptionGroup(parser, 'General Options')
3458 selection = optparse.OptionGroup(parser, 'Video Selection')
3459 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3460 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3461 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3462 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3463 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# --- General ---
3465 general.add_option('-h', '--help',
3466 action='help', help='print this help text and exit')
3467 general.add_option('-v', '--version',
3468 action='version', help='print program version and exit')
3469 general.add_option('-U', '--update',
3470 action='store_true', dest='update_self', help='update this program to latest version')
3471 general.add_option('-i', '--ignore-errors',
3472 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3473 general.add_option('-r', '--rate-limit',
3474 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3475 general.add_option('-R', '--retries',
3476 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3477 general.add_option('--dump-user-agent',
3478 action='store_true', dest='dump_user_agent',
3479 help='display the current browser identification', default=False)
3480 general.add_option('--list-extractors',
3481 action='store_true', dest='list_extractors',
3482 help='List all supported extractors and the URLs they would handle', default=False)
# --- Video selection ---
3484 selection.add_option('--playlist-start',
3485 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3486 selection.add_option('--playlist-end',
3487 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3488 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3489 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
# --- Authentication ---
3491 authentication.add_option('-u', '--username',
3492 dest='username', metavar='USERNAME', help='account username')
3493 authentication.add_option('-p', '--password',
3494 dest='password', metavar='PASSWORD', help='account password')
3495 authentication.add_option('-n', '--netrc',
3496 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# --- Video format ---
3499 video_format.add_option('-f', '--format',
3500 action='store', dest='format', metavar='FORMAT', help='video format code')
3501 video_format.add_option('--all-formats',
3502 action='store_const', dest='format', help='download all available video formats', const='-1')
3503 video_format.add_option('--max-quality',
3504 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# --- Verbosity / simulation ---
3507 verbosity.add_option('-q', '--quiet',
3508 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3509 verbosity.add_option('-s', '--simulate',
3510 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3511 verbosity.add_option('--skip-download',
3512 action='store_true', dest='skip_download', help='do not download the video', default=False)
3513 verbosity.add_option('-g', '--get-url',
3514 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3515 verbosity.add_option('-e', '--get-title',
3516 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3517 verbosity.add_option('--get-thumbnail',
3518 action='store_true', dest='getthumbnail',
3519 help='simulate, quiet but print thumbnail URL', default=False)
3520 verbosity.add_option('--get-description',
3521 action='store_true', dest='getdescription',
3522 help='simulate, quiet but print video description', default=False)
3523 verbosity.add_option('--get-filename',
3524 action='store_true', dest='getfilename',
3525 help='simulate, quiet but print output filename', default=False)
3526 verbosity.add_option('--no-progress',
3527 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3528 verbosity.add_option('--console-title',
3529 action='store_true', dest='consoletitle',
3530 help='display progress in console titlebar', default=False)
# --- Filesystem ---
3533 filesystem.add_option('-t', '--title',
3534 action='store_true', dest='usetitle', help='use title in file name', default=False)
3535 filesystem.add_option('-l', '--literal',
3536 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3537 filesystem.add_option('-A', '--auto-number',
3538 action='store_true', dest='autonumber',
3539 help='number downloaded files starting from 00000', default=False)
3540 filesystem.add_option('-o', '--output',
3541 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3542 filesystem.add_option('-a', '--batch-file',
3543 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3544 filesystem.add_option('-w', '--no-overwrites',
3545 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3546 filesystem.add_option('-c', '--continue',
3547 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3548 filesystem.add_option('--cookies',
3549 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3550 filesystem.add_option('--no-part',
3551 action='store_true', dest='nopart', help='do not use .part files', default=False)
3552 filesystem.add_option('--no-mtime',
3553 action='store_false', dest='updatetime',
3554 help='do not use the Last-modified header to set the file modification time', default=True)
3555 filesystem.add_option('--write-description',
3556 action='store_true', dest='writedescription',
3557 help='write video description to a .description file', default=False)
3558 filesystem.add_option('--write-info-json',
3559 action='store_true', dest='writeinfojson',
3560 help='write video metadata to a .info.json file', default=False)
# --- Post-processing ---
3563 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3564 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3565 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3566 help='"best", "aac" or "mp3"; best by default')
# Register the groups; registration order defines --help section order.
3569 parser.add_option_group(general)
3570 parser.add_option_group(selection)
3571 parser.add_option_group(filesystem)
3572 parser.add_option_group(verbosity)
3573 parser.add_option_group(video_format)
3574 parser.add_option_group(authentication)
3575 parser.add_option_group(postproc)
3577 opts, args = parser.parse_args()
3579 return parser, opts, args
# Build the ordered extractor list; first suitable() match wins.
# NOTE(review): the docstring close and the `return [` line (3584/3588-ish)
# plus several list entries are elided in this numbered listing.
3581 def gen_extractors():
3582 """ Return a list of an instance of every supported extractor.
3583 The order does matter; the first extractor matched is the one handling the URL.
# Shared instances: several wrapper extractors delegate to these.
3585 youtube_ie = YoutubeIE()
3586 google_ie = GoogleIE()
3587 yahoo_ie = YahooIE()
# Visible entries of the returned list (others elided):
3590 MetacafeIE(youtube_ie),
3592 YoutubePlaylistIE(youtube_ie),
3593 YoutubeUserIE(youtube_ie),
3594 YoutubeSearchIE(youtube_ie),
3596 GoogleSearchIE(google_ie),
3599 YahooSearchIE(yahoo_ie),
# Body of the program's main entry point (the enclosing `def` line is
# outside this excerpt).  Numbered listing with elided lines (try:/else:/
# sys.exit branches); code left byte-identical, comments only.
3612 parser, opts, args = parseOpts()
3614 # Open appropriate CookieJar
# In-memory jar unless --cookies gave a file; then load it if readable.
3615 if opts.cookiefile is None:
3616 jar = cookielib.CookieJar()
3619 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3620 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3622 except (IOError, OSError), err:
3623 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print and (presumably, elided) exit.
3626 if opts.dump_user_agent:
3627 print std_headers['User-Agent']
3630 # Batch file verification
3632 if opts.batchfile is not None:
3634 if opts.batchfile == '-':
# (stdin branch body elided)
3637 batchfd = open(opts.batchfile, 'r')
3638 batchurls = batchfd.readlines()
3639 batchurls = [x.strip() for x in batchurls]
# Drop blanks and comment lines starting with '#', '/' or ';'.
3640 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3642 sys.exit(u'ERROR: batch file could not be read')
3643 all_urls = batchurls + args
3645 # General configuration
# Install a global opener with proxy + cookie handling and the project's
# YoutubeDLHandler; 300 s socket timeout for all requests.
3646 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3647 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3648 urllib2.install_opener(opener)
3649 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3651 extractors = gen_extractors()
# --list-extractors: show each IE and which of the given URLs it claims.
3653 if opts.list_extractors:
3654 for ie in extractors:
3656 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3657 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3658 for mu in matchedUrls:
3662 # Conflicting, missing and erroneous options
3663 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3664 parser.error(u'using .netrc conflicts with giving username/password')
3665 if opts.password is not None and opts.username is None:
3666 parser.error(u'account username missing')
3667 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3668 parser.error(u'using output template conflicts with using title, literal title or auto number')
3669 if opts.usetitle and opts.useliteral:
3670 parser.error(u'using title conflicts with using literal title')
# Prompt for the password interactively when only a username was given.
3671 if opts.username is not None and opts.password is None:
3672 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalise string options into numbers, erroring out on bad values.
3673 if opts.ratelimit is not None:
3674 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3675 if numeric_limit is None:
3676 parser.error(u'invalid rate limit specified')
3677 opts.ratelimit = numeric_limit
3678 if opts.retries is not None:
3680 opts.retries = long(opts.retries)
3681 except (TypeError, ValueError), err:
3682 parser.error(u'invalid retry count specified')
3684 opts.playliststart = int(opts.playliststart)
3685 if opts.playliststart <= 0:
3686 raise ValueError(u'Playlist start must be positive')
3687 except (TypeError, ValueError), err:
3688 parser.error(u'invalid playlist start number specified')
3690 opts.playlistend = int(opts.playlistend)
3691 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3692 raise ValueError(u'Playlist end must be greater than playlist start')
3693 except (TypeError, ValueError), err:
3694 parser.error(u'invalid playlist end number specified')
3695 if opts.extractaudio:
3696 if opts.audioformat not in ['best', 'aac', 'mp3']:
3697 parser.error(u'invalid audio format specified')
# Build the FileDownloader from the parsed options.  Any --get-* flag
# implies quiet + skip_download.  The outtmpl expression picks the first
# truthy template in priority order, ending at the plain id default.
3700 fd = FileDownloader({
3701 'usenetrc': opts.usenetrc,
3702 'username': opts.username,
3703 'password': opts.password,
3704 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3705 'forceurl': opts.geturl,
3706 'forcetitle': opts.gettitle,
3707 'forcethumbnail': opts.getthumbnail,
3708 'forcedescription': opts.getdescription,
3709 'forcefilename': opts.getfilename,
3710 'simulate': opts.simulate,
3711 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3712 'format': opts.format,
3713 'format_limit': opts.format_limit,
3714 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3715 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3716 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3717 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3718 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3719 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3720 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3721 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3722 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3723 or u'%(id)s.%(ext)s'),
3724 'ignoreerrors': opts.ignoreerrors,
3725 'ratelimit': opts.ratelimit,
3726 'nooverwrites': opts.nooverwrites,
3727 'retries': opts.retries,
3728 'continuedl': opts.continue_dl,
3729 'noprogress': opts.noprogress,
3730 'playliststart': opts.playliststart,
3731 'playlistend': opts.playlistend,
# Writing to stdout ('-o -') means logs must go to stderr instead.
3732 'logtostderr': opts.outtmpl == '-',
3733 'consoletitle': opts.consoletitle,
3734 'nopart': opts.nopart,
3735 'updatetime': opts.updatetime,
3736 'writedescription': opts.writedescription,
3737 'writeinfojson': opts.writeinfojson,
3738 'matchtitle': opts.matchtitle,
3739 'rejecttitle': opts.rejecttitle,
3741 for extractor in extractors:
3742 fd.add_info_extractor(extractor)
# PostProcessors
3745 if opts.extractaudio:
3746 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update: replace this script with the latest published version.
3749 if opts.update_self:
3750 updateSelf(fd, sys.argv[0])
# With no URLs, -U alone is valid; otherwise it is a usage error.
3753 if len(all_urls) < 1:
3754 if not opts.update_self:
3755 parser.error(u'you must provide at least one URL')
3758 retcode = fd.download(all_urls)
3760 # Dump cookie jar if requested
3761 if opts.cookiefile is not None:
# (jar.save() try body elided)
3764 except (IOError, OSError), err:
3765 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run the main routine and translate known exceptions
# into exit messages.  The try:/call lines (3771-3772) are elided in this
# numbered listing; code left byte-identical.
3770 if __name__ == '__main__':
# DownloadError has already been reported by the downloader, so the
# handler body (elided) presumably just exits without a second message.
3773 except DownloadError:
3775 except SameFileError:
3776 sys.exit(u'ERROR: fixed output name but more than one file to download')
3777 except KeyboardInterrupt:
3778 sys.exit(u'\nERROR: Interrupted by user')
3780 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: