2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.18'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
195 def preferredencoding():
196 """Get preferred encoding.
198 Returns the best encoding scheme for the system, based on
199 locale.getpreferredencoding() and some further tweaks.
201 def yield_preferredencoding():
203 pref = locale.getpreferredencoding()
209 return yield_preferredencoding().next()
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
272 def timeconvert(timestr):
273 """Convert RFC 2822 defined time string into system timestamp"""
275 timetuple = email.utils.parsedate_tz(timestr)
276 if timetuple is not None:
277 timestamp = email.utils.mktime_tz(timetuple)
281 class DownloadError(Exception):
282 """Download Error exception.
284 This exception may be thrown by FileDownloader objects if they are not
285 configured to continue on errors. They will contain the appropriate
291 class SameFileError(Exception):
292 """Same File exception.
294 This exception will be thrown by FileDownloader objects if they detect
295 multiple files would have to be downloaded to the same file on disk.
300 class PostProcessingError(Exception):
301 """Post Processing exception.
303 This exception may be raised by PostProcessor's .run() method to
304 indicate an error in the postprocessing task.
309 class UnavailableVideoError(Exception):
310 """Unavailable Format exception.
312 This exception will be thrown when a video is requested
313 in a format that is not available for that video.
318 class ContentTooShortError(Exception):
319 """Content Too Short exception.
321 This exception may be raised by FileDownloader objects when a file they
322 download is too small for what the server announced first, indicating
323 the connection was probably interrupted.
329 def __init__(self, downloaded, expected):
330 self.downloaded = downloaded
331 self.expected = expected
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
468 def format_bytes(bytes):
471 if type(bytes) is str:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488 def calc_eta(start, now, total, current):
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Write *message* (locale-encoded, newline-terminated) to stderr."""
    encoded = message.encode(preferredencoding())
    sys.stderr.write(encoded + '\n')
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
571 def trouble(self, message=None):
572 """Determine action to take when a download problem appears.
574 Depending on if the downloader has been configured to ignore
575 download errors or not, this method may throw an exception or
576 not when errors are found, after printing the message.
578 if message is not None:
579 self.to_stderr(message)
580 if not self.params.get('ignoreerrors', False):
581 raise DownloadError(message)
582 self._download_retcode = 1
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
604 def undo_temp_name(self, filename):
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
609 def try_rename(self, old_filename, new_filename):
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
630 os.utime(filename, (time.time(), filetime))
635 def report_writedescription(self, descfn):
636 """ Report that the description file is being written """
637 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
639 def report_writeinfojson(self, infofn):
640 """ Report that the metadata file has been written """
641 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
643 def report_destination(self, filename):
644 """Report destination filename."""
645 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
647 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648 """Report download progress."""
649 if self.params.get('noprogress', False):
651 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
656 def report_resuming_byte(self, resume_len):
657 """Report attempt to resume at given byte."""
658 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
660 def report_retry(self, count, retries):
661 """Report retry in case of HTTP error 5xx"""
662 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
664 def report_file_already_downloaded(self, file_name):
665 """Report file has already been fully downloaded."""
667 self.to_screen(u'[download] %s has already been downloaded' % file_name)
668 except (UnicodeEncodeError), err:
669 self.to_screen(u'[download] The file has already been downloaded')
671 def report_unable_to_resume(self):
672 """Report it was impossible to resume download."""
673 self.to_screen(u'[download] Unable to resume')
675 def report_finish(self):
676 """Report download finished."""
677 if self.params.get('noprogress', False):
678 self.to_screen(u'[download] Download completed')
682 def increment_downloads(self):
683 """Increment the ordinal that assigns a number to each file."""
684 self._num_downloads += 1
686 def prepare_filename(self, info_dict):
687 """Generate the output filename."""
689 template_dict = dict(info_dict)
690 template_dict['epoch'] = unicode(long(time.time()))
691 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692 filename = self.params['outtmpl'] % template_dict
694 except (ValueError, KeyError), err:
695 self.trouble(u'ERROR: invalid system charset or erroneous output template')
698 def process_info(self, info_dict):
699 """Process a single dictionary returned by an InfoExtractor."""
700 filename = self.prepare_filename(info_dict)
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
713 if self.params.get('forceformat', False):
714 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
716 # Do nothing else if in simulate mode
717 if self.params.get('simulate', False):
723 matchtitle=self.params.get('matchtitle',False)
724 rejecttitle=self.params.get('rejecttitle',False)
725 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
726 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
727 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
729 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
730 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
733 if self.params.get('nooverwrites', False) and os.path.exists(filename):
734 self.to_stderr(u'WARNING: file exists and will be skipped')
738 dn = os.path.dirname(filename)
739 if dn != '' and not os.path.exists(dn):
741 except (OSError, IOError), err:
742 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
745 if self.params.get('writedescription', False):
747 descfn = filename + '.description'
748 self.report_writedescription(descfn)
749 descfile = open(descfn, 'wb')
751 descfile.write(info_dict['description'].encode('utf-8'))
754 except (OSError, IOError):
755 self.trouble(u'ERROR: Cannot write description file ' + descfn)
758 if self.params.get('writeinfojson', False):
759 infofn = filename + '.info.json'
760 self.report_writeinfojson(infofn)
763 except (NameError,AttributeError):
764 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
767 infof = open(infofn, 'wb')
769 json.dump(info_dict, infof)
772 except (OSError, IOError):
773 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
776 if not self.params.get('skip_download', False):
778 success = self._do_download(filename, info_dict)
779 except (OSError, IOError), err:
780 raise UnavailableVideoError
781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
782 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
784 except (ContentTooShortError, ), err:
785 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
790 self.post_process(filename, info_dict)
791 except (PostProcessingError), err:
792 self.trouble(u'ERROR: postprocessing: %s' % str(err))
795 def download(self, url_list):
796 """Download a given list of URLs."""
797 if len(url_list) > 1 and self.fixed_template():
798 raise SameFileError(self.params['outtmpl'])
801 suitable_found = False
803 # Go to next InfoExtractor if not suitable
804 if not ie.suitable(url):
807 # Suitable InfoExtractor found
808 suitable_found = True
810 # Extract information from URL and process it
813 # Suitable InfoExtractor had been found; go to next URL
816 if not suitable_found:
817 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819 return self._download_retcode
821 def post_process(self, filename, ie_info):
822 """Run the postprocessing chain on the given file."""
824 info['filepath'] = filename
830 def _download_with_rtmpdump(self, filename, url, player_url):
831 self.report_destination(filename)
832 tmpfilename = self.temp_name(filename)
834 # Check for rtmpdump first
836 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
837 except (OSError, IOError):
838 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
841 # Download using rtmpdump. rtmpdump returns exit code 2 when
842 # the connection was interrumpted and resuming appears to be
843 # possible. This is part of rtmpdump's normal usage, AFAIK.
844 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
845 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
846 while retval == 2 or retval == 1:
847 prevsize = os.path.getsize(tmpfilename)
848 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
849 time.sleep(5.0) # This seems to be needed
850 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
851 cursize = os.path.getsize(tmpfilename)
852 if prevsize == cursize and retval == 1:
854 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
855 if prevsize == cursize and retval == 2 and cursize > 1024:
856 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
860 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
861 self.try_rename(tmpfilename, filename)
864 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
867 def _do_download(self, filename, info_dict):
868 url = info_dict['url']
869 player_url = info_dict.get('player_url', None)
871 # Check file already present
872 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
873 self.report_file_already_downloaded(filename)
876 # Attempt to download using rtmpdump
877 if url.startswith('rtmp'):
878 return self._download_with_rtmpdump(filename, url, player_url)
880 tmpfilename = self.temp_name(filename)
883 # Do not include the Accept-Encoding header
884 headers = {'Youtubedl-no-compression': 'True'}
885 basic_request = urllib2.Request(url, None, headers)
886 request = urllib2.Request(url, None, headers)
888 # Establish possible resume length
889 if os.path.isfile(tmpfilename):
890 resume_len = os.path.getsize(tmpfilename)
896 if self.params.get('continuedl', False):
897 self.report_resuming_byte(resume_len)
898 request.add_header('Range','bytes=%d-' % resume_len)
904 retries = self.params.get('retries', 0)
905 while count <= retries:
906 # Establish connection
908 data = urllib2.urlopen(request)
910 except (urllib2.HTTPError, ), err:
911 if (err.code < 500 or err.code >= 600) and err.code != 416:
912 # Unexpected HTTP error
914 elif err.code == 416:
915 # Unable to resume (requested range not satisfiable)
917 # Open the connection again without the range header
918 data = urllib2.urlopen(basic_request)
919 content_length = data.info()['Content-Length']
920 except (urllib2.HTTPError, ), err:
921 if err.code < 500 or err.code >= 600:
924 # Examine the reported length
925 if (content_length is not None and
926 (resume_len - 100 < long(content_length) < resume_len + 100)):
927 # The file had already been fully downloaded.
928 # Explanation to the above condition: in issue #175 it was revealed that
929 # YouTube sometimes adds or removes a few bytes from the end of the file,
930 # changing the file size slightly and causing problems for some users. So
931 # I decided to implement a suggested change and consider the file
932 # completely downloaded if the file size differs less than 100 bytes from
933 # the one in the hard drive.
934 self.report_file_already_downloaded(filename)
935 self.try_rename(tmpfilename, filename)
938 # The length does not match, we start the download over
939 self.report_unable_to_resume()
945 self.report_retry(count, retries)
948 self.trouble(u'ERROR: giving up after %s retries' % retries)
951 data_len = data.info().get('Content-length', None)
952 if data_len is not None:
953 data_len = long(data_len) + resume_len
954 data_len_str = self.format_bytes(data_len)
955 byte_counter = 0 + resume_len
961 data_block = data.read(block_size)
963 if len(data_block) == 0:
965 byte_counter += len(data_block)
967 # Open file just in time
970 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
971 assert stream is not None
972 filename = self.undo_temp_name(tmpfilename)
973 self.report_destination(filename)
974 except (OSError, IOError), err:
975 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
978 stream.write(data_block)
979 except (IOError, OSError), err:
980 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
982 block_size = self.best_block_size(after - before, len(data_block))
985 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
987 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
989 percent_str = self.calc_percent(byte_counter, data_len)
990 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
991 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
994 self.slow_down(start, byte_counter - resume_len)
997 self.trouble(u'\nERROR: Did not get any data blocks')
1000 self.report_finish()
1001 if data_len is not None and byte_counter != data_len:
1002 raise ContentTooShortError(byte_counter, long(data_len))
1003 self.try_rename(tmpfilename, filename)
1005 # Update file modification time
1006 if self.params.get('updatetime', True):
1007 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1012 class InfoExtractor(object):
1013 """Information Extractor class.
1015 Information extractors are the classes that, given a URL, extract
1016 information from the video (or videos) the URL refers to. This
1017 information includes the real video URL, the video title and simplified
1018 title, author and others. The information is stored in a dictionary
1019 which is then passed to the FileDownloader. The FileDownloader
1020 processes this information possibly downloading the video to the file
1021 system, among other possible outcomes. The dictionaries must include
1022 the following fields:
1024 id: Video identifier.
1025 url: Final video URL.
1026 uploader: Nickname of the video uploader.
1027 title: Literal title.
1028 stitle: Simplified title.
1029 ext: Video filename extension.
1030 format: Video format.
1031 player_url: SWF Player URL (may be None).
1033 The following fields are optional. Their primary purpose is to allow
1034 youtube-dl to serve as the backend for a video search function, such
1035 as the one in youtube2mp3. They are only used when their respective
1036 forced printing functions are called:
1038 thumbnail: Full URL to a video thumbnail image.
1039 description: One-line video description.
1041 Subclasses of this one should re-define the _real_initialize() and
1042 _real_extract() methods and define a _VALID_URL regexp.
1043 Probably, they should also be added to the list of extractors.
1049 def __init__(self, downloader=None):
1050 """Constructor. Receives an optional downloader."""
1052 self.set_downloader(downloader)
1054 def suitable(self, url):
1055 """Receives a URL and returns True if suitable for this IE."""
1056 return re.match(self._VALID_URL, url) is not None
1058 def initialize(self):
1059 """Initializes an instance (authentication, etc)."""
1061 self._real_initialize()
1064 def extract(self, url):
1065 """Extracts URL information and returns it in list of dicts."""
1067 return self._real_extract(url)
1069 def set_downloader(self, downloader):
1070 """Sets the downloader for this IE."""
1071 self._downloader = downloader
1073 def _real_initialize(self):
1074 """Real initialization process. Redefine in subclasses."""
1077 def _real_extract(self, url):
1078 """Real extraction process. Redefine in subclasses."""
1082 class YoutubeIE(InfoExtractor):
1083 """Information extractor for youtube.com."""
1085 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1086 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1087 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1088 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1089 _NETRC_MACHINE = 'youtube'
1090 # Listed in order of quality
1091 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1092 _video_extensions = {
1098 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1102 IE_NAME = u'youtube'
1104 def report_lang(self):
1105 """Report attempt to set language."""
1106 self._downloader.to_screen(u'[youtube] Setting language')
1108 def report_login(self):
1109 """Report attempt to log in."""
1110 self._downloader.to_screen(u'[youtube] Logging in')
1112 def report_age_confirmation(self):
1113 """Report attempt to confirm age."""
1114 self._downloader.to_screen(u'[youtube] Confirming age')
1116 def report_video_webpage_download(self, video_id):
1117 """Report attempt to download video webpage."""
1118 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
def report_video_info_webpage_download(self, video_id):
	"""Announce the start of the get_video_info download for video_id."""
	message = u'[youtube] %s: Downloading video info webpage' % video_id
	self._downloader.to_screen(message)
def report_information_extraction(self, video_id):
	"""Announce that metadata extraction for video_id has begun."""
	message = u'[youtube] %s: Extracting video information' % video_id
	self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
	"""Report that the requested format is not available for this video."""
	# NOTE: the original docstring wrongly said "Report extracted video URL."
	self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
	"""Indicate the download will use the RTMP protocol."""
	message = u'[youtube] RTMP download detected'
	self._downloader.to_screen(message)
def _real_initialize(self):
	"""Force English UI, then optionally log in and confirm age.

	NOTE(review): several original lines ('try:' lines, early returns,
	parts of the form dict literals) are missing from this excerpt;
	the remaining code is reproduced verbatim with commentary only.
	"""
	if self._downloader is None:
		# (original early 'return' missing from excerpt)
	downloader_params = self._downloader.params

	# Attempt to use provided username and password or .netrc data
	if downloader_params.get('username', None) is not None:
		username = downloader_params['username']
		password = downloader_params['password']
	elif downloader_params.get('usenetrc', False):
		# (enclosing 'try:' missing from excerpt)
		info = netrc.netrc().authenticators(self._NETRC_MACHINE)
		if info is not None:
			# (assignments of username/password from 'info' missing)
		raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
		except (IOError, netrc.NetrcParseError), err:
			# .netrc problems only warn; we fall back to anonymous access
			self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

	# Set language (best effort: failure is only a warning)
	request = urllib2.Request(self._LANG_URL)
		# (enclosing 'try:' and self.report_lang() call missing)
		urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

	# No authentication to be performed
	if username is None:
		# (original early 'return' missing from excerpt)

	# Log in -- fragment of the login form dict (opening literal missing)
	'current_form': 'loginForm',
	'action_login': 'Log In',
	'username': username,
	'password': password,
	request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
	login_results = urllib2.urlopen(request).read()
	# A loginForm in the response means credentials were rejected
	if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
		self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

	# Confirm age -- fragment of the age form dict (opening literal missing)
	'action_confirm': 'Confirm',
	request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
	self.report_age_confirmation()
	age_results = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		# Unlike login, failing age confirmation is treated as an error
		self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
	"""Download and extract information for a single YouTube video URL.

	NOTE(review): this excerpt is missing a number of original lines
	('try:'/'return'/'break'/'else:' statements, guard conditions); the
	remaining code is reproduced verbatim with commentary only.
	"""
	# Extract video id from URL
	mobj = re.match(self._VALID_URL, url)
	# (guarding 'if mobj is None:' missing from excerpt)
	self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
	video_id = mobj.group(2)  # group 2 of _VALID_URL is the video id

	# Get the watch page; has_verified=1 skips some age interstitials
	self.report_video_webpage_download(video_id)
	request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
	video_webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

	# Attempt to extract SWF player URL
	mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
	if mobj is not None:
		# un-escape the backslash-escaped URL
		player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
	# (original 'else: player_url = None' presumably missing -- TODO confirm)

	# Get video info: try several 'el' values until one yields a token
	self.report_video_info_webpage_download(video_id)
	for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
		video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
				% (video_id, el_type))
		request = urllib2.Request(video_info_url)
		video_info_webpage = urllib2.urlopen(request).read()
		video_info = parse_qs(video_info_webpage)
		if 'token' in video_info:
			# (original 'break' missing from excerpt)
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
	if 'token' not in video_info:
		if 'reason' in video_info:
			self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
		self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

	# Start extracting information
	self.report_information_extraction(video_id)

	# uploader nickname
	if 'author' not in video_info:
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
	video_uploader = urllib.unquote_plus(video_info['author'][0])

	# title
	if 'title' not in video_info:
		self._downloader.trouble(u'ERROR: unable to extract video title')
	video_title = urllib.unquote_plus(video_info['title'][0])
	video_title = video_title.decode('utf-8')
	video_title = sanitize_title(video_title)

	# simplified title: collapse anything outside simple_title_chars to '_'
	simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
	simple_title = simple_title.strip(ur'_')

	# thumbnail image
	if 'thumbnail_url' not in video_info:
		self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
		video_thumbnail = ''
	else: # don't panic if we can't find it
		video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

	# upload date: scrape from the watch page and normalise to YYYYMMDD
	mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
	if mobj is not None:
		upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
		format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
		for expression in format_expressions:
			upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

	# description (only extracted when requested)
	video_description = u'No description available.'
	if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
		mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
		if mobj is not None:
			video_description = mobj.group(1).decode('utf-8')
		html_parser = lxml.etree.HTMLParser(encoding='utf-8')
		vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
		video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
		# TODO use another parser

	# token
	video_token = urllib.unquote_plus(video_info['token'][0])

	# Decide which formats to download
	req_format = self._downloader.params.get('format', None)

	if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
		self.report_rtmp_download()
		video_url_list = [(None, video_info['conn'][0])]
	elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
		url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
		url_data = [parse_qs(uds) for uds in url_data_strs]
		url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
		url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

		# Optional quality ceiling: drop formats above format_limit
		format_limit = self._downloader.params.get('format_limit', None)
		if format_limit is not None and format_limit in self._available_formats:
			format_list = self._available_formats[self._available_formats.index(format_limit):]
		# (original 'else:' missing from excerpt)
			format_list = self._available_formats
		existing_formats = [x for x in format_list if x in url_map]
		if len(existing_formats) == 0:
			self._downloader.trouble(u'ERROR: no known formats available for video')
		if req_format is None or req_format == 'best':
			video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
		elif req_format == 'worst':
			video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
		elif req_format in ('-1', 'all'):
			video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
		# (original 'else:' missing from excerpt)
			# Specific formats. We pick the first in a slash-delimeted sequence.
			# For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
			req_formats = req_format.split('/')
			video_url_list = None
			for rf in req_formats:
				# (guarding 'if rf in url_map:' missing from excerpt)
				video_url_list = [(rf, url_map[rf])]
			if video_url_list is None:
				self._downloader.trouble(u'ERROR: requested format not available')
	# (original 'else:' missing from excerpt)
		self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

	for format_param, video_real_url in video_url_list:
		# At this point we have a new video
		self._downloader.increment_downloads()

		# File extension for the chosen itag; default 'flv'
		video_extension = self._video_extensions.get(format_param, 'flv')

		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_real_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'upload_date': upload_date,
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
			'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
			'thumbnail': video_thumbnail.decode('utf-8'),
			'description': video_description,
			'player_url': player_url,
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Groups: 1 = video id (may be 'yt-<id>' for YouTube-hosted clips),
	# 2 = URL slug used as the simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	# Disclaimer page fetched first, then the family-filter opt-out POST.
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	IE_NAME = u'metacafe'
def __init__(self, youtube_ie, downloader=None):
	# youtube_ie: YoutubeIE instance used to delegate 'yt-' prefixed ids
	# (see _real_extract).
	InfoExtractor.__init__(self, downloader)
	self._youtube_ie = youtube_ie
def report_disclaimer(self):
	"""Announce that the disclaimer page is being retrieved."""
	message = u'[metacafe] Retrieving disclaimer'
	self._downloader.to_screen(message)
def report_age_confirmation(self):
	"""Announce that the age filter is being confirmed."""
	message = u'[metacafe] Confirming age'
	self._downloader.to_screen(message)
def report_download_webpage(self, video_id):
	"""Announce the start of the webpage download for video_id."""
	message = u'[metacafe] %s: Downloading webpage' % video_id
	self._downloader.to_screen(message)
def report_extraction(self, video_id):
	"""Announce that metadata extraction for video_id has begun."""
	message = u'[metacafe] %s: Extracting information' % video_id
	self._downloader.to_screen(message)
def _real_initialize(self):
	"""Retrieve the disclaimer page, then POST the family-filter opt-out.

	NOTE(review): 'try:' lines, returns and part of the form dict are
	missing from this excerpt; code reproduced verbatim.
	"""
	# Retrieve disclaimer
	request = urllib2.Request(self._DISCLAIMER)
	self.report_disclaimer()
	disclaimer = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

	# Confirm age -- fragment of the filter form dict (opening literal missing)
	'submit': "Continue - I'm over 18",
	request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
	self.report_age_confirmation()
	disclaimer = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
	"""Extract a Metacafe video; 'yt-' prefixed ids delegate to YoutubeIE.

	NOTE(review): several original lines ('try:'/'return'/'if mobj is None:'
	guards, 'else:' branch markers) are missing from this excerpt; the
	remaining code is reproduced verbatim with commentary only.
	"""
	# Extract id and simplified title from URL
	mobj = re.match(self._VALID_URL, url)
	# (guarding 'if mobj is None:' missing from excerpt)
	self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

	video_id = mobj.group(1)

	# Check if video comes from YouTube
	mobj2 = re.match(r'^yt-(.*)$', video_id)
	if mobj2 is not None:
		# Delegate YouTube-hosted clips (and presumably return afterwards)
		self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

	# At this point we have a new video
	self._downloader.increment_downloads()

	simple_title = mobj.group(2).decode('utf-8')

	# Retrieve video webpage to extract further information
	request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
	self.report_download_webpage(video_id)
	webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

	# Extract URL, uploader and title from webpage
	self.report_extraction(video_id)
	# Primary path: direct &mediaURL= parameter in the page
	mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
	if mobj is not None:
		mediaURL = urllib.unquote(mobj.group(1))
		video_extension = mediaURL[-3:]

		# Extract gdaKey if available
		mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
		# (guard for missing gdaKey partially absent from excerpt)
		video_url = mediaURL
		gdaKey = mobj.group(1)
		video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
	# Fallback path: parse the flashvars blob for mediaData
	mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
	self._downloader.trouble(u'ERROR: unable to extract media URL')
	vardict = parse_qs(mobj.group(1))
	if 'mediaData' not in vardict:
		self._downloader.trouble(u'ERROR: unable to extract media URL')
	mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
	self._downloader.trouble(u'ERROR: unable to extract media URL')
	mediaURL = mobj.group(1).replace('\\/', '/')
	video_extension = mediaURL[-3:]
	video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

	mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract title')
	video_title = mobj.group(1).decode('utf-8')
	video_title = sanitize_title(video_title)

	mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
	self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
	video_uploader = mobj.group(1)

	# Process video information
	self._downloader.process_info({
		'id': video_id.decode('utf-8'),
		'url': video_url.decode('utf-8'),
		'uploader': video_uploader.decode('utf-8'),
		'upload_date': u'NA',
		'title': video_title,
		'stitle': simple_title,
		'ext': video_extension.decode('utf-8'),
	except UnavailableVideoError:
		self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Groups: 1 = video id, 2 = URL slug used as the simplified title.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
	# Plain pass-through to the base extractor constructor.
	InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
	"""Announce the start of the webpage download for video_id."""
	message = u'[dailymotion] %s: Downloading webpage' % video_id
	self._downloader.to_screen(message)
def report_extraction(self, video_id):
	"""Announce that metadata extraction for video_id has begun."""
	message = u'[dailymotion] %s: Extracting information' % video_id
	self._downloader.to_screen(message)
def _real_initialize(self):
	"""Per-site initialization hook (overrides InfoExtractor)."""
	# NOTE(review): method body missing from this excerpt
	# (presumably a bare 'return').
def _real_extract(self, url):
	"""Extract a Dailymotion video (SD URL from the 'sequence' flashvar).

	NOTE(review): several original lines ('try:'/'return'/'if mobj is None:'
	guards) are missing from this excerpt; code reproduced verbatim.
	"""
	# Extract id and simplified title from URL
	mobj = re.match(self._VALID_URL, url)
	# (guarding 'if mobj is None:' missing from excerpt)
	self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

	# At this point we have a new video
	self._downloader.increment_downloads()
	video_id = mobj.group(1)

	simple_title = mobj.group(2).decode('utf-8')
	video_extension = 'flv'

	# Retrieve video webpage to extract further information
	request = urllib2.Request(url)
	# Cookie disables the family filter so filtered videos are reachable
	request.add_header('Cookie', 'family_filter=off')
	self.report_download_webpage(video_id)
	webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

	# Extract URL, uploader and title from webpage
	self.report_extraction(video_id)
	mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
	self._downloader.trouble(u'ERROR: unable to extract media URL')
	sequence = urllib.unquote(mobj.group(1))
	mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
	self._downloader.trouble(u'ERROR: unable to extract media URL')
	mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

	# if needed add http://www.dailymotion.com/ if relative URL

	video_url = mediaURL

	mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract title')
	video_title = mobj.group(1).decode('utf-8')
	video_title = sanitize_title(video_title)

	mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
	video_uploader = mobj.group(1)

	# Process video information
	self._downloader.process_info({
		'id': video_id.decode('utf-8'),
		'url': video_url.decode('utf-8'),
		'uploader': video_uploader.decode('utf-8'),
		'upload_date': u'NA',
		'title': video_title,
		'stitle': simple_title,
		'ext': video_extension.decode('utf-8'),
	except UnavailableVideoError:
		self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Group 1 captures the (signed integer) docid of the video.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
	IE_NAME = u'video.google'
def __init__(self, downloader=None):
	# Plain pass-through to the base extractor constructor.
	InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
	"""Announce the start of the webpage download for video_id."""
	message = u'[video.google] %s: Downloading webpage' % video_id
	self._downloader.to_screen(message)
def report_extraction(self, video_id):
	"""Announce that metadata extraction for video_id has begun."""
	message = u'[video.google] %s: Extracting information' % video_id
	self._downloader.to_screen(message)
def _real_initialize(self):
	"""Per-site initialization hook (overrides InfoExtractor)."""
	# NOTE(review): method body missing from this excerpt
	# (presumably a bare 'return').
def _real_extract(self, url):
	"""Extract a Google Video item (mp4 when available, else flv).

	NOTE(review): several original lines ('try:'/'return'/'if mobj is None:'
	guards) are missing from this excerpt; code reproduced verbatim.
	"""
	# Extract id from URL
	mobj = re.match(self._VALID_URL, url)
	# (guarding 'if mobj is None:' missing from excerpt)
	self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

	# At this point we have a new video
	self._downloader.increment_downloads()
	video_id = mobj.group(1)

	video_extension = 'mp4'

	# Retrieve video webpage to extract further information
	request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
	self.report_download_webpage(video_id)
	webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

	# Extract URL, uploader, and title from webpage
	self.report_extraction(video_id)
	mobj = re.search(r"download_url:'([^']+)'", webpage)
	# Fallback branch: no mp4 download_url, fall back to the flv stream
	video_extension = 'flv'
	mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
	self._downloader.trouble(u'ERROR: unable to extract media URL')
	mediaURL = urllib.unquote(mobj.group(1))
	# Replace escaped '=' and '&' with their literal characters
	mediaURL = mediaURL.replace('\\x3d', '\x3d')
	mediaURL = mediaURL.replace('\\x26', '\x26')

	video_url = mediaURL

	mobj = re.search(r'<title>(.*)</title>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract title')
	video_title = mobj.group(1).decode('utf-8')
	video_title = sanitize_title(video_title)
	simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

	# Extract video description
	mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video description')
	video_description = mobj.group(1).decode('utf-8')
	if not video_description:
		video_description = 'No description available.'

	# Extract video thumbnail
	if self._downloader.params.get('forcethumbnail', False):
		# abs(int(...)) because the docid may be a negative number
		request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
		mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
		video_thumbnail = mobj.group(1)
	else: # we need something to pass to process_info
		video_thumbnail = ''

	# Process video information
	# NOTE(review): the 'uploader' entry of this dict is missing from
	# this excerpt.
	self._downloader.process_info({
		'id': video_id.decode('utf-8'),
		'url': video_url.decode('utf-8'),
		'upload_date': u'NA',
		'title': video_title,
		'stitle': simple_title,
		'ext': video_extension.decode('utf-8'),
	except UnavailableVideoError:
		self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# Group 1 captures the .flv media name from the 'current' parameter.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
	IE_NAME = u'photobucket'
def __init__(self, downloader=None):
	# Plain pass-through to the base extractor constructor.
	InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
	"""Announce the start of the webpage download for video_id."""
	message = u'[photobucket] %s: Downloading webpage' % video_id
	self._downloader.to_screen(message)
def report_extraction(self, video_id):
	"""Announce that metadata extraction for video_id has begun."""
	message = u'[photobucket] %s: Extracting information' % video_id
	self._downloader.to_screen(message)
def _real_initialize(self):
	"""Per-site initialization hook (overrides InfoExtractor)."""
	# NOTE(review): method body missing from this excerpt
	# (presumably a bare 'return').
def _real_extract(self, url):
	"""Extract a Photobucket flv from the page's video_src link.

	NOTE(review): several original lines ('try:'/'return'/'if mobj is None:'
	guards) are missing from this excerpt; code reproduced verbatim.
	"""
	# Extract id from URL
	mobj = re.match(self._VALID_URL, url)
	# (guarding 'if mobj is None:' missing from excerpt)
	self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

	# At this point we have a new video
	self._downloader.increment_downloads()
	video_id = mobj.group(1)

	video_extension = 'flv'

	# Retrieve video webpage to extract further information
	request = urllib2.Request(url)
	self.report_download_webpage(video_id)
	webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

	# Extract URL, uploader, and title from webpage
	self.report_extraction(video_id)
	mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
	self._downloader.trouble(u'ERROR: unable to extract media URL')
	mediaURL = urllib.unquote(mobj.group(1))

	video_url = mediaURL

	mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract title')
	video_title = mobj.group(1).decode('utf-8')
	video_title = sanitize_title(video_title)
	simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

	# Group 2 of the <title> regex is the uploader name
	video_uploader = mobj.group(2).decode('utf-8')

	# Process video information
	self._downloader.process_info({
		'id': video_id.decode('utf-8'),
		'url': video_url.decode('utf-8'),
		'uploader': video_uploader,
		'upload_date': u'NA',
		'title': video_title,
		'stitle': simple_title,
		'ext': video_extension.decode('utf-8'),
	except UnavailableVideoError:
		self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com."""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
	IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
	# Plain pass-through to the base extractor constructor.
	InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
	"""Announce the start of the webpage download for video_id."""
	message = u'[video.yahoo] %s: Downloading webpage' % video_id
	self._downloader.to_screen(message)
def report_extraction(self, video_id):
	"""Announce that metadata extraction for video_id has begun."""
	message = u'[video.yahoo] %s: Extracting information' % video_id
	self._downloader.to_screen(message)
def _real_initialize(self):
	"""Per-site initialization hook (overrides InfoExtractor)."""
	# NOTE(review): method body missing from this excerpt
	# (presumably a bare 'return').
def _real_extract(self, url, new_video=True):
	"""Extract a Yahoo! Video item, rewriting non-/watch/ URLs first.

	new_video: False on the recursive second pass after rewriting the URL
	to the canonical /watch/ form.

	NOTE(review): several original lines ('try:'/'return'/'if mobj is None:'
	guards) are missing from this excerpt; code reproduced verbatim.
	"""
	# Extract ID from URL
	mobj = re.match(self._VALID_URL, url)
	# (guarding 'if mobj is None:' missing from excerpt)
	self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

	# At this point we have a new video
	self._downloader.increment_downloads()
	video_id = mobj.group(2)
	video_extension = 'flv'

	# Rewrite valid but non-extractable URLs as
	# extractable English language /watch/ URLs
	if re.match(self._VPAGE_URL, url) is None:
		request = urllib2.Request(url)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
		self._downloader.trouble(u'ERROR: Unable to extract id field')
		yahoo_id = mobj.group(1)

		mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
		self._downloader.trouble(u'ERROR: Unable to extract vid field')
		yahoo_vid = mobj.group(1)

		# Recurse once on the canonical watch URL
		url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
		return self._real_extract(url, new_video=False)

	# Retrieve video webpage to extract further information
	request = urllib2.Request(url)
	self.report_download_webpage(video_id)
	webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

	# Extract uploader and title from webpage
	self.report_extraction(video_id)
	mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video title')
	video_title = mobj.group(1).decode('utf-8')
	simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

	mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video uploader')
	# NOTE(review): group(1) of this regex is the 'people'/'profile'
	# alternation, not the uploader name (which is group(2)) -- verify.
	video_uploader = mobj.group(1).decode('utf-8')

	# Extract video thumbnail
	mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
	video_thumbnail = mobj.group(1).decode('utf-8')

	# Extract video description
	mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video description')
	video_description = mobj.group(1).decode('utf-8')
	if not video_description:
		video_description = 'No description available.'

	# Extract video height and width
	mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video height')
	yv_video_height = mobj.group(1)

	mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video width')
	yv_video_width = mobj.group(1)

	# Retrieve video playlist to extract media URL
	# I'm not completely sure what all these options are, but we
	# seem to need most of them, otherwise the server sends a 401.
	yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
	yv_bitrate = '700' # according to Wikipedia this is hard-coded
	request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
			'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
			'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
	self.report_download_webpage(video_id)
	webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

	# Extract media URL from playlist XML
	mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
	self._downloader.trouble(u'ERROR: Unable to extract media URL')
	video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
	video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

	# Process video information
	# NOTE(review): 'thumbnail' appears twice in this dict literal (the
	# second, un-decoded value wins); the 'url' entry appears to be
	# missing from this excerpt.
	self._downloader.process_info({
		'id': video_id.decode('utf-8'),
		'uploader': video_uploader,
		'upload_date': u'NA',
		'title': video_title,
		'stitle': simple_title,
		'ext': video_extension.decode('utf-8'),
		'thumbnail': video_thumbnail.decode('utf-8'),
		'description': video_description,
		'thumbnail': video_thumbnail,
	except UnavailableVideoError:
		self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
	"""Information extractor for vimeo.com."""

	# _VALID_URL matches Vimeo URLs; group 1 is the numeric clip id.
	# NOTE(review): the IE_NAME class attribute appears to be missing
	# from this excerpt.
	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
def __init__(self, downloader=None):
	# Plain pass-through to the base extractor constructor.
	InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
	"""Announce the start of the webpage download for video_id."""
	message = u'[vimeo] %s: Downloading webpage' % video_id
	self._downloader.to_screen(message)
def report_extraction(self, video_id):
	"""Announce that metadata extraction for video_id has begun."""
	message = u'[vimeo] %s: Extracting information' % video_id
	self._downloader.to_screen(message)
def _real_initialize(self):
	"""Per-site initialization hook (overrides InfoExtractor)."""
	# NOTE(review): method body missing from this excerpt
	# (presumably a bare 'return').
def _real_extract(self, url, new_video=True):
	"""Extract a Vimeo clip via the moogaloop XML config endpoint.

	NOTE(review): several original lines ('try:'/'return'/'if mobj is None:'
	guards and some dict entries) are missing from this excerpt; code
	reproduced verbatim.
	"""
	# Extract ID from URL
	mobj = re.match(self._VALID_URL, url)
	# (guarding 'if mobj is None:' missing from excerpt)
	self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

	# At this point we have a new video
	self._downloader.increment_downloads()
	video_id = mobj.group(1)

	# Retrieve video webpage to extract further information
	request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
	self.report_download_webpage(video_id)
	webpage = urllib2.urlopen(request).read()
	except (urllib2.URLError, httplib.HTTPException, socket.error), err:
		self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

	# Now we begin extracting as much information as we can from what we
	# retrieved. First we extract the information common to all extractors,
	# and latter we extract those that are Vimeo specific.
	self.report_extraction(video_id)

	# Title
	mobj = re.search(r'<caption>(.*?)</caption>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video title')
	video_title = mobj.group(1).decode('utf-8')
	simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

	# Uploader
	mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video uploader')
	video_uploader = mobj.group(1).decode('utf-8')

	# Extract video thumbnail
	mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
	video_thumbnail = mobj.group(1).decode('utf-8')

	# # Extract video description
	# mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
	# self._downloader.trouble(u'ERROR: unable to extract video description')
	# video_description = mobj.group(1).decode('utf-8')
	# if not video_description: video_description = 'No description available.'
	video_description = 'Foo.'

	# Vimeo specific: extract request signature
	mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract request signature')
	sig = mobj.group(1).decode('utf-8')

	# Vimeo specific: Extract request signature expiration
	mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
	self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
	sig_exp = mobj.group(1).decode('utf-8')

	video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)

	# Process video information
	# NOTE(review): 'thumbnail' and 'description' both appear twice in
	# this dict literal (the later values win); 'url'/'ext' entries
	# appear to be missing from this excerpt.
	self._downloader.process_info({
		'id': video_id.decode('utf-8'),
		'uploader': video_uploader,
		'upload_date': u'NA',
		'title': video_title,
		'stitle': simple_title,
		'thumbnail': video_thumbnail.decode('utf-8'),
		'description': video_description,
		'thumbnail': video_thumbnail,
		'description': video_description,
	except UnavailableVideoError:
		self._downloader.trouble(u'ERROR: unable to download video')
2064 class GenericIE(InfoExtractor):
2065 """Generic last-resort information extractor."""
# Used only as a fallback when no site-specific extractor claims the URL
# (see the WARNING emitted in report_download_webpage): it scrapes common
# embed patterns (JW Player flashvars, file=/source= query parameters).
2068 IE_NAME = u'generic'
2070 def __init__(self, downloader=None):
2071 InfoExtractor.__init__(self, downloader)
2073 def report_download_webpage(self, video_id):
2074 """Report webpage download."""
2075 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2076 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2078 def report_extraction(self, video_id):
2079 """Report information extraction."""
2080 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2082 def _real_initialize(self):
2085 def _real_extract(self, url):
2086 # At this point we have a new video
2087 self._downloader.increment_downloads()
# Provisional id: the last URL path component; replaced below once the
# real media URL has been found.
2089 video_id = url.split('/')[-1]
2090 request = urllib2.Request(url)
2092 self.report_download_webpage(video_id)
2093 webpage = urllib2.urlopen(request).read()
2094 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2095 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2097 except ValueError, err:
2098 # since this is the last-resort InfoExtractor, if
2099 # this error is thrown, it'll be thrown here
2100 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2103 self.report_extraction(video_id)
2104 # Start with something easy: JW Player in SWFObject
2105 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2107 # Broaden the search a little bit
2108 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2110 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2113 # It's possible that one of the regexes
2114 # matched, but returned an empty group:
2115 if mobj.group(1) is None:
2116 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2119 video_url = urllib.unquote(mobj.group(1))
2120 video_id = os.path.basename(video_url)
2122 # here's a fun little line of code for you:
2123 video_extension = os.path.splitext(video_id)[1][1:]
2124 video_id = os.path.splitext(video_id)[0]
2126 # it's tempting to parse this further, but you would
2127 # have to take into account all the variations like
2128 # Video Title - Site Name
2129 # Site Name | Video Title
2130 # Video Title - Tagline | Site Name
2131 # and so on and so forth; it's just not practical
2132 mobj = re.search(r'<title>(.*)</title>', webpage)
2134 self._downloader.trouble(u'ERROR: unable to extract title')
2136 video_title = mobj.group(1).decode('utf-8')
2137 video_title = sanitize_title(video_title)
2138 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2140 # video uploader is domain name
2141 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this failure path guards the *uploader* (domain) match
# above, yet reports "unable to extract title" — message looks
# copy-pasted from the title branch and should say "uploader".
2143 self._downloader.trouble(u'ERROR: unable to extract title')
2145 video_uploader = mobj.group(1).decode('utf-8')
2148 # Process video information
2149 self._downloader.process_info({
2150 'id': video_id.decode('utf-8'),
2151 'url': video_url.decode('utf-8'),
2152 'uploader': video_uploader,
2153 'upload_date': u'NA',
2154 'title': video_title,
2155 'stitle': simple_title,
2156 'ext': video_extension.decode('utf-8'),
2160 except UnavailableVideoError, err:
2161 self._downloader.trouble(u'\nERROR: unable to download video')
2164 class YoutubeSearchIE(InfoExtractor):
2165 """Information Extractor for YouTube search queries."""
# Handles pseudo-URLs of the form "ytsearch:Q", "ytsearchN:Q" and
# "ytsearchall:Q"; each hit is delegated to the real YouTube extractor.
2166 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2167 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2168 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2169 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2171 _max_youtube_results = 1000
2172 IE_NAME = u'youtube:search'
2174 def __init__(self, youtube_ie, downloader=None):
2175 InfoExtractor.__init__(self, downloader)
2176 self._youtube_ie = youtube_ie
2178 def report_download_page(self, query, pagenum):
# NOTE(review): docstring says "playlist page" — copy-pasted from the
# playlist extractor; this reports a search-result page download.
2179 """Report attempt to download playlist page with given number."""
2180 query = query.decode(preferredencoding())
2181 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2183 def _real_initialize(self):
2184 self._youtube_ie.initialize()
2186 def _real_extract(self, query):
2187 mobj = re.match(self._VALID_URL, query)
2189 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2192 prefix, query = query.split(':')
2194 query = query.encode('utf-8')
# Bare "ytsearch:" prefix downloads only the first result.
2196 self._download_n_results(query, 1)
2198 elif prefix == 'all':
2199 self._download_n_results(query, self._max_youtube_results)
2205 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2207 elif n > self._max_youtube_results:
2208 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2209 n = self._max_youtube_results
2210 self._download_n_results(query, n)
2212 except ValueError: # parsing prefix as integer fails
2213 self._download_n_results(query, 1)
2216 def _download_n_results(self, query, n):
2217 """Downloads a specified number of results for a query"""
2220 already_seen = set()
2224 self.report_download_page(query, pagenum)
2225 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2226 request = urllib2.Request(result_url)
2228 page = urllib2.urlopen(request).read()
2229 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2230 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2233 # Extract video identifiers
2234 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# NOTE(review): the span-slice is just mobj.group(0); the split/strip
# assumes hrefs shaped exactly like href="/watch?v=ID" — brittle.
2235 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2236 if video_id not in already_seen:
2237 video_ids.append(video_id)
2238 already_seen.add(video_id)
2239 if len(video_ids) == n:
2240 # Specified n videos reached
# NOTE(review): loop variable `id` shadows the builtin (here and below).
2241 for id in video_ids:
2242 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2245 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2246 for id in video_ids:
2247 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2250 pagenum = pagenum + 1
2253 class GoogleSearchIE(InfoExtractor):
2254 """Information Extractor for Google Video search queries."""
# Handles "gvsearch:Q" / "gvsearchN:Q" / "gvsearchall:Q" pseudo-URLs;
# structure mirrors YoutubeSearchIE almost line for line.
2255 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2256 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2257 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2258 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2260 _max_google_results = 1000
2261 IE_NAME = u'video.google:search'
2263 def __init__(self, google_ie, downloader=None):
2264 InfoExtractor.__init__(self, downloader)
2265 self._google_ie = google_ie
2267 def report_download_page(self, query, pagenum):
# NOTE(review): "playlist page" is a copy-paste from the playlist IE.
2268 """Report attempt to download playlist page with given number."""
2269 query = query.decode(preferredencoding())
2270 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2272 def _real_initialize(self):
2273 self._google_ie.initialize()
2275 def _real_extract(self, query):
2276 mobj = re.match(self._VALID_URL, query)
2278 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2281 prefix, query = query.split(':')
2283 query = query.encode('utf-8')
2285 self._download_n_results(query, 1)
2287 elif prefix == 'all':
2288 self._download_n_results(query, self._max_google_results)
2294 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2296 elif n > self._max_google_results:
2297 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2298 n = self._max_google_results
2299 self._download_n_results(query, n)
2301 except ValueError: # parsing prefix as integer fails
2302 self._download_n_results(query, 1)
2305 def _download_n_results(self, query, n):
2306 """Downloads a specified number of results for a query"""
2309 already_seen = set()
2313 self.report_download_page(query, pagenum)
# NOTE(review): the template's "start=" parameter is fed the page
# counter directly — Google's start= is normally a *result offset*
# (page * page-size); verify against the live endpoint.
2314 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2315 request = urllib2.Request(result_url)
2317 page = urllib2.urlopen(request).read()
2318 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2319 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2322 # Extract video identifiers
2323 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2324 video_id = mobj.group(1)
2325 if video_id not in already_seen:
2326 video_ids.append(video_id)
2327 already_seen.add(video_id)
2328 if len(video_ids) == n:
2329 # Specified n videos reached
# NOTE(review): loop variable `id` shadows the builtin (here and below).
2330 for id in video_ids:
2331 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2334 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2335 for id in video_ids:
2336 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2339 pagenum = pagenum + 1
2342 class YahooSearchIE(InfoExtractor):
2343 """Information Extractor for Yahoo! Video search queries."""
# Handles "yvsearch:Q" / "yvsearchN:Q" / "yvsearchall:Q" pseudo-URLs;
# third copy of the search-IE boilerplate (see Youtube/GoogleSearchIE) —
# a shared base class would remove the triplication.
2344 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2345 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2346 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
# NOTE(review): this pattern matches any "Next" preceded by whitespace,
# anywhere in the page — much looser than the anchor-based indicators of
# the other search IEs; risk of phantom extra pages.
2347 _MORE_PAGES_INDICATOR = r'\s*Next'
2349 _max_yahoo_results = 1000
2350 IE_NAME = u'video.yahoo:search'
2352 def __init__(self, yahoo_ie, downloader=None):
2353 InfoExtractor.__init__(self, downloader)
2354 self._yahoo_ie = yahoo_ie
2356 def report_download_page(self, query, pagenum):
# NOTE(review): "playlist page" is a copy-paste from the playlist IE.
2357 """Report attempt to download playlist page with given number."""
2358 query = query.decode(preferredencoding())
2359 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2361 def _real_initialize(self):
2362 self._yahoo_ie.initialize()
2364 def _real_extract(self, query):
2365 mobj = re.match(self._VALID_URL, query)
2367 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2370 prefix, query = query.split(':')
2372 query = query.encode('utf-8')
2374 self._download_n_results(query, 1)
2376 elif prefix == 'all':
2377 self._download_n_results(query, self._max_yahoo_results)
2383 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2385 elif n > self._max_yahoo_results:
2386 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2387 n = self._max_yahoo_results
2388 self._download_n_results(query, n)
2390 except ValueError: # parsing prefix as integer fails
2391 self._download_n_results(query, 1)
2394 def _download_n_results(self, query, n):
2395 """Downloads a specified number of results for a query"""
2398 already_seen = set()
2402 self.report_download_page(query, pagenum)
2403 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2404 request = urllib2.Request(result_url)
2406 page = urllib2.urlopen(request).read()
2407 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2408 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2411 # Extract video identifiers
2412 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2413 video_id = mobj.group(1)
2414 if video_id not in already_seen:
2415 video_ids.append(video_id)
2416 already_seen.add(video_id)
2417 if len(video_ids) == n:
2418 # Specified n videos reached
# NOTE(review): loop variable `id` shadows the builtin (here and below).
2419 for id in video_ids:
2420 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2423 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2424 for id in video_ids:
2425 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2428 pagenum = pagenum + 1
2431 class YoutubePlaylistIE(InfoExtractor):
2432 """Information Extractor for YouTube playlists."""
2434 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2435 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2436 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2437 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2439 IE_NAME = u'youtube:playlist'
2441 def __init__(self, youtube_ie, downloader=None):
2442 InfoExtractor.__init__(self, downloader)
2443 self._youtube_ie = youtube_ie
2445 def report_download_page(self, playlist_id, pagenum):
2446 """Report attempt to download playlist page with given number."""
2447 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2449 def _real_initialize(self):
2450 self._youtube_ie.initialize()
2452 def _real_extract(self, url):
2453 # Extract playlist id
2454 mobj = re.match(self._VALID_URL, url)
2456 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2460 if mobj.group(3) is not None:
2461 self._youtube_ie.extract(mobj.group(3))
2464 # Download playlist pages
2465 # prefix is 'p' as default for playlists but there are other types that need extra care
2466 playlist_prefix = mobj.group(1)
2467 if playlist_prefix == 'a':
2468 playlist_access = 'artist'
2470 playlist_prefix = 'p'
2471 playlist_access = 'view_play_list'
2472 playlist_id = mobj.group(2)
2477 self.report_download_page(playlist_id, pagenum)
2478 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2480 page = urllib2.urlopen(request).read()
2481 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2482 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2485 # Extract video identifiers
2487 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2488 if mobj.group(1) not in ids_in_page:
2489 ids_in_page.append(mobj.group(1))
2490 video_ids.extend(ids_in_page)
2492 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2494 pagenum = pagenum + 1
2496 playliststart = self._downloader.params.get('playliststart', 1) - 1
2497 playlistend = self._downloader.params.get('playlistend', -1)
2498 video_ids = video_ids[playliststart:playlistend]
2500 for id in video_ids:
2501 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2505 class YoutubeUserIE(InfoExtractor):
2506 """Information Extractor for YouTube users."""
# Accepts youtube.com/user/NAME URLs or the "ytuser:NAME" shorthand and
# enumerates the user's uploads through the GData API, 50 ids at a time.
2508 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2509 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2510 _GDATA_PAGE_SIZE = 50
2511 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2512 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2514 IE_NAME = u'youtube:user'
2516 def __init__(self, youtube_ie, downloader=None):
2517 InfoExtractor.__init__(self, downloader)
2518 self._youtube_ie = youtube_ie
2520 def report_download_page(self, username, start_index):
2521 """Report attempt to download user page."""
2522 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2523 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2525 def _real_initialize(self):
2526 self._youtube_ie.initialize()
2528 def _real_extract(self, url):
2530 mobj = re.match(self._VALID_URL, url)
2532 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2535 username = mobj.group(1)
2537 # Download video ids using YouTube Data API. Result size per
2538 # query is limited (currently to 50 videos) so we need to query
2539 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2546 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2547 self.report_download_page(username, start_index)
2549 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2552 page = urllib2.urlopen(request).read()
2553 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2554 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2557 # Extract video identifiers
2560 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2561 if mobj.group(1) not in ids_in_page:
2562 ids_in_page.append(mobj.group(1))
2564 video_ids.extend(ids_in_page)
2566 # A little optimization - if current page is not
2567 # "full", ie. does not contain PAGE_SIZE video ids then
2568 # we can assume that this page is the last one - there
2569 # are no more ids on further pages - no need to query
2572 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2577 all_ids_count = len(video_ids)
2578 playliststart = self._downloader.params.get('playliststart', 1) - 1
2579 playlistend = self._downloader.params.get('playlistend', -1)
# playlistend defaults to -1 meaning "to the end"; the explicit branch
# avoids a slice of [start:-1], which would drop the last video.
2581 if playlistend == -1:
2582 video_ids = video_ids[playliststart:]
2584 video_ids = video_ids[playliststart:playlistend]
# NOTE(review): plain byte string passed to to_screen — every other call
# in the file uses a u'' literal; inconsistent.
2586 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2587 (username, all_ids_count, len(video_ids)))
2589 for video_id in video_ids:
2590 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2593 class DepositFilesIE(InfoExtractor):
2594 """Information extractor for depositfiles.com"""
# Simulates pressing the "Free download" button, then scrapes the real
# fileshare URL and the file title out of the returned page.
2596 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2597 IE_NAME = u'DepositFiles'
2599 def __init__(self, downloader=None):
2600 InfoExtractor.__init__(self, downloader)
2602 def report_download_webpage(self, file_id):
2603 """Report webpage download."""
2604 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2606 def report_extraction(self, file_id):
2607 """Report information extraction."""
2608 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2610 def _real_initialize(self):
2613 def _real_extract(self, url):
2614 # At this point we have a new file
2615 self._downloader.increment_downloads()
2617 file_id = url.split('/')[-1]
2618 # Rebuild url in english locale
2619 url = 'http://depositfiles.com/en/files/' + file_id
2621 # Retrieve file webpage with 'Free download' button pressed
2622 free_download_indication = { 'gateway_result' : '1' }
# POST body makes urllib2 issue a POST rather than a GET.
2623 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2625 self.report_download_webpage(file_id)
2626 webpage = urllib2.urlopen(request).read()
2627 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2628 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2631 # Search for the real file URL
2632 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2633 if (mobj is None) or (mobj.group(1) is None):
2634 # Try to figure out reason of the error.
2635 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2636 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): pattern '\s+' is not a raw string — works today because
# \s is not a Python escape, but r'\s+' is the safe spelling.
2637 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2638 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2640 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2643 file_url = mobj.group(1)
2644 file_extension = os.path.splitext(file_url)[1][1:]
2646 # Search for file title
2647 mobj = re.search(r'<b title="(.*?)">', webpage)
2649 self._downloader.trouble(u'ERROR: unable to extract title')
2651 file_title = mobj.group(1).decode('utf-8')
2654 # Process file information
2655 self._downloader.process_info({
2656 'id': file_id.decode('utf-8'),
2657 'url': file_url.decode('utf-8'),
2659 'upload_date': u'NA',
2660 'title': file_title,
# NOTE(review): 'stitle' is the raw title — other extractors run it
# through the simple_title_chars substitution first; verify intent.
2661 'stitle': file_title,
2662 'ext': file_extension.decode('utf-8'),
2666 except UnavailableVideoError, err:
2667 self._downloader.trouble(u'ERROR: unable to download file')
2670 class FacebookIE(InfoExtractor):
2671 """Information Extractor for Facebook"""
# Logs in via the mobile login form (credentials from --username/--password
# or .netrc), then scrapes video metadata out of inline JavaScript on the
# video page.
2673 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2674 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2675 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this ordering.
2676 _available_formats = ['highqual', 'lowqual']
2677 _video_extensions = {
2681 IE_NAME = u'facebook'
2683 def __init__(self, downloader=None):
2684 InfoExtractor.__init__(self, downloader)
2686 def _reporter(self, message):
2687 """Add header and report message."""
2688 self._downloader.to_screen(u'[facebook] %s' % message)
2690 def report_login(self):
2691 """Report attempt to log in."""
2692 self._reporter(u'Logging in')
2694 def report_video_webpage_download(self, video_id):
2695 """Report attempt to download video webpage."""
2696 self._reporter(u'%s: Downloading video webpage' % video_id)
2698 def report_information_extraction(self, video_id):
2699 """Report attempt to extract video information."""
2700 self._reporter(u'%s: Extracting video information' % video_id)
2702 def _parse_page(self, video_webpage):
2703 """Extract video information from page"""
# Field name -> scraping regex; a field is simply absent from the result
# dict when its pattern does not match (callers check with `in`).
2705 data = {'title': r'class="video_title datawrap">(.*?)</',
2706 'description': r'<div class="datawrap">(.*?)</div>',
2707 'owner': r'\("video_owner_name", "(.*?)"\)',
2708 'upload_date': r'data-date="(.*?)"',
2709 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2712 for piece in data.keys():
2713 mobj = re.search(data[piece], video_webpage)
2714 if mobj is not None:
2715 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2719 for fmt in self._available_formats:
2720 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2721 if mobj is not None:
2722 # URL is in a Javascript segment inside an escaped Unicode format within
2723 # the generally utf-8 page
2724 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2725 video_info['video_urls'] = video_urls
2729 def _real_initialize(self):
2730 if self._downloader is None:
2735 downloader_params = self._downloader.params
2737 # Attempt to use provided username and password or .netrc data
2738 if downloader_params.get('username', None) is not None:
2739 useremail = downloader_params['username']
2740 password = downloader_params['password']
2741 elif downloader_params.get('usenetrc', False):
2743 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2744 if info is not None:
2748 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2749 except (IOError, netrc.NetrcParseError), err:
2750 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Without credentials the extractor proceeds anonymously.
2753 if useremail is None:
2762 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2765 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means the login failed.
2766 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
# NOTE(review): "exceded" (sic) typo in this user-facing warning — left
# untouched here since this pass only adds comments.
2767 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2769 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2770 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2773 def _real_extract(self, url):
2774 mobj = re.match(self._VALID_URL, url)
2776 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2778 video_id = mobj.group('ID')
2781 self.report_video_webpage_download(video_id)
2782 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2784 page = urllib2.urlopen(request)
2785 video_webpage = page.read()
2786 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2787 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2790 # Start extracting information
2791 self.report_information_extraction(video_id)
2793 # Extract information
2794 video_info = self._parse_page(video_webpage)
2797 if 'owner' not in video_info:
2798 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2800 video_uploader = video_info['owner']
2803 if 'title' not in video_info:
2804 self._downloader.trouble(u'ERROR: unable to extract video title')
2806 video_title = video_info['title']
2807 video_title = video_title.decode('utf-8')
2808 video_title = sanitize_title(video_title)
2811 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2812 simple_title = simple_title.strip(ur'_')
# Missing thumbnail is only a warning — download continues with ''.
2815 if 'thumbnail' not in video_info:
2816 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2817 video_thumbnail = ''
2819 video_thumbnail = video_info['thumbnail']
2823 if 'upload_date' in video_info:
2824 upload_time = video_info['upload_date']
2825 timetuple = email.utils.parsedate_tz(upload_time)
2826 if timetuple is not None:
2828 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2833 video_description = video_info.get('description', 'No description available.')
2835 url_map = video_info['video_urls']
2836 if len(url_map.keys()) > 0:
2837 # Decide which formats to download
2838 req_format = self._downloader.params.get('format', None)
2839 format_limit = self._downloader.params.get('format_limit', None)
2841 if format_limit is not None and format_limit in self._available_formats:
2842 format_list = self._available_formats[self._available_formats.index(format_limit):]
2844 format_list = self._available_formats
2845 existing_formats = [x for x in format_list if x in url_map]
2846 if len(existing_formats) == 0:
2847 self._downloader.trouble(u'ERROR: no known formats available for video')
2849 if req_format is None:
2850 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2851 elif req_format == 'worst':
# NOTE(review): existing_formats[len(existing_formats)-1] is just
# existing_formats[-1].
2852 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2853 elif req_format == '-1':
2854 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2857 if req_format not in url_map:
2858 self._downloader.trouble(u'ERROR: requested format not available')
2860 video_url_list = [(req_format, url_map[req_format])] # Specific format
2862 for format_param, video_real_url in video_url_list:
2864 # At this point we have a new video
2865 self._downloader.increment_downloads()
2868 video_extension = self._video_extensions.get(format_param, 'mp4')
2871 # Process video information
2872 self._downloader.process_info({
2873 'id': video_id.decode('utf-8'),
2874 'url': video_real_url.decode('utf-8'),
2875 'uploader': video_uploader.decode('utf-8'),
2876 'upload_date': upload_date,
2877 'title': video_title,
2878 'stitle': simple_title,
2879 'ext': video_extension.decode('utf-8'),
# NOTE(review): dated `cond and a or b` ternary — safe here only because
# the `a` branch (a non-empty format name) is always truthy.
2880 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2881 'thumbnail': video_thumbnail.decode('utf-8'),
2882 'description': video_description.decode('utf-8'),
2885 except UnavailableVideoError, err:
2886 self._downloader.trouble(u'\nERROR: unable to download video')
2888 class BlipTVIE(InfoExtractor):
2889 """Information extractor for blip.tv"""
# Fetches the JSON description of a blip.tv post (skin=json) instead of
# scraping HTML, then maps the JSON fields into the downloader dict.
2891 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2892 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2893 IE_NAME = u'blip.tv'
2895 def report_extraction(self, file_id):
2896 """Report information extraction."""
2897 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2899 def _simplify_title(self, title):
# NOTE(review): identical helper exists in ComedyCentralIE — candidate
# for a module-level function shared by both.
2900 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2901 res = res.strip(ur'_')
2904 def _real_extract(self, url):
2905 mobj = re.match(self._VALID_URL, url)
2907 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar (separator chosen from the original URL) is appended so the JSON
# skin parameters survive URLs that already carry a query string.
2914 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2915 request = urllib2.Request(json_url)
2916 self.report_extraction(mobj.group(1))
2918 json_code = urllib2.urlopen(request).read()
2919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2920 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2923 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' object, others don't.
2924 if 'Post' in json_data:
2925 data = json_data['Post']
# NOTE(review): format string mixes 24-hour %H with %p (AM/PM) —
# presumably matching blip.tv's feed timestamps exactly; verify against
# a live datestamp before touching.
2929 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2930 video_url = data['media']['url']
2931 umobj = re.match(self._URL_EXT, video_url)
2933 raise ValueError('Can not determine filename extension')
2934 ext = umobj.group(1)
2936 self._downloader.increment_downloads()
2939 'id': data['item_id'],
2941 'uploader': data['display_name'],
2942 'upload_date': upload_date,
2943 'title': data['title'],
2944 'stitle': self._simplify_title(data['title']),
2946 'format': data['media']['mimeType'],
2947 'thumbnail': data['thumbnailUrl'],
2948 'description': data['description'],
2949 'player_url': data['embedUrl']
# KeyError is caught too: any missing JSON field is reported as a parse
# failure rather than crashing.
2951 except (ValueError,KeyError), err:
2952 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2956 self._downloader.process_info(info)
2957 except UnavailableVideoError, err:
2958 self._downloader.trouble(u'\nERROR: unable to download video')
2961 class MyVideoIE(InfoExtractor):
2962 """Information Extractor for myvideo.de."""
2964 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2965 IE_NAME = u'myvideo'
2967 def __init__(self, downloader=None):
2968 InfoExtractor.__init__(self, downloader)
2970 def report_download_webpage(self, video_id):
2971 """Report webpage download."""
2972 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2974 def report_extraction(self, video_id):
2975 """Report information extraction."""
2976 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2978 def _real_initialize(self):
2981 def _real_extract(self,url):
2982 mobj = re.match(self._VALID_URL, url)
2984 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2987 video_id = mobj.group(1)
2988 simple_title = mobj.group(2).decode('utf-8')
2989 # should actually not be necessary
2990 simple_title = sanitize_title(simple_title)
2991 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2994 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2996 self.report_download_webpage(video_id)
2997 webpage = urllib2.urlopen(request).read()
2998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2999 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3002 self.report_extraction(video_id)
3003 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3006 self._downloader.trouble(u'ERROR: unable to extract media URL')
3008 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3010 mobj = re.search('<title>([^<]+)</title>', webpage)
3012 self._downloader.trouble(u'ERROR: unable to extract title')
3015 video_title = mobj.group(1)
3016 video_title = sanitize_title(video_title)
3020 self._downloader.process_info({
3024 'upload_date': u'NA',
3025 'title': video_title,
3026 'stitle': simple_title,
3031 except UnavailableVideoError:
3032 self._downloader.trouble(u'\nERROR: Unable to download video')
# NOTE(review): sampled excerpt — the leading number on each line is an artifact
# of extraction and several original lines (returns, try:, dict literals) are
# missing. Comments below describe only what the visible lines establish.
#
# Extractor for full episodes of The Daily Show / The Colbert Report.
# Accepts either a ":tds"/":colbert"-style shortname (redirected to the show's
# full-episodes page) or a direct thedailyshow.com / colbertnation.com URL.
3034 class ComedyCentralIE(InfoExtractor):
3035 """Information extractor for The Daily Show and Colbert Report """
# Named groups: 'shortname' (tds/thedailyshow/cr/colbert/...), 'showname', 'episode'.
3037 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3038 IE_NAME = u'comedycentral'
# Progress-reporting helpers: each just prints a status line via the downloader.
3040 def report_extraction(self, episode_id):
3041 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3043 def report_config_download(self, episode_id):
3044 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3046 def report_index_download(self, episode_id):
3047 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3049 def report_player_url(self, episode_id):
3050 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Collapse runs of characters outside simple_title_chars to '_' and trim
# leading/trailing underscores — produces a filesystem-safe "simple title".
3052 def _simplify_title(self, title):
3053 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3054 res = res.strip(ur'_')
3057 def _real_extract(self, url):
3058 mobj = re.match(self._VALID_URL, url)
3060 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# A shortname like ":tds" is rewritten to the show's full-episodes URL and
# re-matched so the named groups below are populated.
3063 if mobj.group('shortname'):
3064 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3065 url = 'http://www.thedailyshow.com/full-episodes/'
3067 url = 'http://www.colbertnation.com/full-episodes/'
3068 mobj = re.match(self._VALID_URL, url)
3069 assert mobj is not None
# dlNewest: no explicit episode given — follow the redirect to the newest one.
3071 dlNewest = not mobj.group('episode')
3073 epTitle = mobj.group('showname')
3075 epTitle = mobj.group('episode')
3077 req = urllib2.Request(url)
3078 self.report_extraction(epTitle)
3080 htmlHandle = urllib2.urlopen(req)
3081 html = htmlHandle.read()
3082 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3083 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# After following HTTP redirects the final URL must identify a concrete episode.
3086 url = htmlHandle.geturl()
3087 mobj = re.match(self._VALID_URL, url)
3089 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3091 if mobj.group('episode') == '':
3092 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3094 epTitle = mobj.group('episode')
# The Flash <param> embeds both the player URL and an mtvnservices media URI.
3096 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3097 if len(mMovieParams) == 0:
3098 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirects (needed later for rtmpdump-style playback).
3101 playerUrl_raw = mMovieParams[0][0]
3102 self.report_player_url(epTitle)
3104 urlHandle = urllib2.urlopen(playerUrl_raw)
3105 playerUrl = urlHandle.geturl()
3106 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3107 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the MRSS show index for the media URI; it lists the episode's parts.
3110 uri = mMovieParams[0][1]
3111 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3112 self.report_index_download(epTitle)
3114 indexXml = urllib2.urlopen(indexUrl).read()
3115 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3116 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3119 idoc = xml.etree.ElementTree.fromstring(indexXml)
3120 itemEls = idoc.findall('.//item')
# One <item> per video part; the <guid> encodes show id and short media id
# as colon-separated components.
3121 for itemEl in itemEls:
3122 mediaId = itemEl.findall('./guid')[0].text
3123 shortMediaId = mediaId.split(':')[-1]
3124 showId = mediaId.split(':')[-2].replace('.com', '')
3125 officialTitle = itemEl.findall('./title')[0].text
3126 officialDate = itemEl.findall('./pubDate')[0].text
# Per-part configuration feed: lists the available renditions (bitrate + src).
3128 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3129 urllib.urlencode({'uri': mediaId}))
3130 configReq = urllib2.Request(configUrl)
3131 self.report_config_download(epTitle)
3133 configXml = urllib2.urlopen(configReq).read()
3134 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3135 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3138 cdoc = xml.etree.ElementTree.fromstring(configXml)
3140 for rendition in cdoc.findall('.//rendition'):
3141 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3145 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3148 # For now, just pick the highest bitrate
# presumably turls is the (bitrate, url) list built from the renditions above;
# its construction is among the missing lines — TODO confirm against upstream.
3149 format,video_url = turls[-1]
3151 self._downloader.increment_downloads()
3153 effTitle = showId + '-' + epTitle
# Partial info dict — the surrounding literal and remaining keys are missing
# from this excerpt.
3158 'upload_date': officialDate,
3160 'stitle': self._simplify_title(effTitle),
3164 'description': officialTitle,
3165 'player_url': playerUrl
3169 self._downloader.process_info(info)
3170 except UnavailableVideoError, err:
3171 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# NOTE(review): sampled excerpt — leading numbers are extraction artifacts and
# some original lines (returns, try:, the info dict opening) are missing.
#
# Extractor for escapistmagazine.com videos: scrapes OpenGraph <meta> tags for
# description/thumbnail/player, then pulls the real media URL from the player's
# JSON-ish configuration file.
3175 class EscapistIE(InfoExtractor):
3176 """Information extractor for The Escapist """
3178 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3179 IE_NAME = u'escapist'
# Progress-reporting helpers.
3181 def report_extraction(self, showName):
3182 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3184 def report_config_download(self, showName):
3185 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Same filesystem-safe title simplification as the other extractors.
3187 def _simplify_title(self, title):
3188 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3189 res = res.strip(ur'_')
3192 def _real_extract(self, url):
# HTMLParser instance used only for unescaping entities in the meta tags.
3193 htmlParser = HTMLParser.HTMLParser()
3195 mobj = re.match(self._VALID_URL, url)
3197 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3199 showName = mobj.group('showname')
3200 videoId = mobj.group('episode')
3202 self.report_extraction(showName)
3204 webPage = urllib2.urlopen(url).read()
3205 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3206 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape description / thumbnail / player URL from the page's meta tags.
3209 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3210 description = htmlParser.unescape(descMatch.group(1))
3211 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3212 imgUrl = htmlParser.unescape(imgMatch.group(1))
3213 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3214 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The config file URL is passed to the Flash player as a percent-encoded
# "config=" query parameter.
3215 configUrlMatch = re.search('config=(.*)$', playerUrl)
3216 configUrl = urllib2.unquote(configUrlMatch.group(1))
3218 self.report_config_download(showName)
3220 configJSON = urllib2.urlopen(configUrl).read()
3221 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3222 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3225 # Technically, it's JavaScript, not JSON
# Crude single-quote -> double-quote fixup so json.loads accepts it.
3226 configJSON = configJSON.replace("'", '"')
3229 config = json.loads(configJSON)
3230 except (ValueError,), err:
3231 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The media URL lives in the second playlist entry.
3234 playlist = config['playlist']
3235 videoUrl = playlist[1]['url']
3237 self._downloader.increment_downloads()
# Partial info dict — opening literal and some keys are missing from this excerpt.
3241 'uploader': showName,
3242 'upload_date': None,
3244 'stitle': self._simplify_title(showName),
3247 'thumbnail': imgUrl,
3248 'description': description,
3249 'player_url': playerUrl,
3253 self._downloader.process_info(info)
3254 except UnavailableVideoError, err:
3255 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# NOTE(review): sampled excerpt — leading numbers are extraction artifacts and
# a few docstring/body lines are missing.
#
# Abstract base for post-processing steps. Subclasses override run(); the
# downloader calls each registered post-processor in a chain, feeding each the
# previous one's return value and stopping on None.
3259 class PostProcessor(object):
3260 """Post Processor class.
3262 PostProcessor objects can be added to downloaders with their
3263 add_post_processor() method. When the downloader has finished a
3264 successful download, it will take its internal chain of PostProcessors
3265 and start calling the run() method on each one of them, first with
3266 an initial argument and then with the returned value of the previous
3269 The chain will be stopped if one of them ever returns None or the end
3270 of the chain is reached.
3272 PostProcessor objects follow a "mutual registration" process similar
3273 to InfoExtractor objects.
# _downloader: the FileDownloader this PP is attached to (may be None until
# set_downloader() is called).
3278 def __init__(self, downloader=None):
3279 self._downloader = downloader
3281 def set_downloader(self, downloader):
3282 """Sets the downloader for this PP."""
3283 self._downloader = downloader
3285 def run(self, information):
3286 """Run the PostProcessor.
3288 The "information" argument is a dictionary like the ones
3289 composed by InfoExtractors. The only difference is that this
3290 one has an extra field called "filepath" that points to the
3293 When this method returns None, the postprocessing chain is
3294 stopped. However, this method may return an information
3295 dictionary that will be passed to the next postprocessing
3296 object in the chain. It can be the one it received after
3297 changing some fields.
3299 In addition, this method may raise a PostProcessingError
3300 exception that will be taken into account by the downloader
3303 return information # by default, do nothing
# NOTE(review): sampled excerpt — leading numbers are extraction artifacts and
# several lines (returns, try:, @staticmethod decorators, else branches) are
# missing from this copy.
#
# Post-processor that extracts the audio track from a downloaded video using
# ffmpeg/ffprobe, either copying the stream losslessly (aac/mp3) or
# transcoding to the preferred codec.
3306 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best' (default), 'aac' or 'mp3'.
# preferredquality: ffmpeg '-ab' bitrate spec, e.g. '128K'.
# keepvideo: when False, the source video file is deleted after extraction.
3308 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3309 PostProcessor.__init__(self, downloader)
3310 if preferredcodec is None:
3311 preferredcodec = 'best'
3312 self._preferredcodec = preferredcodec
3313 self._preferredquality = preferredquality
3314 self._keepvideo = keepvideo
# Probe the file with ffprobe and return the name of its audio codec
# (parsed from 'codec_name=' lines), or None on failure.
3317 def get_audio_codec(path):
3319 cmd = ['ffprobe', '-show_streams', '--', path]
3320 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3321 output = handle.communicate()[0]
3322 if handle.wait() != 0:
3324 except (IOError, OSError):
3327 for line in output.split('\n'):
3328 if line.startswith('codec_name='):
3329 audio_codec = line.split('=')[1].strip()
# Only report the codec once we see it belongs to an audio stream.
3330 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to extract/convert the audio ('-vn' drops video); '--' guards
# against out_path being mistaken for an option.
3335 def run_ffmpeg(path, out_path, codec, more_opts):
3337 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3338 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3340 except (IOError, OSError):
3343 def run(self, information):
3344 path = information['filepath']
3346 filecodec = self.get_audio_codec(path)
3347 if filecodec is None:
3348 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# If the file already carries the wanted codec (or 'best'), copy losslessly
# when the container allows it; otherwise transcode.
3352 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3353 if filecodec == 'aac' or filecodec == 'mp3':
3354 # Lossless if possible
3356 extension = filecodec
3357 if filecodec == 'aac':
3358 more_opts = ['-f', 'adts']
3361 acodec = 'libmp3lame'
3364 if self._preferredquality is not None:
3365 more_opts += ['-ab', self._preferredquality]
3367 # We convert the audio (lossy)
3368 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3369 extension = self._preferredcodec
3371 if self._preferredquality is not None:
3372 more_opts += ['-ab', self._preferredquality]
3373 if self._preferredcodec == 'aac':
3374 more_opts += ['-f', 'adts']
# Output path: same prefix as the source, new audio extension.
3376 (prefix, ext) = os.path.splitext(path)
3377 new_path = prefix + '.' + extension
3378 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3379 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3382 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3385 # Try to update the date time for extracted audio file.
3386 if information.get('filetime') is not None:
3388 os.utime(new_path, (time.time(), information['filetime']))
3390 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Best-effort removal of the source video unless --keep-video was given.
3392 if not self._keepvideo:
3395 except (IOError, OSError):
3396 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Pass the new audio path down the post-processing chain.
3399 information['filepath'] = new_path
# NOTE(review): sampled excerpt — leading numbers are extraction artifacts and
# some lines (try:, sys.exit on up-to-date, outf.close) are missing.
#
# Self-update: fetch UPDATE_URL, compare the embedded __version__, and
# overwrite this script in place when a newer version exists.
3403 def updateSelf(downloader, filename):
3404 ''' Update the program file with the latest version from the repository '''
3405 # Note: downloader only used for options
# Bail out early if we cannot write to our own file.
3406 if not os.access(filename, os.W_OK):
3407 sys.exit('ERROR: no write permissions on %s' % filename)
3409 downloader.to_screen('Updating to latest version...')
3413 urlh = urllib.urlopen(UPDATE_URL)
3414 newcontent = urlh.read()
# Parse the remote script's __version__ assignment to detect "already current".
3416 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3417 if vmatch is not None and vmatch.group(1) == __version__:
3418 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3422 except (IOError, OSError), err:
3423 sys.exit('ERROR: unable to download latest version')
# Overwrite the current script with the downloaded content.
3426 outf = open(filename, 'wb')
3428 outf.write(newcontent)
3431 except (IOError, OSError), err:
3432 sys.exit('ERROR: unable to overwrite current version')
3434 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
# NOTE(review): sampled excerpt — leading numbers are extraction artifacts.
# These lines are the interior of parseOpts(); its enclosing 'def parseOpts():'
# line (and several return/except lines) are among the missing sampled lines.
#
# Builds the optparse command-line interface: a custom option formatter,
# terminal-width detection, option groups, and finally parse_args().
# Helper: render an option's flag spelling for --help, e.g. "-o, --output TEMPLATE".
3441 def _format_option_string(option):
3442 ''' ('-o', '--option') -> -o, --format METAVAR'''
3446 if option._short_opts: opts.append(option._short_opts[0])
3447 if option._long_opts: opts.append(option._long_opts[0])
3448 if len(opts) > 1: opts.insert(1, ', ')
3450 if option.takes_value(): opts.append(' %s' % option.metavar)
3452 return "".join(opts)
# Helper: best-effort terminal width — $COLUMNS first, then `stty size`.
3454 def _find_term_columns():
3455 columns = os.environ.get('COLUMNS', None)
3460 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3461 out,err = sp.communicate()
3462 return int(out.split()[1])
3468 max_help_position = 80
3470 # No need to wrap help messages if we're on a wide console
3471 columns = _find_term_columns()
3472 if columns: max_width = columns
3474 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3475 fmt.format_option_strings = _format_option_string
# OptionParser keyword arguments (partial dict literal in this excerpt).
3478 'version' : __version__,
3480 'usage' : '%prog [options] url [url...]',
3481 'conflict_handler' : 'resolve',
3484 parser = optparse.OptionParser(**kw)
# One OptionGroup per help-screen section.
3487 general = optparse.OptionGroup(parser, 'General Options')
3488 selection = optparse.OptionGroup(parser, 'Video Selection')
3489 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3490 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3491 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3492 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3493 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3495 general.add_option('-h', '--help',
3496 action='help', help='print this help text and exit')
3497 general.add_option('-v', '--version',
3498 action='version', help='print program version and exit')
3499 general.add_option('-U', '--update',
3500 action='store_true', dest='update_self', help='update this program to latest version')
3501 general.add_option('-i', '--ignore-errors',
3502 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3503 general.add_option('-r', '--rate-limit',
3504 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3505 general.add_option('-R', '--retries',
3506 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3507 general.add_option('--dump-user-agent',
3508 action='store_true', dest='dump_user_agent',
3509 help='display the current browser identification', default=False)
3510 general.add_option('--list-extractors',
3511 action='store_true', dest='list_extractors',
3512 help='List all supported extractors and the URLs they would handle', default=False)
3514 selection.add_option('--playlist-start',
3515 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3516 selection.add_option('--playlist-end',
3517 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3518 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3519 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3521 authentication.add_option('-u', '--username',
3522 dest='username', metavar='USERNAME', help='account username')
3523 authentication.add_option('-p', '--password',
3524 dest='password', metavar='PASSWORD', help='account password')
3525 authentication.add_option('-n', '--netrc',
3526 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3529 video_format.add_option('-f', '--format',
3530 action='store', dest='format', metavar='FORMAT', help='video format code')
3531 video_format.add_option('--all-formats',
3532 action='store_const', dest='format', help='download all available video formats', const='all')
3533 video_format.add_option('--max-quality',
3534 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3537 verbosity.add_option('-q', '--quiet',
3538 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3539 verbosity.add_option('-s', '--simulate',
3540 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3541 verbosity.add_option('--skip-download',
3542 action='store_true', dest='skip_download', help='do not download the video', default=False)
3543 verbosity.add_option('-g', '--get-url',
3544 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3545 verbosity.add_option('-e', '--get-title',
3546 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3547 verbosity.add_option('--get-thumbnail',
3548 action='store_true', dest='getthumbnail',
3549 help='simulate, quiet but print thumbnail URL', default=False)
3550 verbosity.add_option('--get-description',
3551 action='store_true', dest='getdescription',
3552 help='simulate, quiet but print video description', default=False)
3553 verbosity.add_option('--get-filename',
3554 action='store_true', dest='getfilename',
3555 help='simulate, quiet but print output filename', default=False)
3556 verbosity.add_option('--get-format',
3557 action='store_true', dest='getformat',
3558 help='simulate, quiet but print output format', default=False)
3559 verbosity.add_option('--no-progress',
3560 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3561 verbosity.add_option('--console-title',
3562 action='store_true', dest='consoletitle',
3563 help='display progress in console titlebar', default=False)
3566 filesystem.add_option('-t', '--title',
3567 action='store_true', dest='usetitle', help='use title in file name', default=False)
3568 filesystem.add_option('-l', '--literal',
3569 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3570 filesystem.add_option('-A', '--auto-number',
3571 action='store_true', dest='autonumber',
3572 help='number downloaded files starting from 00000', default=False)
3573 filesystem.add_option('-o', '--output',
3574 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3575 filesystem.add_option('-a', '--batch-file',
3576 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3577 filesystem.add_option('-w', '--no-overwrites',
3578 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3579 filesystem.add_option('-c', '--continue',
3580 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3581 filesystem.add_option('--no-continue',
3582 action='store_false', dest='continue_dl',
3583 help='do not resume partially downloaded files (restart from beginning)')
3584 filesystem.add_option('--cookies',
3585 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3586 filesystem.add_option('--no-part',
3587 action='store_true', dest='nopart', help='do not use .part files', default=False)
3588 filesystem.add_option('--no-mtime',
3589 action='store_false', dest='updatetime',
3590 help='do not use the Last-modified header to set the file modification time', default=True)
3591 filesystem.add_option('--write-description',
3592 action='store_true', dest='writedescription',
3593 help='write video description to a .description file', default=False)
3594 filesystem.add_option('--write-info-json',
3595 action='store_true', dest='writeinfojson',
3596 help='write video metadata to a .info.json file', default=False)
3599 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3600 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3601 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3602 help='"best", "aac" or "mp3"; best by default')
3603 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3604 help='ffmpeg audio bitrate specification, 128k by default')
3605 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3606 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Register all groups in the order they appear on the help screen.
3609 parser.add_option_group(general)
3610 parser.add_option_group(selection)
3611 parser.add_option_group(filesystem)
3612 parser.add_option_group(verbosity)
3613 parser.add_option_group(video_format)
3614 parser.add_option_group(authentication)
3615 parser.add_option_group(postproc)
3617 opts, args = parser.parse_args()
3619 return parser, opts, args
# NOTE(review): sampled excerpt — leading numbers are extraction artifacts; the
# list literal's opening 'return [' and most extractor entries are missing.
#
# Builds the ordered extractor list; YouTube/Google/Yahoo base extractors are
# created first because their playlist/user/search wrappers take them as a
# constructor argument.
3621 def gen_extractors():
3622 """ Return a list of an instance of every supported extractor.
3623 The order does matter; the first extractor matched is the one handling the URL.
3625 youtube_ie = YoutubeIE()
3626 google_ie = GoogleIE()
3627 yahoo_ie = YahooIE()
3629 YoutubePlaylistIE(youtube_ie),
3630 YoutubeUserIE(youtube_ie),
3631 YoutubeSearchIE(youtube_ie),
3633 MetacafeIE(youtube_ie),
3636 GoogleSearchIE(google_ie),
3639 YahooSearchIE(yahoo_ie),
# NOTE(review): sampled excerpt — leading numbers are extraction artifacts.
# These lines are the interior of the program's main routine; its 'def' line,
# several try:/else:/sys.exit lines and some dict entries are missing.
#
# Flow: parse options -> cookie jar -> batch-file URLs -> install urllib2
# opener -> conflict checks -> build FileDownloader -> register extractors and
# post-processors -> optional self-update -> download -> save cookies.
3652 parser, opts, args = parseOpts()
3654 # Open appropriate CookieJar
3655 if opts.cookiefile is None:
3656 jar = cookielib.CookieJar()
# With --cookies, use a Mozilla-format jar and preload it when readable.
3659 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3660 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3662 except (IOError, OSError), err:
3663 sys.exit(u'ERROR: unable to open cookie file')
3666 if opts.dump_user_agent:
3667 print std_headers['User-Agent']
3670 # Batch file verification
3672 if opts.batchfile is not None:
3674 if opts.batchfile == '-':
3677 batchfd = open(opts.batchfile, 'r')
3678 batchurls = batchfd.readlines()
3679 batchurls = [x.strip() for x in batchurls]
# Drop blank lines and comment lines starting with '#', '/', or ';'.
3680 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3682 sys.exit(u'ERROR: batch file could not be read')
3683 all_urls = batchurls + args
3685 # General configuration
3686 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3687 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3688 urllib2.install_opener(opener)
3689 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3691 extractors = gen_extractors()
# --list-extractors: show each extractor and which given URLs it would claim.
3693 if opts.list_extractors:
3694 for ie in extractors:
3696 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3697 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3698 for mu in matchedUrls:
3702 # Conflicting, missing and erroneous options
3703 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3704 parser.error(u'using .netrc conflicts with giving username/password')
3705 if opts.password is not None and opts.username is None:
3706 parser.error(u'account username missing')
3707 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3708 parser.error(u'using output template conflicts with using title, literal title or auto number')
3709 if opts.usetitle and opts.useliteral:
3710 parser.error(u'using title conflicts with using literal title')
# Prompt interactively when a username was given without a password.
3711 if opts.username is not None and opts.password is None:
3712 opts.password = getpass.getpass(u'Type account password and press return:')
3713 if opts.ratelimit is not None:
3714 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3715 if numeric_limit is None:
3716 parser.error(u'invalid rate limit specified')
3717 opts.ratelimit = numeric_limit
3718 if opts.retries is not None:
3720 opts.retries = long(opts.retries)
3721 except (TypeError, ValueError), err:
3722 parser.error(u'invalid retry count specified')
3724 opts.playliststart = int(opts.playliststart)
3725 if opts.playliststart <= 0:
3726 raise ValueError(u'Playlist start must be positive')
3727 except (TypeError, ValueError), err:
3728 parser.error(u'invalid playlist start number specified')
3730 opts.playlistend = int(opts.playlistend)
3731 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3732 raise ValueError(u'Playlist end must be greater than playlist start')
3733 except (TypeError, ValueError), err:
3734 parser.error(u'invalid playlist end number specified')
3735 if opts.extractaudio:
3736 if opts.audioformat not in ['best', 'aac', 'mp3']:
3737 parser.error(u'invalid audio format specified')
# Central FileDownloader configuration; any --get-* flag implies quiet mode
# and skipping the actual download.
3740 fd = FileDownloader({
3741 'usenetrc': opts.usenetrc,
3742 'username': opts.username,
3743 'password': opts.password,
3744 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3745 'forceurl': opts.geturl,
3746 'forcetitle': opts.gettitle,
3747 'forcethumbnail': opts.getthumbnail,
3748 'forcedescription': opts.getdescription,
3749 'forcefilename': opts.getfilename,
3750 'forceformat': opts.getformat,
3751 'simulate': opts.simulate,
3752 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3753 'format': opts.format,
3754 'format_limit': opts.format_limit,
# Output template: explicit -o wins; otherwise the first matching flag
# combination in this or-chain selects a default pattern.
3755 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3756 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3757 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3758 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3759 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3760 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3761 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3762 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3763 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3764 or u'%(id)s.%(ext)s'),
3765 'ignoreerrors': opts.ignoreerrors,
3766 'ratelimit': opts.ratelimit,
3767 'nooverwrites': opts.nooverwrites,
3768 'retries': opts.retries,
3769 'continuedl': opts.continue_dl,
3770 'noprogress': opts.noprogress,
3771 'playliststart': opts.playliststart,
3772 'playlistend': opts.playlistend,
# Writing output to stdout ('-o -') forces log messages onto stderr.
3773 'logtostderr': opts.outtmpl == '-',
3774 'consoletitle': opts.consoletitle,
3775 'nopart': opts.nopart,
3776 'updatetime': opts.updatetime,
3777 'writedescription': opts.writedescription,
3778 'writeinfojson': opts.writeinfojson,
3779 'matchtitle': opts.matchtitle,
3780 'rejecttitle': opts.rejecttitle,
3782 for extractor in extractors:
3783 fd.add_info_extractor(extractor)
3786 if opts.extractaudio:
3787 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
3790 if opts.update_self:
3791 updateSelf(fd, sys.argv[0])
3794 if len(all_urls) < 1:
3795 if not opts.update_self:
3796 parser.error(u'you must provide at least one URL')
3799 retcode = fd.download(all_urls)
3801 # Dump cookie jar if requested
3802 if opts.cookiefile is not None:
3805 except (IOError, OSError), err:
3806 sys.exit(u'ERROR: unable to save cookie jar')
# NOTE(review): sampled excerpt — leading numbers are extraction artifacts; the
# 'try:' line and the main() call guarded here are among the missing lines.
#
# Script entry point: run the program and translate the top-level exceptions
# into user-facing exit messages.
3811 if __name__ == '__main__':
# DownloadError has already been reported by the downloader; exit silently.
3814 except DownloadError:
3816 except SameFileError:
3817 sys.exit(u'ERROR: fixed output name but more than one file to download')
3818 except KeyboardInterrupt:
3819 sys.exit(u'\nERROR: Interrupted by user')
3821 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: