2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.18c'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module in Python 2.6; fall back to cgi on older versions.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
def raiseError(msg, i):
    """Abort parsing: raise a ValueError describing where the input broke."""
    # NOTE: s is a free variable from the enclosing loads(s) closure.
    context = msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:])
    raise ValueError(context)
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
195 def preferredencoding():
196 """Get preferred encoding.
198 Returns the best encoding scheme for the system, based on
199 locale.getpreferredencoding() and some further tweaks.
201 def yield_preferredencoding():
203 pref = locale.getpreferredencoding()
209 return yield_preferredencoding().next()
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
def timeconvert(timestr):
    """Convert an RFC 2822 defined time string into a system timestamp.

    Returns the POSIX timestamp (seconds since the epoch) as an integer,
    or None when *timestr* cannot be parsed as an RFC 2822 date.
    """
    # Explicit None fallback: the visible fragment never initialized or
    # returned timestamp, which would make unparseable input a NameError.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
281 class DownloadError(Exception):
282 """Download Error exception.
284 This exception may be thrown by FileDownloader objects if they are not
285 configured to continue on errors. They will contain the appropriate
291 class SameFileError(Exception):
292 """Same File exception.
294 This exception will be thrown by FileDownloader objects if they detect
295 multiple files would have to be downloaded to the same file on disk.
300 class PostProcessingError(Exception):
301 """Post Processing exception.
303 This exception may be raised by PostProcessor's .run() method to
304 indicate an error in the postprocessing task.
309 class UnavailableVideoError(Exception):
310 """Unavailable Format exception.
312 This exception will be thrown when a video is requested
313 in a format that is not available for that video.
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a human-readable message (str(err) was empty
        # before) while keeping the attribute interface callers rely on
        # (err.downloaded / err.expected are read when reporting the error).
        Exception.__init__(self, 'Content too short: got %s bytes, expected %s'
                           % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
468 def format_bytes(bytes):
471 if type(bytes) is str:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
488 def calc_eta(start, now, total, current):
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Print message to stderr."""
    # Encode with the locale-preferred charset before writing.
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    # Only when errors are ignored does the retcode get recorded;
    # otherwise the exception aborts processing first.
    if self.params.get('ignoreerrors', False):
        self._download_retcode = 1
    else:
        raise DownloadError(message)
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
604 def undo_temp_name(self, filename):
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
609 def try_rename(self, old_filename, new_filename):
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
630 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Report that the description file is being written."""
    note = u'[info] Writing video description to: %s' % descfn
    self.to_screen(note, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """Report that the metadata file has been written."""
    note = u'[info] Video description metadata as JSON to: %s' % infofn
    self.to_screen(note, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Report destination filename."""
    note = u'[download] Destination: %s' % filename
    self.to_screen(note, ignore_encoding_errors=True)
647 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648 """Report download progress."""
649 if self.params.get('noprogress', False):
651 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    msg = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(msg)
def report_retry(self, count, retries):
    """Report retry in case of HTTP error 5xx."""
    msg = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(msg)
664 def report_file_already_downloaded(self, file_name):
665 """Report file has already been fully downloaded."""
667 self.to_screen(u'[download] %s has already been downloaded' % file_name)
668 except (UnicodeEncodeError), err:
669 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Report it was impossible to resume download."""
    msg = u'[download] Unable to resume'
    self.to_screen(msg)
675 def report_finish(self):
676 """Report download finished."""
677 if self.params.get('noprogress', False):
678 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    self._num_downloads = self._num_downloads + 1
686 def prepare_filename(self, info_dict):
687 """Generate the output filename."""
689 template_dict = dict(info_dict)
690 template_dict['epoch'] = unicode(long(time.time()))
691 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692 filename = self.params['outtmpl'] % template_dict
694 except (ValueError, KeyError), err:
695 self.trouble(u'ERROR: invalid system charset or erroneous output template')
698 def process_info(self, info_dict):
699 """Process a single dictionary returned by an InfoExtractor."""
700 filename = self.prepare_filename(info_dict)
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
713 if self.params.get('forceformat', False):
714 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
716 # Do nothing else if in simulate mode
717 if self.params.get('simulate', False):
723 matchtitle=self.params.get('matchtitle',False)
724 rejecttitle=self.params.get('rejecttitle',False)
725 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
726 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
727 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
729 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
730 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
733 if self.params.get('nooverwrites', False) and os.path.exists(filename):
734 self.to_stderr(u'WARNING: file exists and will be skipped')
738 dn = os.path.dirname(filename)
739 if dn != '' and not os.path.exists(dn):
741 except (OSError, IOError), err:
742 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
745 if self.params.get('writedescription', False):
747 descfn = filename + '.description'
748 self.report_writedescription(descfn)
749 descfile = open(descfn, 'wb')
751 descfile.write(info_dict['description'].encode('utf-8'))
754 except (OSError, IOError):
755 self.trouble(u'ERROR: Cannot write description file ' + descfn)
758 if self.params.get('writeinfojson', False):
759 infofn = filename + '.info.json'
760 self.report_writeinfojson(infofn)
763 except (NameError,AttributeError):
764 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
767 infof = open(infofn, 'wb')
769 json.dump(info_dict, infof)
772 except (OSError, IOError):
773 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
776 if not self.params.get('skip_download', False):
778 success = self._do_download(filename, info_dict)
779 except (OSError, IOError), err:
780 raise UnavailableVideoError
781 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
782 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
784 except (ContentTooShortError, ), err:
785 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
790 self.post_process(filename, info_dict)
791 except (PostProcessingError), err:
792 self.trouble(u'ERROR: postprocessing: %s' % str(err))
795 def download(self, url_list):
796 """Download a given list of URLs."""
797 if len(url_list) > 1 and self.fixed_template():
798 raise SameFileError(self.params['outtmpl'])
801 suitable_found = False
803 # Go to next InfoExtractor if not suitable
804 if not ie.suitable(url):
807 # Suitable InfoExtractor found
808 suitable_found = True
810 # Extract information from URL and process it
813 # Suitable InfoExtractor had been found; go to next URL
816 if not suitable_found:
817 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819 return self._download_retcode
821 def post_process(self, filename, ie_info):
822 """Run the postprocessing chain on the given file."""
824 info['filepath'] = filename
830 def _download_with_rtmpdump(self, filename, url, player_url):
831 self.report_destination(filename)
832 tmpfilename = self.temp_name(filename)
834 # Check for rtmpdump first
836 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
837 except (OSError, IOError):
838 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
841 # Download using rtmpdump. rtmpdump returns exit code 2 when
842 # the connection was interrumpted and resuming appears to be
843 # possible. This is part of rtmpdump's normal usage, AFAIK.
844 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
845 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
846 while retval == 2 or retval == 1:
847 prevsize = os.path.getsize(tmpfilename)
848 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
849 time.sleep(5.0) # This seems to be needed
850 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
851 cursize = os.path.getsize(tmpfilename)
852 if prevsize == cursize and retval == 1:
854 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
855 if prevsize == cursize and retval == 2 and cursize > 1024:
856 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
860 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
861 self.try_rename(tmpfilename, filename)
864 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
867 def _do_download(self, filename, info_dict):
868 url = info_dict['url']
869 player_url = info_dict.get('player_url', None)
871 # Check file already present
872 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
873 self.report_file_already_downloaded(filename)
876 # Attempt to download using rtmpdump
877 if url.startswith('rtmp'):
878 return self._download_with_rtmpdump(filename, url, player_url)
880 tmpfilename = self.temp_name(filename)
883 # Do not include the Accept-Encoding header
884 headers = {'Youtubedl-no-compression': 'True'}
885 basic_request = urllib2.Request(url, None, headers)
886 request = urllib2.Request(url, None, headers)
888 # Establish possible resume length
889 if os.path.isfile(tmpfilename):
890 resume_len = os.path.getsize(tmpfilename)
896 if self.params.get('continuedl', False):
897 self.report_resuming_byte(resume_len)
898 request.add_header('Range','bytes=%d-' % resume_len)
904 retries = self.params.get('retries', 0)
905 while count <= retries:
906 # Establish connection
908 data = urllib2.urlopen(request)
910 except (urllib2.HTTPError, ), err:
911 if (err.code < 500 or err.code >= 600) and err.code != 416:
912 # Unexpected HTTP error
914 elif err.code == 416:
915 # Unable to resume (requested range not satisfiable)
917 # Open the connection again without the range header
918 data = urllib2.urlopen(basic_request)
919 content_length = data.info()['Content-Length']
920 except (urllib2.HTTPError, ), err:
921 if err.code < 500 or err.code >= 600:
924 # Examine the reported length
925 if (content_length is not None and
926 (resume_len - 100 < long(content_length) < resume_len + 100)):
927 # The file had already been fully downloaded.
928 # Explanation to the above condition: in issue #175 it was revealed that
929 # YouTube sometimes adds or removes a few bytes from the end of the file,
930 # changing the file size slightly and causing problems for some users. So
931 # I decided to implement a suggested change and consider the file
932 # completely downloaded if the file size differs less than 100 bytes from
933 # the one in the hard drive.
934 self.report_file_already_downloaded(filename)
935 self.try_rename(tmpfilename, filename)
938 # The length does not match, we start the download over
939 self.report_unable_to_resume()
945 self.report_retry(count, retries)
948 self.trouble(u'ERROR: giving up after %s retries' % retries)
951 data_len = data.info().get('Content-length', None)
952 if data_len is not None:
953 data_len = long(data_len) + resume_len
954 data_len_str = self.format_bytes(data_len)
955 byte_counter = 0 + resume_len
961 data_block = data.read(block_size)
963 if len(data_block) == 0:
965 byte_counter += len(data_block)
967 # Open file just in time
970 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
971 assert stream is not None
972 filename = self.undo_temp_name(tmpfilename)
973 self.report_destination(filename)
974 except (OSError, IOError), err:
975 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
978 stream.write(data_block)
979 except (IOError, OSError), err:
980 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
982 block_size = self.best_block_size(after - before, len(data_block))
985 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
987 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
989 percent_str = self.calc_percent(byte_counter, data_len)
990 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
991 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
994 self.slow_down(start, byte_counter - resume_len)
997 self.trouble(u'\nERROR: Did not get any data blocks')
1000 self.report_finish()
1001 if data_len is not None and byte_counter != data_len:
1002 raise ContentTooShortError(byte_counter, long(data_len))
1003 self.try_rename(tmpfilename, filename)
1005 # Update file modification time
1006 if self.params.get('updatetime', True):
1007 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1012 class InfoExtractor(object):
1013 """Information Extractor class.
1015 Information extractors are the classes that, given a URL, extract
1016 information from the video (or videos) the URL refers to. This
1017 information includes the real video URL, the video title and simplified
1018 title, author and others. The information is stored in a dictionary
1019 which is then passed to the FileDownloader. The FileDownloader
1020 processes this information possibly downloading the video to the file
1021 system, among other possible outcomes. The dictionaries must include
1022 the following fields:
1024 id: Video identifier.
1025 url: Final video URL.
1026 uploader: Nickname of the video uploader.
1027 title: Literal title.
1028 stitle: Simplified title.
1029 ext: Video filename extension.
1030 format: Video format.
1031 player_url: SWF Player URL (may be None).
1033 The following fields are optional. Their primary purpose is to allow
1034 youtube-dl to serve as the backend for a video search function, such
1035 as the one in youtube2mp3. They are only used when their respective
1036 forced printing functions are called:
1038 thumbnail: Full URL to a video thumbnail image.
1039 description: One-line video description.
1041 Subclasses of this one should re-define the _real_initialize() and
1042 _real_extract() methods and define a _VALID_URL regexp.
1043 Probably, they should also be added to the list of extractors.
1049 def __init__(self, downloader=None):
1050 """Constructor. Receives an optional downloader."""
1052 self.set_downloader(downloader)
def suitable(self, url):
    """Receives a URL and returns True if suitable for this IE."""
    # _VALID_URL is a class attribute each InfoExtractor subclass defines.
    found = re.match(self._VALID_URL, url)
    return found is not None
1058 def initialize(self):
1059 """Initializes an instance (authentication, etc)."""
1061 self._real_initialize()
1064 def extract(self, url):
1065 """Extracts URL information and returns it in list of dicts."""
1067 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    Stored on _downloader; the extractor reports progress through it.
    """
    self._downloader = downloader
1073 def _real_initialize(self):
1074 """Real initialization process. Redefine in subclasses."""
1077 def _real_extract(self, url):
1078 """Real extraction process. Redefine in subclasses."""
1082 class YoutubeIE(InfoExtractor):
1083 """Information extractor for youtube.com."""
1085 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1086 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1087 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1088 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1089 _NETRC_MACHINE = 'youtube'
1090 # Listed in order of quality
1091 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1092 _video_extensions = {
1098 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1103 IE_NAME = u'youtube'
def report_lang(self):
    """Report attempt to set language."""
    msg = u'[youtube] Setting language'
    self._downloader.to_screen(msg)
def report_login(self):
    """Report attempt to log in."""
    msg = u'[youtube] Logging in'
    self._downloader.to_screen(msg)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    msg = u'[youtube] Confirming age'
    self._downloader.to_screen(msg)
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    msg = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(msg)
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    msg = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(msg)
1125 def report_information_extraction(self, video_id):
1126 """Report attempt to extract video information."""
1127 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1129 def report_unavailable_format(self, video_id, format):
1130 """Report extracted video URL."""
1131 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1133 def report_rtmp_download(self):
1134 """Indicate the download will use the RTMP protocol."""
1135 self._downloader.to_screen(u'[youtube] RTMP download detected')
1137 def _real_initialize(self):
1138 if self._downloader is None:
1143 downloader_params = self._downloader.params
1145 # Attempt to use provided username and password or .netrc data
1146 if downloader_params.get('username', None) is not None:
1147 username = downloader_params['username']
1148 password = downloader_params['password']
1149 elif downloader_params.get('usenetrc', False):
1151 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1152 if info is not None:
1156 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1157 except (IOError, netrc.NetrcParseError), err:
1158 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1162 request = urllib2.Request(self._LANG_URL)
1165 urllib2.urlopen(request).read()
1166 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1167 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1170 # No authentication to be performed
1171 if username is None:
1176 'current_form': 'loginForm',
1178 'action_login': 'Log In',
1179 'username': username,
1180 'password': password,
1182 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1185 login_results = urllib2.urlopen(request).read()
1186 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1187 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1189 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1190 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1196 'action_confirm': 'Confirm',
1198 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1200 self.report_age_confirmation()
1201 age_results = urllib2.urlopen(request).read()
1202 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1203 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1206 def _real_extract(self, url):
1207 # Extract video id from URL
1208 mobj = re.match(self._VALID_URL, url)
1210 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1212 video_id = mobj.group(2)
1215 self.report_video_webpage_download(video_id)
1216 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1218 video_webpage = urllib2.urlopen(request).read()
1219 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1220 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1223 # Attempt to extract SWF player URL
1224 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1225 if mobj is not None:
1226 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1231 self.report_video_info_webpage_download(video_id)
1232 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1233 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1234 % (video_id, el_type))
1235 request = urllib2.Request(video_info_url)
1237 video_info_webpage = urllib2.urlopen(request).read()
1238 video_info = parse_qs(video_info_webpage)
1239 if 'token' in video_info:
1241 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1242 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1244 if 'token' not in video_info:
1245 if 'reason' in video_info:
1246 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1248 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1251 # Start extracting information
1252 self.report_information_extraction(video_id)
1255 if 'author' not in video_info:
1256 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1258 video_uploader = urllib.unquote_plus(video_info['author'][0])
1261 if 'title' not in video_info:
1262 self._downloader.trouble(u'ERROR: unable to extract video title')
1264 video_title = urllib.unquote_plus(video_info['title'][0])
1265 video_title = video_title.decode('utf-8')
1266 video_title = sanitize_title(video_title)
1269 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1270 simple_title = simple_title.strip(ur'_')
1273 if 'thumbnail_url' not in video_info:
1274 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1275 video_thumbnail = ''
1276 else: # don't panic if we can't find it
1277 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1281 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1282 if mobj is not None:
1283 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1284 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1285 for expression in format_expressions:
1287 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1295 video_description = u'No description available.'
1296 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1297 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1298 if mobj is not None:
1299 video_description = mobj.group(1).decode('utf-8')
1301 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1302 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1303 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1304 # TODO use another parser
1307 video_token = urllib.unquote_plus(video_info['token'][0])
1309 # Decide which formats to download
1310 req_format = self._downloader.params.get('format', None)
1312 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1313 self.report_rtmp_download()
1314 video_url_list = [(None, video_info['conn'][0])]
1315 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1316 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1317 url_data = [parse_qs(uds) for uds in url_data_strs]
1318 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1319 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1321 format_limit = self._downloader.params.get('format_limit', None)
1322 if format_limit is not None and format_limit in self._available_formats:
1323 format_list = self._available_formats[self._available_formats.index(format_limit):]
1325 format_list = self._available_formats
1326 existing_formats = [x for x in format_list if x in url_map]
1327 if len(existing_formats) == 0:
1328 self._downloader.trouble(u'ERROR: no known formats available for video')
1330 if req_format is None or req_format == 'best':
1331 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1332 elif req_format == 'worst':
1333 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1334 elif req_format in ('-1', 'all'):
1335 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1337 # Specific formats. We pick the first in a slash-delimeted sequence.
1338 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1339 req_formats = req_format.split('/')
1340 video_url_list = None
1341 for rf in req_formats:
1343 video_url_list = [(rf, url_map[rf])]
1345 if video_url_list is None:
1346 self._downloader.trouble(u'ERROR: requested format not available')
1349 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1352 for format_param, video_real_url in video_url_list:
1353 # At this point we have a new video
1354 self._downloader.increment_downloads()
1357 video_extension = self._video_extensions.get(format_param, 'flv')
1360 # Process video information
1361 self._downloader.process_info({
1362 'id': video_id.decode('utf-8'),
1363 'url': video_real_url.decode('utf-8'),
1364 'uploader': video_uploader.decode('utf-8'),
1365 'upload_date': upload_date,
1366 'title': video_title,
1367 'stitle': simple_title,
1368 'ext': video_extension.decode('utf-8'),
1369 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1370 'thumbnail': video_thumbnail.decode('utf-8'),
1371 'description': video_description,
1372 'player_url': player_url,
1374 except UnavailableVideoError, err:
1375 self._downloader.trouble(u'\nERROR: unable to download video')
1378 class MetacafeIE(InfoExtractor):
1379 """Information Extractor for metacafe.com."""
# NOTE(review): fragmentary listing — embedded original line numbers, interior
# lines elided; code kept byte-identical.
# _VALID_URL group 1 is the video id, group 2 the URL slug used as a title.
1381 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1382 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1383 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1385 IE_NAME = u'metacafe'
# Takes a YoutubeIE so 'yt-' prefixed ids can be delegated (see _real_extract).
1387 def __init__(self, youtube_ie, downloader=None):
1388 InfoExtractor.__init__(self, downloader)
1389 self._youtube_ie = youtube_ie
1391 def report_disclaimer(self):
1392 """Report disclaimer retrieval."""
1393 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1395 def report_age_confirmation(self):
1396 """Report attempt to confirm age."""
1397 self._downloader.to_screen(u'[metacafe] Confirming age')
1399 def report_download_webpage(self, video_id):
1400 """Report webpage download."""
1401 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1403 def report_extraction(self, video_id):
1404 """Report information extraction."""
1405 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Initialization fetches the disclaimer page, then POSTs the family-filter
# confirmation form so age-restricted videos are reachable.
1407 def _real_initialize(self):
1408 # Retrieve disclaimer
1409 request = urllib2.Request(self._DISCLAIMER)
1411 self.report_disclaimer()
1412 disclaimer = urllib2.urlopen(request).read()
1413 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1414 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1420 'submit': "Continue - I'm over 18",
1422 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1424 self.report_age_confirmation()
1425 disclaimer = urllib2.urlopen(request).read()
1426 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1427 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1430 def _real_extract(self, url):
1431 # Extract id and simplified title from URL
1432 mobj = re.match(self._VALID_URL, url)
1434 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1437 video_id = mobj.group(1)
1439 # Check if video comes from YouTube
1440 mobj2 = re.match(r'^yt-(.*)$', video_id)
1441 if mobj2 is not None:
1442 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1445 # At this point we have a new video
1446 self._downloader.increment_downloads()
1448 simple_title = mobj.group(2).decode('utf-8')
1450 # Retrieve video webpage to extract further information
1451 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1453 self.report_download_webpage(video_id)
1454 webpage = urllib2.urlopen(request).read()
1455 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1456 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1459 # Extract URL, uploader and title from webpage
1460 self.report_extraction(video_id)
# Two media-URL strategies: the legacy &mediaURL= parameter (optionally
# signed with gdaKey), else the flashvars 'mediaData' JSON-ish blob.
1461 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1462 if mobj is not None:
1463 mediaURL = urllib.unquote(mobj.group(1))
1464 video_extension = mediaURL[-3:]
1466 # Extract gdaKey if available
1467 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1469 video_url = mediaURL
1471 gdaKey = mobj.group(1)
1472 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1474 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1476 self._downloader.trouble(u'ERROR: unable to extract media URL')
1478 vardict = parse_qs(mobj.group(1))
1479 if 'mediaData' not in vardict:
1480 self._downloader.trouble(u'ERROR: unable to extract media URL')
1482 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1484 self._downloader.trouble(u'ERROR: unable to extract media URL')
1486 mediaURL = mobj.group(1).replace('\\/', '/')
1487 video_extension = mediaURL[-3:]
1488 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1490 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1492 self._downloader.trouble(u'ERROR: unable to extract title')
1494 video_title = mobj.group(1).decode('utf-8')
1495 video_title = sanitize_title(video_title)
1497 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1499 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1501 video_uploader = mobj.group(1)
1504 # Process video information
1505 self._downloader.process_info({
1506 'id': video_id.decode('utf-8'),
1507 'url': video_url.decode('utf-8'),
1508 'uploader': video_uploader.decode('utf-8'),
1509 'upload_date': u'NA',
1510 'title': video_title,
1511 'stitle': simple_title,
1512 'ext': video_extension.decode('utf-8'),
1516 except UnavailableVideoError:
1517 self._downloader.trouble(u'\nERROR: unable to download video')
1520 class DailymotionIE(InfoExtractor):
1521 """Information Extractor for Dailymotion"""
# NOTE(review): fragmentary listing — embedded original line numbers, interior
# lines elided; code kept byte-identical.
# Group 1 is the video id, group 2 the URL slug used as simple title.
1523 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1524 IE_NAME = u'dailymotion'
1526 def __init__(self, downloader=None):
1527 InfoExtractor.__init__(self, downloader)
1529 def report_download_webpage(self, video_id):
1530 """Report webpage download."""
1531 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1533 def report_extraction(self, video_id):
1534 """Report information extraction."""
1535 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1537 def _real_initialize(self):
1540 def _real_extract(self, url):
1541 # Extract id and simplified title from URL
1542 mobj = re.match(self._VALID_URL, url)
1544 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1547 # At this point we have a new video
1548 self._downloader.increment_downloads()
1549 video_id = mobj.group(1)
1551 simple_title = mobj.group(2).decode('utf-8')
1552 video_extension = 'flv'
1554 # Retrieve video webpage to extract further information
1555 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages are served.
1556 request.add_header('Cookie', 'family_filter=off')
1558 self.report_download_webpage(video_id)
1559 webpage = urllib2.urlopen(request).read()
1560 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1561 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1564 # Extract URL, uploader and title from webpage
1565 self.report_extraction(video_id)
# The player's "sequence" flashvar embeds the SD stream URL ('sdURL').
1566 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1568 self._downloader.trouble(u'ERROR: unable to extract media URL')
1570 sequence = urllib.unquote(mobj.group(1))
1571 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1573 self._downloader.trouble(u'ERROR: unable to extract media URL')
1575 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1577 # if needed add http://www.dailymotion.com/ if relative URL
1579 video_url = mediaURL
1581 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1583 self._downloader.trouble(u'ERROR: unable to extract title')
1585 video_title = mobj.group(1).decode('utf-8')
1586 video_title = sanitize_title(video_title)
1588 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1590 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1592 video_uploader = mobj.group(1)
1595 # Process video information
1596 self._downloader.process_info({
1597 'id': video_id.decode('utf-8'),
1598 'url': video_url.decode('utf-8'),
1599 'uploader': video_uploader.decode('utf-8'),
1600 'upload_date': u'NA',
1601 'title': video_title,
1602 'stitle': simple_title,
1603 'ext': video_extension.decode('utf-8'),
1607 except UnavailableVideoError:
1608 self._downloader.trouble(u'\nERROR: unable to download video')
1611 class GoogleIE(InfoExtractor):
1612 """Information extractor for video.google.com."""
# NOTE(review): fragmentary listing — embedded original line numbers, interior
# lines elided; code kept byte-identical.
1614 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1615 IE_NAME = u'video.google'
1617 def __init__(self, downloader=None):
1618 InfoExtractor.__init__(self, downloader)
1620 def report_download_webpage(self, video_id):
1621 """Report webpage download."""
1622 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1624 def report_extraction(self, video_id):
1625 """Report information extraction."""
1626 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1628 def _real_initialize(self):
1631 def _real_extract(self, url):
1632 # Extract id from URL
1633 mobj = re.match(self._VALID_URL, url)
1635 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1638 # At this point we have a new video
1639 self._downloader.increment_downloads()
1640 video_id = mobj.group(1)
1642 video_extension = 'mp4'
1644 # Retrieve video webpage to extract further information
1645 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1647 self.report_download_webpage(video_id)
1648 webpage = urllib2.urlopen(request).read()
1649 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1650 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1653 # Extract URL, uploader, and title from webpage
1654 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the flv stream URL, which is
# hex-escaped ('\x3d' = '=', '\x26' = '&') and must be unescaped.
1655 mobj = re.search(r"download_url:'([^']+)'", webpage)
1657 video_extension = 'flv'
1658 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1660 self._downloader.trouble(u'ERROR: unable to extract media URL')
1662 mediaURL = urllib.unquote(mobj.group(1))
1663 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1664 mediaURL = mediaURL.replace('\\x26', '\x26')
1666 video_url = mediaURL
1668 mobj = re.search(r'<title>(.*)</title>', webpage)
1670 self._downloader.trouble(u'ERROR: unable to extract title')
1672 video_title = mobj.group(1).decode('utf-8')
1673 video_title = sanitize_title(video_title)
1674 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1676 # Extract video description
1677 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1679 self._downloader.trouble(u'ERROR: unable to extract video description')
1681 video_description = mobj.group(1).decode('utf-8')
1682 if not video_description:
1683 video_description = 'No description available.'
1685 # Extract video thumbnail
# Only fetched on demand: requires a second request against the search page.
1686 if self._downloader.params.get('forcethumbnail', False):
1687 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1689 webpage = urllib2.urlopen(request).read()
1690 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1691 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1693 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1695 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1697 video_thumbnail = mobj.group(1)
1698 else: # we need something to pass to process_info
1699 video_thumbnail = ''
1702 # Process video information
1703 self._downloader.process_info({
1704 'id': video_id.decode('utf-8'),
1705 'url': video_url.decode('utf-8'),
1707 'upload_date': u'NA',
1708 'title': video_title,
1709 'stitle': simple_title,
1710 'ext': video_extension.decode('utf-8'),
1714 except UnavailableVideoError:
1715 self._downloader.trouble(u'\nERROR: unable to download video')
1718 class PhotobucketIE(InfoExtractor):
1719 """Information extractor for photobucket.com."""
# NOTE(review): fragmentary listing — embedded original line numbers, interior
# lines elided; code kept byte-identical.
# Group 1 captures the .flv filename from the 'current' query parameter.
1721 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1722 IE_NAME = u'photobucket'
1724 def __init__(self, downloader=None):
1725 InfoExtractor.__init__(self, downloader)
1727 def report_download_webpage(self, video_id):
1728 """Report webpage download."""
1729 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1731 def report_extraction(self, video_id):
1732 """Report information extraction."""
1733 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1735 def _real_initialize(self):
1738 def _real_extract(self, url):
1739 # Extract id from URL
1740 mobj = re.match(self._VALID_URL, url)
1742 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1745 # At this point we have a new video
1746 self._downloader.increment_downloads()
1747 video_id = mobj.group(1)
1749 video_extension = 'flv'
1751 # Retrieve video webpage to extract further information
1752 request = urllib2.Request(url)
1754 self.report_download_webpage(video_id)
1755 webpage = urllib2.urlopen(request).read()
1756 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1757 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1760 # Extract URL, uploader, and title from webpage
1761 self.report_extraction(video_id)
1762 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1764 self._downloader.trouble(u'ERROR: unable to extract media URL')
1766 mediaURL = urllib.unquote(mobj.group(1))
1768 video_url = mediaURL
# The <title> carries both the video title (group 1) and uploader (group 2).
1770 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1772 self._downloader.trouble(u'ERROR: unable to extract title')
1774 video_title = mobj.group(1).decode('utf-8')
1775 video_title = sanitize_title(video_title)
1776 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1778 video_uploader = mobj.group(2).decode('utf-8')
1781 # Process video information
1782 self._downloader.process_info({
1783 'id': video_id.decode('utf-8'),
1784 'url': video_url.decode('utf-8'),
1785 'uploader': video_uploader,
1786 'upload_date': u'NA',
1787 'title': video_title,
1788 'stitle': simple_title,
1789 'ext': video_extension.decode('utf-8'),
1793 except UnavailableVideoError:
1794 self._downloader.trouble(u'\nERROR: unable to download video')
1797 class YahooIE(InfoExtractor):
1798 """Information extractor for video.yahoo.com."""
# NOTE(review): fragmentary listing — embedded original line numbers, interior
# lines elided; code kept byte-identical.
1800 # _VALID_URL matches all Yahoo! Video URLs
1801 # _VPAGE_URL matches only the extractable '/watch/' URLs
1802 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1803 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1804 IE_NAME = u'video.yahoo'
1806 def __init__(self, downloader=None):
1807 InfoExtractor.__init__(self, downloader)
1809 def report_download_webpage(self, video_id):
1810 """Report webpage download."""
1811 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1813 def report_extraction(self, video_id):
1814 """Report information extraction."""
1815 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1817 def _real_initialize(self):
# new_video=False marks the single recursive retry after URL rewriting below.
1820 def _real_extract(self, url, new_video=True):
1821 # Extract ID from URL
1822 mobj = re.match(self._VALID_URL, url)
1824 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1827 # At this point we have a new video
1828 self._downloader.increment_downloads()
1829 video_id = mobj.group(2)
1830 video_extension = 'flv'
1832 # Rewrite valid but non-extractable URLs as
1833 # extractable English language /watch/ URLs
1834 if re.match(self._VPAGE_URL, url) is None:
1835 request = urllib2.Request(url)
1837 webpage = urllib2.urlopen(request).read()
1838 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1839 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1842 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1844 self._downloader.trouble(u'ERROR: Unable to extract id field')
1846 yahoo_id = mobj.group(1)
1848 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1850 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1852 yahoo_vid = mobj.group(1)
1854 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1855 return self._real_extract(url, new_video=False)
1857 # Retrieve video webpage to extract further information
1858 request = urllib2.Request(url)
1860 self.report_download_webpage(video_id)
1861 webpage = urllib2.urlopen(request).read()
1862 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1863 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1866 # Extract uploader and title from webpage
1867 self.report_extraction(video_id)
1868 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1870 self._downloader.trouble(u'ERROR: unable to extract video title')
1872 video_title = mobj.group(1).decode('utf-8')
1873 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1875 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1877 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is 'people'/'profile' from the href, not the
# uploader name in group(2) — looks like a group-index bug; verify upstream.
1879 video_uploader = mobj.group(1).decode('utf-8')
1881 # Extract video thumbnail
1882 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1884 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1886 video_thumbnail = mobj.group(1).decode('utf-8')
1888 # Extract video description
1889 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1891 self._downloader.trouble(u'ERROR: unable to extract video description')
1893 video_description = mobj.group(1).decode('utf-8')
1894 if not video_description:
1895 video_description = 'No description available.'
1897 # Extract video height and width
1898 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1900 self._downloader.trouble(u'ERROR: unable to extract video height')
1902 yv_video_height = mobj.group(1)
1904 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1906 self._downloader.trouble(u'ERROR: unable to extract video width')
1908 yv_video_width = mobj.group(1)
1910 # Retrieve video playlist to extract media URL
1911 # I'm not completely sure what all these options are, but we
1912 # seem to need most of them, otherwise the server sends a 401.
1913 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1914 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1915 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1916 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1917 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1919 self.report_download_webpage(video_id)
1920 webpage = urllib2.urlopen(request).read()
1921 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1922 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1925 # Extract media URL from playlist XML
1926 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1928 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1930 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1931 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1934 # Process video information
# NOTE(review): 'thumbnail' appears twice in this dict literal; the later,
# undecoded entry wins — likely unintended duplication.
1935 self._downloader.process_info({
1936 'id': video_id.decode('utf-8'),
1938 'uploader': video_uploader,
1939 'upload_date': u'NA',
1940 'title': video_title,
1941 'stitle': simple_title,
1942 'ext': video_extension.decode('utf-8'),
1943 'thumbnail': video_thumbnail.decode('utf-8'),
1944 'description': video_description,
1945 'thumbnail': video_thumbnail,
1948 except UnavailableVideoError:
1949 self._downloader.trouble(u'\nERROR: unable to download video')
1952 class VimeoIE(InfoExtractor):
1953 """Information extractor for vimeo.com."""
# NOTE(review): fragmentary listing — embedded original line numbers, interior
# lines elided; code kept byte-identical.
1955 # _VALID_URL matches Vimeo URLs
1956 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1959 def __init__(self, downloader=None):
1960 InfoExtractor.__init__(self, downloader)
1962 def report_download_webpage(self, video_id):
1963 """Report webpage download."""
1964 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1966 def report_extraction(self, video_id):
1967 """Report information extraction."""
1968 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1970 def _real_initialize(self):
1973 def _real_extract(self, url, new_video=True):
1974 # Extract ID from URL
1975 mobj = re.match(self._VALID_URL, url)
1977 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1980 # At this point we have a new video
1981 self._downloader.increment_downloads()
1982 video_id = mobj.group(1)
1984 # Retrieve video webpage to extract further information
# The moogaloop clip-info endpoint returns XML-ish data parsed by regex below.
1985 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1987 self.report_download_webpage(video_id)
1988 webpage = urllib2.urlopen(request).read()
1989 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1990 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1993 # Now we begin extracting as much information as we can from what we
1994 # retrieved. First we extract the information common to all extractors,
1995 # and latter we extract those that are Vimeo specific.
1996 self.report_extraction(video_id)
1999 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2001 self._downloader.trouble(u'ERROR: unable to extract video title')
2003 video_title = mobj.group(1).decode('utf-8')
2004 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2007 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2009 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2011 video_uploader = mobj.group(1).decode('utf-8')
2013 # Extract video thumbnail
2014 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2016 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2018 video_thumbnail = mobj.group(1).decode('utf-8')
2020 # # Extract video description
2021 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2023 # self._downloader.trouble(u'ERROR: unable to extract video description')
2025 # video_description = mobj.group(1).decode('utf-8')
2026 # if not video_description: video_description = 'No description available.'
# Description extraction disabled above; a placeholder is used instead.
2027 video_description = 'Foo.'
2029 # Vimeo specific: extract request signature
2030 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2032 self._downloader.trouble(u'ERROR: unable to extract request signature')
2034 sig = mobj.group(1).decode('utf-8')
2036 # Vimeo specific: Extract request signature expiration
2037 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2039 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2041 sig_exp = mobj.group(1).decode('utf-8')
# Play URL is built from the clip id plus the signature and its expiry.
2043 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2046 # Process video information
# NOTE(review): both 'thumbnail' and 'description' appear twice in this dict
# literal; the later entries win — likely unintended duplication.
2047 self._downloader.process_info({
2048 'id': video_id.decode('utf-8'),
2050 'uploader': video_uploader,
2051 'upload_date': u'NA',
2052 'title': video_title,
2053 'stitle': simple_title,
2055 'thumbnail': video_thumbnail.decode('utf-8'),
2056 'description': video_description,
2057 'thumbnail': video_thumbnail,
2058 'description': video_description,
2061 except UnavailableVideoError:
2062 self._downloader.trouble(u'ERROR: unable to download video')
2065 class GenericIE(InfoExtractor):
2066 """Generic last-resort information extractor."""
2069 IE_NAME = u'generic'
2071 def __init__(self, downloader=None):
2072 InfoExtractor.__init__(self, downloader)
2074 def report_download_webpage(self, video_id):
2075 """Report webpage download."""
2076 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2077 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2079 def report_extraction(self, video_id):
2080 """Report information extraction."""
2081 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2083 def _real_initialize(self):
2086 def _real_extract(self, url):
2087 # At this point we have a new video
2088 self._downloader.increment_downloads()
2090 video_id = url.split('/')[-1]
2091 request = urllib2.Request(url)
2093 self.report_download_webpage(video_id)
2094 webpage = urllib2.urlopen(request).read()
2095 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2096 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2098 except ValueError, err:
2099 # since this is the last-resort InfoExtractor, if
2100 # this error is thrown, it'll be thrown here
2101 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2104 self.report_extraction(video_id)
2105 # Start with something easy: JW Player in SWFObject
2106 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2108 # Broaden the search a little bit
2109 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2111 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2114 # It's possible that one of the regexes
2115 # matched, but returned an empty group:
2116 if mobj.group(1) is None:
2117 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2120 video_url = urllib.unquote(mobj.group(1))
2121 video_id = os.path.basename(video_url)
2123 # here's a fun little line of code for you:
2124 video_extension = os.path.splitext(video_id)[1][1:]
2125 video_id = os.path.splitext(video_id)[0]
2127 # it's tempting to parse this further, but you would
2128 # have to take into account all the variations like
2129 # Video Title - Site Name
2130 # Site Name | Video Title
2131 # Video Title - Tagline | Site Name
2132 # and so on and so forth; it's just not practical
2133 mobj = re.search(r'<title>(.*)</title>', webpage)
2135 self._downloader.trouble(u'ERROR: unable to extract title')
2137 video_title = mobj.group(1).decode('utf-8')
2138 video_title = sanitize_title(video_title)
2139 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2141 # video uploader is domain name
2142 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2144 self._downloader.trouble(u'ERROR: unable to extract title')
2146 video_uploader = mobj.group(1).decode('utf-8')
2149 # Process video information
2150 self._downloader.process_info({
2151 'id': video_id.decode('utf-8'),
2152 'url': video_url.decode('utf-8'),
2153 'uploader': video_uploader,
2154 'upload_date': u'NA',
2155 'title': video_title,
2156 'stitle': simple_title,
2157 'ext': video_extension.decode('utf-8'),
2161 except UnavailableVideoError, err:
2162 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles 'ytsearch[N|all]:<query>' pseudo-URLs: scrapes YouTube result
# pages, collects up to N video ids, and delegates each to the wrapped
# YoutubeIE. Elided listing: original line numbers jump, so some guard /
# loop-setup statements between visible lines are not shown.
2165 class YoutubeSearchIE(InfoExtractor):
2166 """Information Extractor for YouTube search queries."""
2167 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2168 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2169 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2170 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on how many results a single search may request.
2172 _max_youtube_results = 1000
2173 IE_NAME = u'youtube:search'
2175 def __init__(self, youtube_ie, downloader=None):
2176 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this wrapped YoutubeIE instance.
2177 self._youtube_ie = youtube_ie
2179 def report_download_page(self, query, pagenum):
2180 """Report attempt to download playlist page with given number."""
2181 query = query.decode(preferredencoding())
2182 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2184 def _real_initialize(self):
2185 self._youtube_ie.initialize()
2187 def _real_extract(self, query):
2188 mobj = re.match(self._VALID_URL, query)
2190 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# prefix is the optional count ('', digits, or 'all') before the colon.
2193 prefix, query = query.split(':')
2195 query = query.encode('utf-8')
2197 self._download_n_results(query, 1)
2199 elif prefix == 'all':
2200 self._download_n_results(query, self._max_youtube_results)
2206 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2208 elif n > self._max_youtube_results:
2209 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2210 n = self._max_youtube_results
2211 self._download_n_results(query, n)
2213 except ValueError: # parsing prefix as integer fails
2214 self._download_n_results(query, 1)
2217 def _download_n_results(self, query, n):
2218 """Downloads a specified number of results for a query"""
2221 already_seen = set()
2225 self.report_download_page(query, pagenum)
2226 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2227 request = urllib2.Request(result_url)
2229 page = urllib2.urlopen(request).read()
2230 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2231 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2234 # Extract video identifiers
2235 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the 11-char id out of href="/watch?v=..." (drops trailing quote).
2236 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2237 if video_id not in already_seen:
2238 video_ids.append(video_id)
2239 already_seen.add(video_id)
2240 if len(video_ids) == n:
2241 # Specified n videos reached
2242 for id in video_ids:
2243 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link on the page means this was the last result page.
2246 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2247 for id in video_ids:
2248 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2251 pagenum = pagenum + 1
# Handles 'gvsearch[N|all]:<query>' pseudo-URLs for Google Video search.
# Structurally parallel to YoutubeSearchIE: page through results, collect
# ids, delegate each to the wrapped GoogleIE. Elided listing: original line
# numbers jump, so some guard/loop-setup statements are not shown.
2254 class GoogleSearchIE(InfoExtractor):
2255 """Information Extractor for Google Video search queries."""
2256 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2257 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2258 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2259 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2261 _max_google_results = 1000
2262 IE_NAME = u'video.google:search'
2264 def __init__(self, google_ie, downloader=None):
2265 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this wrapped GoogleIE instance.
2266 self._google_ie = google_ie
2268 def report_download_page(self, query, pagenum):
2269 """Report attempt to download playlist page with given number."""
2270 query = query.decode(preferredencoding())
2271 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2273 def _real_initialize(self):
2274 self._google_ie.initialize()
2276 def _real_extract(self, query):
2277 mobj = re.match(self._VALID_URL, query)
2279 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2282 prefix, query = query.split(':')
2284 query = query.encode('utf-8')
2286 self._download_n_results(query, 1)
2288 elif prefix == 'all':
2289 self._download_n_results(query, self._max_google_results)
2295 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2297 elif n > self._max_google_results:
2298 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2299 n = self._max_google_results
2300 self._download_n_results(query, n)
2302 except ValueError: # parsing prefix as integer fails
2303 self._download_n_results(query, 1)
2306 def _download_n_results(self, query, n):
2307 """Downloads a specified number of results for a query"""
2310 already_seen = set()
2314 self.report_download_page(query, pagenum)
2315 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2316 request = urllib2.Request(result_url)
2318 page = urllib2.urlopen(request).read()
2319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2320 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2323 # Extract video identifiers
2324 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2325 video_id = mobj.group(1)
2326 if video_id not in already_seen:
2327 video_ids.append(video_id)
2328 already_seen.add(video_id)
2329 if len(video_ids) == n:
2330 # Specified n videos reached
2331 for id in video_ids:
2332 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker: last page reached, flush what we collected.
2335 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2336 for id in video_ids:
2337 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2340 pagenum = pagenum + 1
# Handles 'yvsearch[N|all]:<query>' pseudo-URLs for Yahoo! Video search.
# Same page-then-delegate pattern as the other *SearchIE classes. Elided
# listing: original line numbers jump, so some guard/loop-setup statements
# are not shown.
2343 class YahooSearchIE(InfoExtractor):
2344 """Information Extractor for Yahoo! Video search queries."""
2345 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2346 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2347 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2348 _MORE_PAGES_INDICATOR = r'\s*Next'
2350 _max_yahoo_results = 1000
2351 IE_NAME = u'video.yahoo:search'
2353 def __init__(self, yahoo_ie, downloader=None):
2354 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this wrapped YahooIE instance.
2355 self._yahoo_ie = yahoo_ie
2357 def report_download_page(self, query, pagenum):
2358 """Report attempt to download playlist page with given number."""
2359 query = query.decode(preferredencoding())
2360 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2362 def _real_initialize(self):
2363 self._yahoo_ie.initialize()
2365 def _real_extract(self, query):
2366 mobj = re.match(self._VALID_URL, query)
2368 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2371 prefix, query = query.split(':')
2373 query = query.encode('utf-8')
2375 self._download_n_results(query, 1)
2377 elif prefix == 'all':
2378 self._download_n_results(query, self._max_yahoo_results)
2384 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2386 elif n > self._max_yahoo_results:
2387 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2388 n = self._max_yahoo_results
2389 self._download_n_results(query, n)
2391 except ValueError: # parsing prefix as integer fails
2392 self._download_n_results(query, 1)
2395 def _download_n_results(self, query, n):
2396 """Downloads a specified number of results for a query"""
2399 already_seen = set()
2403 self.report_download_page(query, pagenum)
2404 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2405 request = urllib2.Request(result_url)
2407 page = urllib2.urlopen(request).read()
2408 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2409 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2412 # Extract video identifiers
2413 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2414 video_id = mobj.group(1)
2415 if video_id not in already_seen:
2416 video_ids.append(video_id)
2417 already_seen.add(video_id)
2418 if len(video_ids) == n:
2419 # Specified n videos reached
2420 for id in video_ids:
2421 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker: last page reached, flush what we collected.
2424 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2425 for id in video_ids:
2426 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2429 pagenum = pagenum + 1
2432 class YoutubePlaylistIE(InfoExtractor):
2433 """Information Extractor for YouTube playlists."""
2435 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2436 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2437 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2438 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2440 IE_NAME = u'youtube:playlist'
2442 def __init__(self, youtube_ie, downloader=None):
2443 InfoExtractor.__init__(self, downloader)
2444 self._youtube_ie = youtube_ie
2446 def report_download_page(self, playlist_id, pagenum):
2447 """Report attempt to download playlist page with given number."""
2448 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2450 def _real_initialize(self):
2451 self._youtube_ie.initialize()
2453 def _real_extract(self, url):
2454 # Extract playlist id
2455 mobj = re.match(self._VALID_URL, url)
2457 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2461 if mobj.group(3) is not None:
2462 self._youtube_ie.extract(mobj.group(3))
2465 # Download playlist pages
2466 # prefix is 'p' as default for playlists but there are other types that need extra care
2467 playlist_prefix = mobj.group(1)
2468 if playlist_prefix == 'a':
2469 playlist_access = 'artist'
2471 playlist_prefix = 'p'
2472 playlist_access = 'view_play_list'
2473 playlist_id = mobj.group(2)
2478 self.report_download_page(playlist_id, pagenum)
2479 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2481 page = urllib2.urlopen(request).read()
2482 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2483 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2486 # Extract video identifiers
2488 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2489 if mobj.group(1) not in ids_in_page:
2490 ids_in_page.append(mobj.group(1))
2491 video_ids.extend(ids_in_page)
2493 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2495 pagenum = pagenum + 1
2497 playliststart = self._downloader.params.get('playliststart', 1) - 1
2498 playlistend = self._downloader.params.get('playlistend', -1)
2499 video_ids = video_ids[playliststart:playlistend]
2501 for id in video_ids:
2502 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Fetches a YouTube user's uploads via the GData API, paging by
# _GDATA_PAGE_SIZE, then delegates each collected id to the wrapped
# YoutubeIE. Elided listing: original line numbers jump, so some guard /
# loop-setup statements between visible lines are not shown.
2506 class YoutubeUserIE(InfoExtractor):
2507 """Information Extractor for YouTube users."""
2509 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2510 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps each query's result size; we page in chunks of this many ids.
2511 _GDATA_PAGE_SIZE = 50
2512 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2513 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2515 IE_NAME = u'youtube:user'
2517 def __init__(self, youtube_ie, downloader=None):
2518 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this wrapped YoutubeIE instance.
2519 self._youtube_ie = youtube_ie
2521 def report_download_page(self, username, start_index):
2522 """Report attempt to download user page."""
2523 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2524 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2526 def _real_initialize(self):
2527 self._youtube_ie.initialize()
2529 def _real_extract(self, url):
2531 mobj = re.match(self._VALID_URL, url)
2533 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2536 username = mobj.group(1)
2538 # Download video ids using YouTube Data API. Result size per
2539 # query is limited (currently to 50 videos) so we need to query
2540 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
2547 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2548 self.report_download_page(username, start_index)
2550 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2553 page = urllib2.urlopen(request).read()
2554 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2555 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2558 # Extract video identifiers
2561 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2562 if mobj.group(1) not in ids_in_page:
2563 ids_in_page.append(mobj.group(1))
2565 video_ids.extend(ids_in_page)
2567 # A little optimization - if current page is not
2568 # "full", ie. does not contain PAGE_SIZE video ids then
2569 # we can assume that this page is the last one - there
2570 # are no more ids on further pages - no need to query
2573 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2578 all_ids_count = len(video_ids)
2579 playliststart = self._downloader.params.get('playliststart', 1) - 1
2580 playlistend = self._downloader.params.get('playlistend', -1)
# playlistend == -1 means "no end limit"; guarded so the slice does not
# treat -1 as a negative index and drop the last video.
2582 if playlistend == -1:
2583 video_ids = video_ids[playliststart:]
2585 video_ids = video_ids[playliststart:playlistend]
2587 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2588 (username, all_ids_count, len(video_ids)))
2590 for video_id in video_ids:
2591 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extractor for depositfiles.com file pages: simulates pressing the
# 'Free download' button, then scrapes the real fileshare URL and title.
# Elided listing: original line numbers jump, so some guard statements
# between visible lines are not shown.
2594 class DepositFilesIE(InfoExtractor):
2595 """Information extractor for depositfiles.com"""
2597 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2598 IE_NAME = u'DepositFiles'
2600 def __init__(self, downloader=None):
2601 InfoExtractor.__init__(self, downloader)
2603 def report_download_webpage(self, file_id):
2604 """Report webpage download."""
2605 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2607 def report_extraction(self, file_id):
2608 """Report information extraction."""
2609 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2611 def _real_initialize(self):
2614 def _real_extract(self, url):
2615 # At this point we have a new file
2616 self._downloader.increment_downloads()
2618 file_id = url.split('/')[-1]
2619 # Rebuild url in english locale
2620 url = 'http://depositfiles.com/en/files/' + file_id
2622 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 is what triggers the free-download flow.
2623 free_download_indication = { 'gateway_result' : '1' }
2624 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2626 self.report_download_webpage(file_id)
2627 webpage = urllib2.urlopen(request).read()
2628 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2629 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2632 # Search for the real file URL
2633 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2634 if (mobj is None) or (mobj.group(1) is None):
2635 # Try to figure out reason of the error.
2636 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2637 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' is a non-raw string; fine in Python 2 (unknown
# escapes pass through) but should be r'\s+' for correctness.
2638 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2639 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2641 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2644 file_url = mobj.group(1)
2645 file_extension = os.path.splitext(file_url)[1][1:]
2647 # Search for file title
2648 mobj = re.search(r'<b title="(.*?)">', webpage)
2650 self._downloader.trouble(u'ERROR: unable to extract title')
2652 file_title = mobj.group(1).decode('utf-8')
2655 # Process file information
2656 self._downloader.process_info({
2657 'id': file_id.decode('utf-8'),
2658 'url': file_url.decode('utf-8'),
2660 'upload_date': u'NA',
2661 'title': file_title,
2662 'stitle': file_title,
2663 'ext': file_extension.decode('utf-8'),
2667 except UnavailableVideoError, err:
2668 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for Facebook videos. Logs in (credentials or .netrc) during
# initialization, scrapes the video page for metadata and per-format URLs
# embedded in escaped-Unicode JavaScript, then applies the user's format
# selection. Elided listing: original line numbers jump, so some guard /
# assignment statements between visible lines are not shown.
2671 class FacebookIE(InfoExtractor):
2672 """Information Extractor for Facebook"""
2674 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2675 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2676 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below relies on this ordering.
2677 _available_formats = ['highqual', 'lowqual']
2678 _video_extensions = {
2682 IE_NAME = u'facebook'
2684 def __init__(self, downloader=None):
2685 InfoExtractor.__init__(self, downloader)
2687 def _reporter(self, message):
2688 """Add header and report message."""
2689 self._downloader.to_screen(u'[facebook] %s' % message)
2691 def report_login(self):
2692 """Report attempt to log in."""
2693 self._reporter(u'Logging in')
2695 def report_video_webpage_download(self, video_id):
2696 """Report attempt to download video webpage."""
2697 self._reporter(u'%s: Downloading video webpage' % video_id)
2699 def report_information_extraction(self, video_id):
2700 """Report attempt to extract video information."""
2701 self._reporter(u'%s: Extracting video information' % video_id)
2703 def _parse_page(self, video_webpage):
2704 """Extract video information from page"""
# Field name -> scraping regex; any field whose regex fails is simply
# absent from the returned dict (callers must check membership).
2706 data = {'title': r'class="video_title datawrap">(.*?)</',
2707 'description': r'<div class="datawrap">(.*?)</div>',
2708 'owner': r'\("video_owner_name", "(.*?)"\)',
2709 'upload_date': r'data-date="(.*?)"',
2710 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2713 for piece in data.keys():
2714 mobj = re.search(data[piece], video_webpage)
2715 if mobj is not None:
2716 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2720 for fmt in self._available_formats:
2721 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2722 if mobj is not None:
2723 # URL is in a Javascript segment inside an escaped Unicode format within
2724 # the generally utf-8 page
2725 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2726 video_info['video_urls'] = video_urls
2730 def _real_initialize(self):
2731 if self._downloader is None:
2736 downloader_params = self._downloader.params
2738 # Attempt to use provided username and password or .netrc data
2739 if downloader_params.get('username', None) is not None:
2740 useremail = downloader_params['username']
2741 password = downloader_params['password']
2742 elif downloader_params.get('usenetrc', False):
2744 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2745 if info is not None:
2749 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2750 except (IOError, netrc.NetrcParseError), err:
2751 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: proceed without logging in.
2754 if useremail is None:
2763 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2766 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication did not succeed.
2767 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
# NOTE(review): 'exceded' is misspelled ('exceeded') in this user-facing warning.
2768 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2771 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2774 def _real_extract(self, url):
2775 mobj = re.match(self._VALID_URL, url)
2777 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2779 video_id = mobj.group('ID')
2782 self.report_video_webpage_download(video_id)
2783 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2785 page = urllib2.urlopen(request)
2786 video_webpage = page.read()
2787 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2788 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2791 # Start extracting information
2792 self.report_information_extraction(video_id)
2794 # Extract information
2795 video_info = self._parse_page(video_webpage)
2798 if 'owner' not in video_info:
2799 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2801 video_uploader = video_info['owner']
2804 if 'title' not in video_info:
2805 self._downloader.trouble(u'ERROR: unable to extract video title')
2807 video_title = video_info['title']
2808 video_title = video_title.decode('utf-8')
2809 video_title = sanitize_title(video_title)
2812 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2813 simple_title = simple_title.strip(ur'_')
# Missing thumbnail is only a warning; extraction continues.
2816 if 'thumbnail' not in video_info:
2817 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2818 video_thumbnail = ''
2820 video_thumbnail = video_info['thumbnail']
2824 if 'upload_date' in video_info:
2825 upload_time = video_info['upload_date']
2826 timetuple = email.utils.parsedate_tz(upload_time)
2827 if timetuple is not None:
2829 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2834 video_description = video_info.get('description', 'No description available.')
2836 url_map = video_info['video_urls']
2837 if len(url_map.keys()) > 0:
2838 # Decide which formats to download
2839 req_format = self._downloader.params.get('format', None)
2840 format_limit = self._downloader.params.get('format_limit', None)
# format_limit truncates the best-first list at the requested ceiling.
2842 if format_limit is not None and format_limit in self._available_formats:
2843 format_list = self._available_formats[self._available_formats.index(format_limit):]
2845 format_list = self._available_formats
2846 existing_formats = [x for x in format_list if x in url_map]
2847 if len(existing_formats) == 0:
2848 self._downloader.trouble(u'ERROR: no known formats available for video')
2850 if req_format is None:
2851 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2852 elif req_format == 'worst':
2853 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2854 elif req_format == '-1':
2855 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2858 if req_format not in url_map:
2859 self._downloader.trouble(u'ERROR: requested format not available')
2861 video_url_list = [(req_format, url_map[req_format])] # Specific format
2863 for format_param, video_real_url in video_url_list:
2865 # At this point we have a new video
2866 self._downloader.increment_downloads()
2869 video_extension = self._video_extensions.get(format_param, 'mp4')
2872 # Process video information
2873 self._downloader.process_info({
2874 'id': video_id.decode('utf-8'),
2875 'url': video_real_url.decode('utf-8'),
2876 'uploader': video_uploader.decode('utf-8'),
2877 'upload_date': upload_date,
2878 'title': video_title,
2879 'stitle': simple_title,
2880 'ext': video_extension.decode('utf-8'),
2881 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2882 'thumbnail': video_thumbnail.decode('utf-8'),
2883 'description': video_description.decode('utf-8'),
2886 except UnavailableVideoError, err:
2887 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for blip.tv: requests the page with skin=json to get structured
# metadata and builds the info dict straight from the JSON 'Post' object.
# Elided listing: original line numbers jump, so some guard/assignment
# statements between visible lines are not shown.
2889 class BlipTVIE(InfoExtractor):
2890 """Information extractor for blip.tv"""
2892 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Pulls the lowercase filename extension off the media URL.
2893 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2894 IE_NAME = u'blip.tv'
2896 def report_extraction(self, file_id):
2897 """Report information extraction."""
2898 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2900 def _simplify_title(self, title):
2901 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2902 res = res.strip(ur'_')
2905 def _real_extract(self, url):
2906 mobj = re.match(self._VALID_URL, url)
2908 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') joins the JSON-skin parameters onto the URL.
2915 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2916 request = urllib2.Request(json_url)
2917 self.report_extraction(mobj.group(1))
2919 json_code = urllib2.urlopen(request).read()
2920 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2921 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2924 json_data = json.loads(json_code)
2925 if 'Post' in json_data:
2926 data = json_data['Post']
# NOTE(review): '%H:%M%p' mixes 24-hour %H with AM/PM marker %p; %p is
# normally paired with %I -- confirm against blip.tv's datestamp format.
2930 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2931 video_url = data['media']['url']
2932 umobj = re.match(self._URL_EXT, video_url)
2934 raise ValueError('Can not determine filename extension')
2935 ext = umobj.group(1)
2937 self._downloader.increment_downloads()
2940 'id': data['item_id'],
2942 'uploader': data['display_name'],
2943 'upload_date': upload_date,
2944 'title': data['title'],
2945 'stitle': self._simplify_title(data['title']),
2947 'format': data['media']['mimeType'],
2948 'thumbnail': data['thumbnailUrl'],
2949 'description': data['description'],
2950 'player_url': data['embedUrl']
# Any malformed/missing JSON field surfaces here as ValueError/KeyError.
2952 except (ValueError,KeyError), err:
2953 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2957 self._downloader.process_info(info)
2958 except UnavailableVideoError, err:
2959 self._downloader.trouble(u'\nERROR: unable to download video')
2962 class MyVideoIE(InfoExtractor):
2963 """Information Extractor for myvideo.de."""
2965 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2966 IE_NAME = u'myvideo'
2968 def __init__(self, downloader=None):
2969 InfoExtractor.__init__(self, downloader)
2971 def report_download_webpage(self, video_id):
2972 """Report webpage download."""
2973 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2975 def report_extraction(self, video_id):
2976 """Report information extraction."""
2977 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2979 def _real_initialize(self):
2982 def _real_extract(self,url):
2983 mobj = re.match(self._VALID_URL, url)
2985 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2988 video_id = mobj.group(1)
2989 simple_title = mobj.group(2).decode('utf-8')
2990 # should actually not be necessary
2991 simple_title = sanitize_title(simple_title)
2992 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2995 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2997 self.report_download_webpage(video_id)
2998 webpage = urllib2.urlopen(request).read()
2999 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3000 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3003 self.report_extraction(video_id)
3004 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3007 self._downloader.trouble(u'ERROR: unable to extract media URL')
3009 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3011 mobj = re.search('<title>([^<]+)</title>', webpage)
3013 self._downloader.trouble(u'ERROR: unable to extract title')
3016 video_title = mobj.group(1)
3017 video_title = sanitize_title(video_title)
3021 self._downloader.process_info({
3025 'upload_date': u'NA',
3026 'title': video_title,
3027 'stitle': simple_title,
3032 except UnavailableVideoError:
3033 self._downloader.trouble(u'\nERROR: Unable to download video')
# NOTE(review): this is a gappy numbered listing — the original source line
# numbers (first token of each line) jump, so statements (try:, else:, the
# info-dict opener, etc.) are missing between some lines. Comments below
# describe only what the surviving lines show.
3035 class ComedyCentralIE(InfoExtractor):
3036 """Information extractor for The Daily Show and Colbert Report """
# _VALID_URL accepts either a ':shortname' alias (tds/thedailyshow/cr/colbert/
# colbertnation/colbertreport) or a full-episodes URL on either show's site.
3038 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3039 IE_NAME = u'comedycentral'
# --- progress-reporting helpers: each prints one status line via the downloader ---
3041 def report_extraction(self, episode_id):
3042 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3044 def report_config_download(self, episode_id):
3045 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3047 def report_index_download(self, episode_id):
3048 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3050 def report_player_url(self, episode_id):
3051 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Collapse every run of characters outside simple_title_chars into '_',
# then strip leading/trailing underscores (return falls in a listing gap).
3053 def _simplify_title(self, title):
3054 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3055 res = res.strip(ur'_')
3058 def _real_extract(self, url):
3059 mobj = re.match(self._VALID_URL, url)
3061 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# A shortname alias is rewritten to that show's full-episodes index URL,
# then re-matched so the named groups used below are populated.
3064 if mobj.group('shortname'):
3065 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3066 url = 'http://www.thedailyshow.com/full-episodes/'
3068 url = 'http://www.colbertnation.com/full-episodes/'
3069 mobj = re.match(self._VALID_URL, url)
3070 assert mobj is not None
# dlNewest: no explicit episode was given; rely on the index redirect below.
3072 dlNewest = not mobj.group('episode')
3074 epTitle = mobj.group('showname')
3076 epTitle = mobj.group('episode')
3078 req = urllib2.Request(url)
3079 self.report_extraction(epTitle)
3081 htmlHandle = urllib2.urlopen(req)
3082 html = htmlHandle.read()
3083 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3084 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Follow the redirect: geturl() yields the final, episode-specific URL.
3087 url = htmlHandle.geturl()
3088 mobj = re.match(self._VALID_URL, url)
3090 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3092 if mobj.group('episode') == '':
3093 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3095 epTitle = mobj.group('episode')
# The Flash <param name="movie"> tag carries both the player URL (group 1)
# and the mrss feed uri (group 2).
3097 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3098 if len(mMovieParams) == 0:
3099 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3102 playerUrl_raw = mMovieParams[0][0]
3103 self.report_player_url(epTitle)
3105 urlHandle = urllib2.urlopen(playerUrl_raw)
3106 playerUrl = urlHandle.geturl()
3107 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3108 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Fetch the episode's mrss index; one <item> per video part.
3111 uri = mMovieParams[0][1]
3112 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3113 self.report_index_download(epTitle)
3115 indexXml = urllib2.urlopen(indexUrl).read()
3116 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3117 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3120 idoc = xml.etree.ElementTree.fromstring(indexXml)
3121 itemEls = idoc.findall('.//item')
3122 for itemEl in itemEls:
# The guid is colon-separated: last piece is the media id; the piece
# before it (minus '.com') identifies the show.
3123 mediaId = itemEl.findall('./guid')[0].text
3124 shortMediaId = mediaId.split(':')[-1]
3125 showId = mediaId.split(':')[-2].replace('.com', '')
3126 officialTitle = itemEl.findall('./title')[0].text
3127 officialDate = itemEl.findall('./pubDate')[0].text
# Each part needs its own mediagen config document listing renditions.
3129 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3130 urllib.urlencode({'uri': mediaId}))
3131 configReq = urllib2.Request(configUrl)
3132 self.report_config_download(epTitle)
3134 configXml = urllib2.urlopen(configReq).read()
3135 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3136 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3139 cdoc = xml.etree.ElementTree.fromstring(configXml)
3141 for rendition in cdoc.findall('.//rendition'):
3142 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3146 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3149 # For now, just pick the highest bitrate
# NOTE(review): assumes turls is ordered by bitrate — the code that builds
# and sorts turls falls in a gap of this listing; confirm upstream.
3150 format,video_url = turls[-1]
3152 self._downloader.increment_downloads()
3154 effTitle = showId + '-' + epTitle
# Only a fragment of the info dict survives in this listing.
3159 'upload_date': officialDate,
3161 'stitle': self._simplify_title(effTitle),
3165 'description': officialTitle,
3166 'player_url': playerUrl
3170 self._downloader.process_info(info)
3171 except UnavailableVideoError, err:
3172 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# NOTE(review): gappy numbered listing — original line numbers jump, so some
# statements (try:, the info-dict opener, return) are missing from view.
3176 class EscapistIE(InfoExtractor):
3177 """Information extractor for The Escapist """
# URL shape: escapistmagazine.com/videos/view/<showname>/<episode>...
3179 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3180 IE_NAME = u'escapist'
# --- progress-reporting helpers ---
3182 def report_extraction(self, showName):
3183 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3185 def report_config_download(self, showName):
3186 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Collapse runs of characters outside simple_title_chars to '_', then trim.
3188 def _simplify_title(self, title):
3189 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3190 res = res.strip(ur'_')
3193 def _real_extract(self, url):
3194 htmlParser = HTMLParser.HTMLParser()
3196 mobj = re.match(self._VALID_URL, url)
3198 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3200 showName = mobj.group('showname')
3201 videoId = mobj.group('episode')
3203 self.report_extraction(showName)
3205 webPage = urllib2.urlopen(url).read()
3206 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3207 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape description, thumbnail and player URL from <meta> tags,
# HTML-unescaping each value.
3210 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3211 description = htmlParser.unescape(descMatch.group(1))
3212 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3213 imgUrl = htmlParser.unescape(imgMatch.group(1))
3214 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3215 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL embeds a percent-encoded 'config=' query with the real config URL.
3216 configUrlMatch = re.search('config=(.*)$', playerUrl)
3217 configUrl = urllib2.unquote(configUrlMatch.group(1))
3219 self.report_config_download(showName)
3221 configJSON = urllib2.urlopen(configUrl).read()
3222 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3223 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3226 # Technically, it's JavaScript, not JSON
# Crude quote normalization so json.loads can parse the JS object literal.
3227 configJSON = configJSON.replace("'", '"')
3230 config = json.loads(configJSON)
3231 except (ValueError,), err:
3232 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# NOTE(review): takes playlist entry [1] — presumably entry [0] is an
# ad/intro; confirm against a live config document.
3235 playlist = config['playlist']
3236 videoUrl = playlist[1]['url']
3238 self._downloader.increment_downloads()
# Only a fragment of the info dict survives in this listing.
3242 'uploader': showName,
3243 'upload_date': None,
3245 'stitle': self._simplify_title(showName),
3248 'thumbnail': imgUrl,
3249 'description': description,
3250 'player_url': playerUrl,
3254 self._downloader.process_info(info)
3255 except UnavailableVideoError, err:
3256 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# NOTE(review): gappy numbered listing — a few docstring/body lines are
# missing where the original line numbers jump.
3260 class PostProcessor(object):
3261 """Post Processor class.
3263 PostProcessor objects can be added to downloaders with their
3264 add_post_processor() method. When the downloader has finished a
3265 successful download, it will take its internal chain of PostProcessors
3266 and start calling the run() method on each one of them, first with
3267 an initial argument and then with the returned value of the previous
3270 The chain will be stopped if one of them ever returns None or the end
3271 of the chain is reached.
3273 PostProcessor objects follow a "mutual registration" process similar
3274 to InfoExtractor objects.
# _downloader: the FileDownloader this PP is registered with (may be None).
3279 def __init__(self, downloader=None):
3280 self._downloader = downloader
3282 def set_downloader(self, downloader):
3283 """Sets the downloader for this PP."""
3284 self._downloader = downloader
3286 def run(self, information):
3287 """Run the PostProcessor.
3289 The "information" argument is a dictionary like the ones
3290 composed by InfoExtractors. The only difference is that this
3291 one has an extra field called "filepath" that points to the
3294 When this method returns None, the postprocessing chain is
3295 stopped. However, this method may return an information
3296 dictionary that will be passed to the next postprocessing
3297 object in the chain. It can be the one it received after
3298 changing some fields.
3300 In addition, this method may raise a PostProcessingError
3301 exception that will be taken into account by the downloader
3304 return information # by default, do nothing
# NOTE(review): gappy numbered listing — return statements, try: lines and
# some branches fall in gaps between the surviving lines.
# Post-processor that converts a downloaded video file to an audio-only
# file using the external ffmpeg/ffprobe binaries.
3307 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best' (default), 'aac' or 'mp3'; preferredquality: ffmpeg
# '-ab' bitrate string; keepvideo: keep the source video after extraction.
3309 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3310 PostProcessor.__init__(self, downloader)
3311 if preferredcodec is None:
3312 preferredcodec = 'best'
3313 self._preferredcodec = preferredcodec
3314 self._preferredquality = preferredquality
3315 self._keepvideo = keepvideo
# Probe the file with ffprobe and return its audio codec name (the
# @staticmethod decorator line, if present, falls in a listing gap).
3318 def get_audio_codec(path):
3320 cmd = ['ffprobe', '-show_streams', '--', path]
3321 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3322 output = handle.communicate()[0]
3323 if handle.wait() != 0:
3325 except (IOError, OSError):
# Scan ffprobe's stream dump: remember the last codec_name seen, and
# accept it once a codec_type=audio line confirms it belongs to audio.
3328 for line in output.split('\n'):
3329 if line.startswith('codec_name='):
3330 audio_codec = line.split('=')[1].strip()
3331 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to transcode path -> out_path with the given codec/options.
3336 def run_ffmpeg(path, out_path, codec, more_opts):
3338 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3339 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3341 except (IOError, OSError):
3344 def run(self, information):
3345 path = information['filepath']
3347 filecodec = self.get_audio_codec(path)
3348 if filecodec is None:
3349 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# If the source already carries the wanted codec (or 'best' was asked),
# copy losslessly where possible; otherwise transcode.
3353 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3354 if filecodec == 'aac' or filecodec == 'mp3':
3355 # Lossless if possible
3357 extension = filecodec
3358 if filecodec == 'aac':
# Raw AAC needs the ADTS container format.
3359 more_opts = ['-f', 'adts']
3362 acodec = 'libmp3lame'
3365 if self._preferredquality is not None:
3366 more_opts += ['-ab', self._preferredquality]
3368 # We convert the audio (lossy)
3369 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3370 extension = self._preferredcodec
3372 if self._preferredquality is not None:
3373 more_opts += ['-ab', self._preferredquality]
3374 if self._preferredcodec == 'aac':
3375 more_opts += ['-f', 'adts']
# Output file: same basename, new audio extension.
3377 (prefix, ext) = os.path.splitext(path)
3378 new_path = prefix + '.' + extension
3379 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3380 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3383 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3386 # Try to update the date time for extracted audio file.
3387 if information.get('filetime') is not None:
3389 os.utime(new_path, (time.time(), information['filetime']))
3391 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Unless asked to keep it, remove the source video (os.remove call falls
# in a listing gap).
3393 if not self._keepvideo:
3396 except (IOError, OSError):
3397 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Hand the audio file's path to the rest of the chain.
3400 information['filepath'] = new_path
# NOTE(review): gappy numbered listing — try: lines and early returns fall in
# gaps between the surviving lines.
3404 def updateSelf(downloader, filename):
3405 ''' Update the program file with the latest version from the repository '''
3406 # Note: downloader only used for options
3407 if not os.access(filename, os.W_OK):
3408 sys.exit('ERROR: no write permissions on %s' % filename)
3410 downloader.to_screen('Updating to latest version...')
# Download the current script from UPDATE_URL and compare its embedded
# __version__ against ours; nothing to do if they match.
3414 urlh = urllib.urlopen(UPDATE_URL)
3415 newcontent = urlh.read()
3417 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3418 if vmatch is not None and vmatch.group(1) == __version__:
3419 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3423 except (IOError, OSError), err:
3424 sys.exit('ERROR: unable to download latest version')
# Overwrite our own program file in place with the downloaded content.
3427 outf = open(filename, 'wb')
3429 outf.write(newcontent)
3432 except (IOError, OSError), err:
3433 sys.exit('ERROR: unable to overwrite current version')
3435 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
# Helper nested inside parseOpts (the enclosing def falls in a listing gap).
# Renders an optparse Option as "-o, --option METAVAR" for help output.
# NOTE(review): the `opts = []` initializer falls in a gap (3444-3446).
3442 def _format_option_string(option):
3443 ''' ('-o', '--option') -> -o, --format METAVAR'''
3447 if option._short_opts: opts.append(option._short_opts[0])
3448 if option._long_opts: opts.append(option._long_opts[0])
3449 if len(opts) > 1: opts.insert(1, ', ')
3451 if option.takes_value(): opts.append(' %s' % option.metavar)
3453 return "".join(opts)
# Helper nested inside parseOpts (the enclosing def falls in a listing gap).
# Best-effort terminal width: $COLUMNS first, then `stty size` output
# (the try/except around the subprocess call falls in a listing gap).
3455 def _find_term_columns():
3456 columns = os.environ.get('COLUMNS', None)
3461 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3462 out,err = sp.communicate()
# `stty size` prints "rows cols"; take the second field.
3463 return int(out.split()[1])
# Body of parseOpts (its def line falls in a listing gap): build the optparse
# parser, declare all option groups/options, parse argv, and return
# (parser, opts, args).
3469 max_help_position = 80
3471 # No need to wrap help messages if we're on a wide console
3472 columns = _find_term_columns()
3473 if columns: max_width = columns
# Custom formatter so option strings render via _format_option_string.
3475 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3476 fmt.format_option_strings = _format_option_string
# Keyword args for OptionParser (dict opener falls in a listing gap).
3479 'version' : __version__,
3481 'usage' : '%prog [options] url [url...]',
3482 'conflict_handler' : 'resolve',
3485 parser = optparse.OptionParser(**kw)
# --- option groups ---
3488 general = optparse.OptionGroup(parser, 'General Options')
3489 selection = optparse.OptionGroup(parser, 'Video Selection')
3490 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3491 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3492 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3493 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3494 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# --- general options ---
3496 general.add_option('-h', '--help',
3497 action='help', help='print this help text and exit')
3498 general.add_option('-v', '--version',
3499 action='version', help='print program version and exit')
3500 general.add_option('-U', '--update',
3501 action='store_true', dest='update_self', help='update this program to latest version')
3502 general.add_option('-i', '--ignore-errors',
3503 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3504 general.add_option('-r', '--rate-limit',
3505 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3506 general.add_option('-R', '--retries',
3507 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3508 general.add_option('--dump-user-agent',
3509 action='store_true', dest='dump_user_agent',
3510 help='display the current browser identification', default=False)
3511 general.add_option('--list-extractors',
3512 action='store_true', dest='list_extractors',
3513 help='List all supported extractors and the URLs they would handle', default=False)
# --- video selection options ---
3515 selection.add_option('--playlist-start',
3516 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3517 selection.add_option('--playlist-end',
3518 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3519 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3520 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
# --- authentication options ---
3522 authentication.add_option('-u', '--username',
3523 dest='username', metavar='USERNAME', help='account username')
3524 authentication.add_option('-p', '--password',
3525 dest='password', metavar='PASSWORD', help='account password')
3526 authentication.add_option('-n', '--netrc',
3527 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# --- video format options ---
3530 video_format.add_option('-f', '--format',
3531 action='store', dest='format', metavar='FORMAT', help='video format code')
3532 video_format.add_option('--all-formats',
3533 action='store_const', dest='format', help='download all available video formats', const='all')
3534 video_format.add_option('--max-quality',
3535 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# --- verbosity / simulation options ---
3538 verbosity.add_option('-q', '--quiet',
3539 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3540 verbosity.add_option('-s', '--simulate',
3541 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3542 verbosity.add_option('--skip-download',
3543 action='store_true', dest='skip_download', help='do not download the video', default=False)
3544 verbosity.add_option('-g', '--get-url',
3545 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3546 verbosity.add_option('-e', '--get-title',
3547 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3548 verbosity.add_option('--get-thumbnail',
3549 action='store_true', dest='getthumbnail',
3550 help='simulate, quiet but print thumbnail URL', default=False)
3551 verbosity.add_option('--get-description',
3552 action='store_true', dest='getdescription',
3553 help='simulate, quiet but print video description', default=False)
3554 verbosity.add_option('--get-filename',
3555 action='store_true', dest='getfilename',
3556 help='simulate, quiet but print output filename', default=False)
3557 verbosity.add_option('--get-format',
3558 action='store_true', dest='getformat',
3559 help='simulate, quiet but print output format', default=False)
3560 verbosity.add_option('--no-progress',
3561 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3562 verbosity.add_option('--console-title',
3563 action='store_true', dest='consoletitle',
3564 help='display progress in console titlebar', default=False)
# --- filesystem options ---
3567 filesystem.add_option('-t', '--title',
3568 action='store_true', dest='usetitle', help='use title in file name', default=False)
3569 filesystem.add_option('-l', '--literal',
3570 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3571 filesystem.add_option('-A', '--auto-number',
3572 action='store_true', dest='autonumber',
3573 help='number downloaded files starting from 00000', default=False)
3574 filesystem.add_option('-o', '--output',
3575 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3576 filesystem.add_option('-a', '--batch-file',
3577 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3578 filesystem.add_option('-w', '--no-overwrites',
3579 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3580 filesystem.add_option('-c', '--continue',
3581 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3582 filesystem.add_option('--no-continue',
3583 action='store_false', dest='continue_dl',
3584 help='do not resume partially downloaded files (restart from beginning)')
3585 filesystem.add_option('--cookies',
3586 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3587 filesystem.add_option('--no-part',
3588 action='store_true', dest='nopart', help='do not use .part files', default=False)
3589 filesystem.add_option('--no-mtime',
3590 action='store_false', dest='updatetime',
3591 help='do not use the Last-modified header to set the file modification time', default=True)
3592 filesystem.add_option('--write-description',
3593 action='store_true', dest='writedescription',
3594 help='write video description to a .description file', default=False)
3595 filesystem.add_option('--write-info-json',
3596 action='store_true', dest='writeinfojson',
3597 help='write video metadata to a .info.json file', default=False)
# --- post-processing options ---
3600 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3601 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3602 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3603 help='"best", "aac" or "mp3"; best by default')
3604 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3605 help='ffmpeg audio bitrate specification, 128k by default')
3606 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3607 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Register groups in the order they appear in --help output.
3610 parser.add_option_group(general)
3611 parser.add_option_group(selection)
3612 parser.add_option_group(filesystem)
3613 parser.add_option_group(verbosity)
3614 parser.add_option_group(video_format)
3615 parser.add_option_group(authentication)
3616 parser.add_option_group(postproc)
3618 opts, args = parser.parse_args()
3620 return parser, opts, args
# NOTE(review): gappy numbered listing — the list opener and several extractor
# entries (plus the closing bracket/return) fall in gaps.
3622 def gen_extractors():
3623 """ Return a list of an instance of every supported extractor.
3624 The order does matter; the first extractor matched is the one handling the URL.
# Shared base extractors passed to the playlist/user/search wrappers below.
3626 youtube_ie = YoutubeIE()
3627 google_ie = GoogleIE()
3628 yahoo_ie = YahooIE()
3630 YoutubePlaylistIE(youtube_ie),
3631 YoutubeUserIE(youtube_ie),
3632 YoutubeSearchIE(youtube_ie),
3634 MetacafeIE(youtube_ie),
3637 GoogleSearchIE(google_ie),
3640 YahooSearchIE(yahoo_ie),
# Body of the program's main entry function (its def line falls in a listing
# gap): parse options, configure cookies/opener, validate options, build the
# FileDownloader, and run the downloads.
3653 parser, opts, args = parseOpts()
3655 # Open appropriate CookieJar
3656 if opts.cookiefile is None:
3657 jar = cookielib.CookieJar()
# With --cookies, use a Mozilla-format jar and load it if readable
# (the jar.load() call falls in a listing gap).
3660 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3661 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3663 except (IOError, OSError), err:
3664 sys.exit(u'ERROR: unable to open cookie file')
3667 if opts.dump_user_agent:
3668 print std_headers['User-Agent']
3671 # Batch file verification
3673 if opts.batchfile is not None:
3675 if opts.batchfile == '-':
3678 batchfd = open(opts.batchfile, 'r')
3679 batchurls = batchfd.readlines()
3680 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and comment lines starting with '#', '/' or ';'.
3681 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3683 sys.exit(u'ERROR: batch file could not be read')
3684 all_urls = batchurls + args
3686 # General configuration
3687 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3688 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3689 urllib2.install_opener(opener)
3690 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3692 extractors = gen_extractors()
# --list-extractors: print each extractor and which given URLs it would handle.
3694 if opts.list_extractors:
3695 for ie in extractors:
3697 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3698 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3699 for mu in matchedUrls:
3703 # Conflicting, missing and erroneous options
3704 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3705 parser.error(u'using .netrc conflicts with giving username/password')
3706 if opts.password is not None and opts.username is None:
3707 parser.error(u'account username missing')
3708 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3709 parser.error(u'using output template conflicts with using title, literal title or auto number')
3710 if opts.usetitle and opts.useliteral:
3711 parser.error(u'using title conflicts with using literal title')
3712 if opts.username is not None and opts.password is None:
3713 opts.password = getpass.getpass(u'Type account password and press return:')
3714 if opts.ratelimit is not None:
3715 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3716 if numeric_limit is None:
3717 parser.error(u'invalid rate limit specified')
3718 opts.ratelimit = numeric_limit
3719 if opts.retries is not None:
3721 opts.retries = long(opts.retries)
3722 except (TypeError, ValueError), err:
3723 parser.error(u'invalid retry count specified')
3725 opts.playliststart = int(opts.playliststart)
3726 if opts.playliststart <= 0:
3727 raise ValueError(u'Playlist start must be positive')
3728 except (TypeError, ValueError), err:
3729 parser.error(u'invalid playlist start number specified')
3731 opts.playlistend = int(opts.playlistend)
# -1 means "until the last video"; otherwise must be >= playliststart.
3732 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3733 raise ValueError(u'Playlist end must be greater than playlist start')
3734 except (TypeError, ValueError), err:
3735 parser.error(u'invalid playlist end number specified')
3736 if opts.extractaudio:
3737 if opts.audioformat not in ['best', 'aac', 'mp3']:
3738 parser.error(u'invalid audio format specified')
# Build the downloader with all resolved settings.
3741 fd = FileDownloader({
3742 'usenetrc': opts.usenetrc,
3743 'username': opts.username,
3744 'password': opts.password,
# Any of the --get-* flags implies quiet mode and skip_download below.
3745 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3746 'forceurl': opts.geturl,
3747 'forcetitle': opts.gettitle,
3748 'forcethumbnail': opts.getthumbnail,
3749 'forcedescription': opts.getdescription,
3750 'forcefilename': opts.getfilename,
3751 'forceformat': opts.getformat,
3752 'simulate': opts.simulate,
3753 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
3754 'format': opts.format,
3755 'format_limit': opts.format_limit,
# Output template: explicit -o wins; otherwise the first matching default
# in this or-chain, ending at plain '%(id)s.%(ext)s'.
3756 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3757 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3758 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3759 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3760 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3761 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3762 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3763 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3764 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3765 or u'%(id)s.%(ext)s'),
3766 'ignoreerrors': opts.ignoreerrors,
3767 'ratelimit': opts.ratelimit,
3768 'nooverwrites': opts.nooverwrites,
3769 'retries': opts.retries,
3770 'continuedl': opts.continue_dl,
3771 'noprogress': opts.noprogress,
3772 'playliststart': opts.playliststart,
3773 'playlistend': opts.playlistend,
# -o - streams video to stdout, so logging must go to stderr.
3774 'logtostderr': opts.outtmpl == '-',
3775 'consoletitle': opts.consoletitle,
3776 'nopart': opts.nopart,
3777 'updatetime': opts.updatetime,
3778 'writedescription': opts.writedescription,
3779 'writeinfojson': opts.writeinfojson,
3780 'matchtitle': opts.matchtitle,
3781 'rejecttitle': opts.rejecttitle,
3783 for extractor in extractors:
3784 fd.add_info_extractor(extractor)
# PostProcessors
3787 if opts.extractaudio:
3788 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# --update replaces this script file itself with the latest release.
3791 if opts.update_self:
3792 updateSelf(fd, sys.argv[0])
3795 if len(all_urls) < 1:
3796 if not opts.update_self:
3797 parser.error(u'you must provide at least one URL')
3800 retcode = fd.download(all_urls)
3802 # Dump cookie jar if requested
3803 if opts.cookiefile is not None:
3806 except (IOError, OSError), err:
3807 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run the main function (the call itself falls in a
# listing gap) and translate known exceptions into exit messages.
3812 if __name__ == '__main__':
3815 except DownloadError:
3817 except SameFileError:
3818 sys.exit(u'ERROR: fixed output name but more than one file to download')
3819 except KeyboardInterrupt:
3820 sys.exit(u'\nERROR: Interrupted by user')
3822 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: