2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.15'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5
70 pass # Not officially supported, but let it slip
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# NOTE(review): interior of the bundled "trivialjson" fallback parser (used when the
# stdlib json module is unavailable on Python <2.6). Extraction dropped many lines
# (try/except scaffolding, parseString/parseObj/parseArray headers) — do not edit blind.
# Helper: raise a ValueError carrying the offending position and remaining input.
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
# Decode a single JSON backslash escape matched by `rexp` below.
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
# Surrogate pair: \uD8xx\uDCxx → single astral code point.
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
# Count trailing backslashes to decide whether a quote is escaped.
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
# Literals true/false/null.
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
# Float iff a decimal point or exponent is present; otherwise int.
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
# Dispatch on the first character of the value; default is a number.
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
# NOTE(review): lines missing from this view (the try/except around the generator body).
195 def preferredencoding():
196 """Get preferred encoding.
198 Returns the best encoding scheme for the system, based on
199 locale.getpreferredencoding() and some further tweaks.
# Inner generator exists so that a failure in locale lookup can be handled
# (fallback path not visible here — presumably yields a safe default; confirm).
201 def yield_preferredencoding():
203 pref = locale.getpreferredencoding()
209 return yield_preferredencoding().next()
# NOTE(review): re.sub callback used by sanitize_title; some lines missing from this view.
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity, decimal (#160) or hexadecimal (#x20AC).
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
# Prefix '0' so long(numstr, 16) accepts the '0x...' form.
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
# NOTE(review): lines missing from this view (the '-' stdout special-case header and try:).
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
# Special case (context suggests filename == '-'): write to stdout, in binary mode on Windows.
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
# NOTE(review): the initialisation and return of `timestamp` are missing from this view;
# presumably returns None when the header cannot be parsed — confirm against full source.
272 def timeconvert(timestr):
273 """Convert RFC 2822 defined time string into system timestamp"""
275 timetuple = email.utils.parsedate_tz(timestr)
276 if timetuple is not None:
277 timestamp = email.utils.mktime_tz(timetuple)
# Raised by FileDownloader when not configured to ignore errors.
281 class DownloadError(Exception):
282 """Download Error exception.
284 This exception may be thrown by FileDownloader objects if they are not
285 configured to continue on errors. They will contain the appropriate
# Raised when a fixed output template would make several URLs share one file.
291 class SameFileError(Exception):
292 """Same File exception.
294 This exception will be thrown by FileDownloader objects if they detect
295 multiple files would have to be downloaded to the same file on disk.
# Raised by PostProcessor.run() implementations on failure.
300 class PostProcessingError(Exception):
301 """Post Processing exception.
303 This exception may be raised by PostProcessor's .run() method to
304 indicate an error in the postprocessing task.
# Raised when the requested format does not exist for a video.
309 class UnavailableVideoError(Exception):
310 """Unavailable Format exception.
312 This exception will be thrown when a video is requested
313 in a format that is not available for that video.
# Raised when fewer bytes arrive than the Content-Length announced.
318 class ContentTooShortError(Exception):
319 """Content Too Short exception.
321 This exception may be raised by FileDownloader objects when a file they
322 download is too small for what the server announced first, indicating
323 the connection was probably interrupted.
# downloaded: bytes actually received; expected: bytes announced by the server.
329 def __init__(self, downloaded, expected):
330 self.downloaded = downloaded
331 self.expected = expected
# NOTE(review): several lines missing from this view (deflate()'s try/except header,
# addinfourl_wrapper tail, http_request/http_response scaffolding). Do not edit blind.
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
# Raw-deflate first (negative wbits = no zlib header), plain zlib as fallback.
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
# Older Pythons' addinfourl has no `code` argument; wrap for compatibility.
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
# Add each default header unless the caller already set it (guard not visible here).
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
# Marker header disables compression for this one request and is stripped before sending.
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
# Transparently unwrap gzip- and deflate-encoded bodies.
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
# Class header, option documentation, class attributes and constructor.
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
# Class-level defaults, overwritten per-instance in __init__.
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
# Index trick: False -> stdout, True -> stderr.
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# Static helper: human-readable byte count, e.g. 1536 -> '1.50k'.
# NOTE(review): guard lines (None/zero handling) missing from this view.
468 def format_bytes(bytes):
471 if type(bytes) is str:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
# Static helper: progress percentage, right-aligned in 6 chars ('---.-%' case not visible).
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# Static helper: ETA string 'MM:SS' from elapsed time and bytes done/total.
488 def calc_eta(start, now, total, current):
# Avoid division by ~zero elapsed time or zero progress.
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
# Static helper: download speed string, right-aligned in 10 chars.
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Static helper: adapt the next read size to measured throughput,
# clamped between half and double the last block, never above 4 MB.
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
# NOTE(review): the no-match error path (lines 525-526) is missing from this view.
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
# Suffix index doubles as the power of 1024 ('' -> not possible here; 'b' -> 0).
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
# NOTE(review): the append onto the extractor list (line 533) is missing from this view.
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
# Mutual registration: the IE gets a back-reference to this downloader.
534 ie.set_downloader(self)
# NOTE(review): the append onto the post-processor chain (line 538) is missing from this view.
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
# NOTE(review): the try: header and the re-raise path are missing from this view.
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
# skip_eol selects '' over '\n' so progress lines can overwrite themselves with '\r'.
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
# Encoding failures are swallowed only when the caller opted in.
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Write `message`, encoded for the local terminal, to stderr."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
# NOTE(review): the early `return` for the disabled case (line 559) is missing from this view.
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
# Windows console: use the wide-char API directly.
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
# Otherwise use the xterm OSC 0 escape sequence on stderr.
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
571 def trouble(self, message=None):
572 """Determine action to take when a download problem appears.
574 Depending on if the downloader has been configured to ignore
575 download errors or not, this method may throw an exception or
576 not when errors are found, after printing the message.
578 if message is not None:
579 self.to_stderr(message)
# Either abort loudly or record a non-zero exit code and carry on.
580 if not self.params.get('ignoreerrors', False):
581 raise DownloadError(message)
582 self._download_retcode = 1
# NOTE(review): early returns (ratelimit off, zero elapsed) are missing from this view.
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
# Sleep exactly long enough to bring the average speed back to the limit.
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
# NOTE(review): the `return filename` of the pass-through branch (line 601) is missing.
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
# No .part file when disabled, when streaming to stdout ('-'), or when the
# path exists but is not a regular file (e.g. a FIFO).
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
# NOTE(review): the fall-through `return filename` (line 607) is missing from this view.
604 def undo_temp_name(self, filename):
# Strip a trailing '.part' to recover the final destination name.
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
# NOTE(review): the try: header and the equal-name early return body are missing.
609 def try_rename(self, old_filename, new_filename):
# No-op when the temp name and the final name coincide (nopart mode).
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
# NOTE(review): early returns and the parse-failure guard are missing from this view.
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
# atime := now, mtime := server-reported Last-Modified.
630 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce that the .description sidecar file is being written."""
    msg = u'[info] Writing video description to: %s' % descfn
    self.to_screen(msg, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """Announce that the .info.json metadata file has been written."""
    msg = u'[info] Video description metadata as JSON to: %s' % infofn
    self.to_screen(msg, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Announce the destination filename of the current download."""
    msg = u'[download] Destination: %s' % filename
    self.to_screen(msg, ignore_encoding_errors=True)
# NOTE(review): the early `return` for noprogress mode (line 649) is missing from this view.
646 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
647 """Report download progress."""
648 if self.params.get('noprogress', False):
# Leading '\r' + skip_eol=True keeps the progress line rewriting itself in place.
650 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
651 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
652 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
653 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce that the download resumes at the given byte offset."""
    msg = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(msg)
def report_retry(self, count, retries):
    """Announce a retry after a server-side (HTTP 5xx) error."""
    msg = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(msg)
# NOTE(review): the try: header (line 665) is missing from this view; the except
# falls back to a message without the filename when it cannot be encoded.
663 def report_file_already_downloaded(self, file_name):
664 """Report file has already been fully downloaded."""
666 self.to_screen(u'[download] %s has already been downloaded' % file_name)
667 except (UnicodeEncodeError), err:
668 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that the partial download could not be resumed."""
    self.to_screen(u'[download] Unable to resume')
# NOTE(review): the else branch (progress-bar completion message, lines 678-679)
# is missing from this view.
674 def report_finish(self):
675 """Report download finished."""
676 if self.params.get('noprogress', False):
677 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Advance the per-session counter that numbers downloaded files
    (feeds the %(autonumber)s template field)."""
    self._num_downloads = self._num_downloads + 1
# NOTE(review): the try: header, sanitation/return lines are missing from this view;
# on template errors this returns None via trouble() — confirm against full source.
685 def prepare_filename(self, info_dict):
686 """Generate the output filename."""
688 template_dict = dict(info_dict)
# Extra template fields: %(epoch)s and zero-padded %(autonumber)s.
689 template_dict['epoch'] = unicode(long(time.time()))
690 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
691 filename = self.params['outtmpl'] % template_dict
693 except (ValueError, KeyError), err:
694 self.trouble(u'ERROR: invalid system charset or erroneous output template')
# NOTE(review): heavily sampled — many control-flow lines (returns, try: headers,
# close() calls) are missing from this view. Sequence: simulate/force-print ->
# title match/reject filters -> overwrite guard -> mkdir -> optional .description
# and .info.json sidecars -> actual download -> post-processing.
697 def process_info(self, info_dict):
698 """Process a single dictionary returned by an InfoExtractor."""
699 filename = self.prepare_filename(info_dict)
700 # Do nothing else if in simulate mode
701 if self.params.get('simulate', False):
# Forced printing goes to real stdout so it can be piped.
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
# Title filters: --match-title / --reject-title, case-insensitive regex.
719 matchtitle=self.params.get('matchtitle',False)
720 rejecttitle=self.params.get('rejecttitle',False)
721 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
722 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
723 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
725 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
726 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
729 if self.params.get('nooverwrites', False) and os.path.exists(filename):
730 self.to_stderr(u'WARNING: file exists and will be skipped')
# Ensure the destination directory exists.
734 dn = os.path.dirname(filename)
735 if dn != '' and not os.path.exists(dn):
737 except (OSError, IOError), err:
738 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
# Optional .description sidecar, written as UTF-8 bytes.
741 if self.params.get('writedescription', False):
743 descfn = filename + '.description'
744 self.report_writedescription(descfn)
745 descfile = open(descfn, 'wb')
747 descfile.write(info_dict['description'].encode('utf-8'))
750 except (OSError, IOError):
751 self.trouble(u'ERROR: Cannot write description file ' + descfn)
# Optional .info.json sidecar; requires a json module (stdlib or trivialjson).
754 if self.params.get('writeinfojson', False):
755 infofn = filename + '.info.json'
756 self.report_writeinfojson(infofn)
759 except (NameError,AttributeError):
760 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
763 infof = open(infofn, 'wb')
765 json.dump(info_dict, infof)
768 except (OSError, IOError):
769 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
# The real download, with errors mapped to trouble()/UnavailableVideoError.
773 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
774 except (OSError, IOError), err:
775 raise UnavailableVideoError
776 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
777 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
779 except (ContentTooShortError, ), err:
780 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
785 self.post_process(filename, info_dict)
786 except (PostProcessingError), err:
787 self.trouble(u'ERROR: postprocessing: %s' % str(err))
# NOTE(review): loop headers over url_list / self._ies and the extract call
# are missing from this view.
790 def download(self, url_list):
791 """Download a given list of URLs."""
# A template without %(...)s fields cannot hold more than one file.
792 if len(url_list) > 1 and self.fixed_template():
793 raise SameFileError(self.params['outtmpl'])
796 suitable_found = False
798 # Go to next InfoExtractor if not suitable
799 if not ie.suitable(url):
802 # Suitable InfoExtractor found
803 suitable_found = True
805 # Extract information from URL and process it
808 # Suitable InfoExtractor had been found; go to next URL
811 if not suitable_found:
812 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
# 0 on success, 1 if any download failed with ignoreerrors set.
814 return self._download_retcode
# NOTE(review): the copy of ie_info into `info` and the loop over the processor
# chain are missing from this view.
816 def post_process(self, filename, ie_info):
817 """Run the postprocessing chain on the given file."""
819 info['filepath'] = filename
# NOTE(review): sampled — try: headers, returns and the retcode comparison tail
# are missing from this view. Shells out to the external rtmpdump binary.
825 def _download_with_rtmpdump(self, filename, url, player_url):
826 self.report_destination(filename)
827 tmpfilename = self.temp_name(filename)
829 # Check for rtmpdump first
# Probe by running 'rtmpdump -h'; OSError means the binary is absent.
831 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
832 except (OSError, IOError):
833 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
836 # Download using rtmpdump. rtmpdump returns exit code 2 when
837 # the connection was interrumpted and resuming appears to be
838 # possible. This is part of rtmpdump's normal usage, AFAIK.
839 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
840 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
# Keep resuming (-e) while rtmpdump reports a resumable interruption.
841 while retval == 2 or retval == 1:
842 prevsize = os.path.getsize(tmpfilename)
843 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
844 time.sleep(5.0) # This seems to be needed
845 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
846 cursize = os.path.getsize(tmpfilename)
# No forward progress: give up (retval 1) or accept near-complete stream (retval 2).
847 if prevsize == cursize and retval == 1:
849 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
850 if prevsize == cursize and retval == 2 and cursize > 1024:
851 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
855 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
856 self.try_rename(tmpfilename, filename)
859 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
# NOTE(review): heavily sampled — try: headers, returns, counter resets and the
# success-path tail are missing from this view. Overall flow: short-circuit if
# already downloaded -> rtmp delegation -> HTTP request with Range resume ->
# retry loop on 5xx/416 -> chunked read/write with adaptive block size,
# progress display and rate limiting -> rename .part -> set mtime.
862 def _do_download(self, filename, url, player_url):
863 # Check file already present
864 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
865 self.report_file_already_downloaded(filename)
868 # Attempt to download using rtmpdump
869 if url.startswith('rtmp'):
870 return self._download_with_rtmpdump(filename, url, player_url)
872 tmpfilename = self.temp_name(filename)
876 # Do not include the Accept-Encoding header
# Marker header consumed by YoutubeDLHandler: no gzip/deflate for video data.
877 headers = {'Youtubedl-no-compression': 'True'}
878 basic_request = urllib2.Request(url, None, headers)
879 request = urllib2.Request(url, None, headers)
881 # Establish possible resume length
882 if os.path.isfile(tmpfilename):
883 resume_len = os.path.getsize(tmpfilename)
887 # Request parameters in case of being able to resume
888 if self.params.get('continuedl', False) and resume_len != 0:
889 self.report_resuming_byte(resume_len)
890 request.add_header('Range', 'bytes=%d-' % resume_len)
894 retries = self.params.get('retries', 0)
895 while count <= retries:
896 # Establish connection
898 data = urllib2.urlopen(request)
900 except (urllib2.HTTPError, ), err:
# Only 5xx and 416 are retried/handled; anything else propagates.
901 if (err.code < 500 or err.code >= 600) and err.code != 416:
902 # Unexpected HTTP error
904 elif err.code == 416:
905 # Unable to resume (requested range not satisfiable)
907 # Open the connection again without the range header
908 data = urllib2.urlopen(basic_request)
909 content_length = data.info()['Content-Length']
910 except (urllib2.HTTPError, ), err:
911 if err.code < 500 or err.code >= 600:
914 # Examine the reported length
915 if (content_length is not None and
916 (resume_len - 100 < long(content_length) < resume_len + 100)):
917 # The file had already been fully downloaded.
918 # Explanation to the above condition: in issue #175 it was revealed that
919 # YouTube sometimes adds or removes a few bytes from the end of the file,
920 # changing the file size slightly and causing problems for some users. So
921 # I decided to implement a suggested change and consider the file
922 # completely downloaded if the file size differs less than 100 bytes from
923 # the one in the hard drive.
924 self.report_file_already_downloaded(filename)
925 self.try_rename(tmpfilename, filename)
928 # The length does not match, we start the download over
929 self.report_unable_to_resume()
935 self.report_retry(count, retries)
938 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Server reports the remaining bytes; add what we already have on disk.
941 data_len = data.info().get('Content-length', None)
942 if data_len is not None:
943 data_len = long(data_len) + resume_len
944 data_len_str = self.format_bytes(data_len)
945 byte_counter = 0 + resume_len
951 data_block = data.read(block_size)
953 if len(data_block) == 0:
955 byte_counter += len(data_block)
957 # Open file just in time
# Deferred open: the file is only created once the first block arrived,
# and sanitize_open may alter the name on forbidden characters.
960 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
961 assert stream is not None
962 filename = self.undo_temp_name(tmpfilename)
963 self.report_destination(filename)
964 except (OSError, IOError), err:
965 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
968 stream.write(data_block)
969 except (IOError, OSError), err:
970 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
# Adaptive block size based on the time the last read took.
972 block_size = self.best_block_size(after - before, len(data_block))
975 percent_str = self.calc_percent(byte_counter, data_len)
976 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
977 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
978 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
981 self.slow_down(start, byte_counter - resume_len)
984 self.trouble(u'\nERROR: Did not get any data blocks')
988 if data_len is not None and byte_counter != data_len:
989 raise ContentTooShortError(byte_counter, long(data_len))
990 self.try_rename(tmpfilename, filename)
992 # Update file modification time
993 if self.params.get('updatetime', True):
994 self.try_utime(filename, data.info().get('last-modified', None))
# Abstract base for site-specific extractors; header and docstring only
# (attribute lines missing from this view).
999 class InfoExtractor(object):
1000 """Information Extractor class.
1002 Information extractors are the classes that, given a URL, extract
1003 information from the video (or videos) the URL refers to. This
1004 information includes the real video URL, the video title and simplified
1005 title, author and others. The information is stored in a dictionary
1006 which is then passed to the FileDownloader. The FileDownloader
1007 processes this information possibly downloading the video to the file
1008 system, among other possible outcomes. The dictionaries must include
1009 the following fields:
1011 id: Video identifier.
1012 url: Final video URL.
1013 uploader: Nickname of the video uploader.
1014 title: Literal title.
1015 stitle: Simplified title.
1016 ext: Video filename extension.
1017 format: Video format.
1018 player_url: SWF Player URL (may be None).
1020 The following fields are optional. Their primary purpose is to allow
1021 youtube-dl to serve as the backend for a video search function, such
1022 as the one in youtube2mp3. They are only used when their respective
1023 forced printing functions are called:
1025 thumbnail: Full URL to a video thumbnail image.
1026 description: One-line video description.
1028 Subclasses of this one should re-define the _real_initialize() and
1029 _real_extract() methods, as well as the suitable() static method.
1030 Probably, they should also be instantiated and added to the main
# NOTE(review): fragments — the _ready flag init, suitable()'s def line and
# the lazy-initialize guard are missing from this view.
1037 def __init__(self, downloader=None):
1038 """Constructor. Receives an optional downloader."""
1040 self.set_downloader(downloader)
1044 """Receives a URL and returns True if suitable for this IE."""
# Lazy one-time initialization (e.g. login) before the first extraction.
1047 def initialize(self):
1048 """Initializes an instance (authentication, etc)."""
1050 self._real_initialize()
1053 def extract(self, url):
1054 """Extracts URL information and returns it in list of dicts."""
1056 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach the FileDownloader this extractor will report through."""
    self._downloader = downloader
# Template-method stubs — subclasses override; `pass` bodies missing from this view.
1062 def _real_initialize(self):
1063 """Real initialization process. Redefine in subclasses."""
1066 def _real_extract(self, url):
1067 """Real extraction process. Redefine in subclasses."""
# NOTE(review): class attributes sampled; most of _video_extensions and
# suitable()'s def line are missing from this view.
1071 class YoutubeIE(InfoExtractor):
1072 """Information extractor for youtube.com."""
# Accepts youtu.be, youtube(-nocookie).com watch/embed/v URLs; group 2 is the video id.
1074 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1075 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1076 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1077 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1078 _NETRC_MACHINE = 'youtube'
1079 # Listed in order of quality
1080 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1081 _video_extensions = {
1087 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1094 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
    """Announce the attempt to force the site language to English."""
    self._downloader.to_screen(u'[youtube] Setting language')
def report_login(self):
    """Announce the attempt to log in to YouTube."""
    self._downloader.to_screen(u'[youtube] Logging in')
def report_age_confirmation(self):
    """Announce the attempt to confirm the age-verification gate."""
    self._downloader.to_screen(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
    """Announce the download of the video's watch page."""
    msg = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(msg)
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the get_video_info page."""
    msg = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(msg)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for the video has started."""
    msg = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(msg)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested format is not offered for this video.

    (The original docstring claimed this reported the extracted URL,
    which does not match what the method prints.)
    """
    msg = u'[youtube] %s: Format %s not available' % (video_id, format)
    self._downloader.to_screen(msg)
1124 def report_rtmp_download(self):
1125 """Indicate the download will use the RTMP protocol."""
1126 self._downloader.to_screen(u'[youtube] RTMP download detected')
1128 def _real_initialize(self):
1129 if self._downloader is None:
1134 downloader_params = self._downloader.params
1136 # Attempt to use provided username and password or .netrc data
1137 if downloader_params.get('username', None) is not None:
1138 username = downloader_params['username']
1139 password = downloader_params['password']
1140 elif downloader_params.get('usenetrc', False):
1142 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1143 if info is not None:
1147 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1148 except (IOError, netrc.NetrcParseError), err:
1149 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1153 request = urllib2.Request(self._LANG_URL)
1156 urllib2.urlopen(request).read()
1157 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1158 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1161 # No authentication to be performed
1162 if username is None:
1167 'current_form': 'loginForm',
1169 'action_login': 'Log In',
1170 'username': username,
1171 'password': password,
1173 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1176 login_results = urllib2.urlopen(request).read()
1177 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1178 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1187 'action_confirm': 'Confirm',
1189 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1191 self.report_age_confirmation()
1192 age_results = urllib2.urlopen(request).read()
1193 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1194 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1197 def _real_extract(self, url):
1198 # Extract video id from URL
1199 mobj = re.match(self._VALID_URL, url)
1201 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1203 video_id = mobj.group(2)
1206 self.report_video_webpage_download(video_id)
1207 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1209 video_webpage = urllib2.urlopen(request).read()
1210 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1214 # Attempt to extract SWF player URL
1215 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1216 if mobj is not None:
1217 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1222 self.report_video_info_webpage_download(video_id)
1223 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1224 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1225 % (video_id, el_type))
1226 request = urllib2.Request(video_info_url)
1228 video_info_webpage = urllib2.urlopen(request).read()
1229 video_info = parse_qs(video_info_webpage)
1230 if 'token' in video_info:
1232 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1233 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1235 if 'token' not in video_info:
1236 if 'reason' in video_info:
1237 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1239 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1242 # Start extracting information
1243 self.report_information_extraction(video_id)
1246 if 'author' not in video_info:
1247 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1249 video_uploader = urllib.unquote_plus(video_info['author'][0])
1252 if 'title' not in video_info:
1253 self._downloader.trouble(u'ERROR: unable to extract video title')
1255 video_title = urllib.unquote_plus(video_info['title'][0])
1256 video_title = video_title.decode('utf-8')
1257 video_title = sanitize_title(video_title)
1260 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1261 simple_title = simple_title.strip(ur'_')
1264 if 'thumbnail_url' not in video_info:
1265 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1266 video_thumbnail = ''
1267 else: # don't panic if we can't find it
1268 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1272 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1273 if mobj is not None:
1274 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1275 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1276 for expression in format_expressions:
1278 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1286 video_description = u'No description available.'
1287 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1288 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1289 if mobj is not None:
1290 video_description = mobj.group(1).decode('utf-8')
1292 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1293 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1294 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1295 # TODO use another parser
1298 video_token = urllib.unquote_plus(video_info['token'][0])
1300 # Decide which formats to download
1301 req_format = self._downloader.params.get('format', None)
1303 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1304 self.report_rtmp_download()
1305 video_url_list = [(None, video_info['conn'][0])]
1306 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1307 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1308 url_data = [parse_qs(uds) for uds in url_data_strs]
1309 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1310 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1312 format_limit = self._downloader.params.get('format_limit', None)
1313 if format_limit is not None and format_limit in self._available_formats:
1314 format_list = self._available_formats[self._available_formats.index(format_limit):]
1316 format_list = self._available_formats
1317 existing_formats = [x for x in format_list if x in url_map]
1318 if len(existing_formats) == 0:
1319 self._downloader.trouble(u'ERROR: no known formats available for video')
1321 if req_format is None:
1322 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1323 elif req_format == '-1':
1324 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1327 if req_format not in url_map:
1328 self._downloader.trouble(u'ERROR: requested format not available')
1330 video_url_list = [(req_format, url_map[req_format])] # Specific format
1332 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1335 for format_param, video_real_url in video_url_list:
1336 # At this point we have a new video
1337 self._downloader.increment_downloads()
1340 video_extension = self._video_extensions.get(format_param, 'flv')
1343 # Process video information
1344 self._downloader.process_info({
1345 'id': video_id.decode('utf-8'),
1346 'url': video_real_url.decode('utf-8'),
1347 'uploader': video_uploader.decode('utf-8'),
1348 'upload_date': upload_date,
1349 'title': video_title,
1350 'stitle': simple_title,
1351 'ext': video_extension.decode('utf-8'),
1352 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1353 'thumbnail': video_thumbnail.decode('utf-8'),
1354 'description': video_description,
1355 'player_url': player_url,
1357 except UnavailableVideoError, err:
1358 self._downloader.trouble(u'\nERROR: unable to download video')
1361 class MetacafeIE(InfoExtractor):
1362 """Information Extractor for metacafe.com."""
# Group 1 is the video id (or a yt-<id> marker), group 2 the title slug.
1364 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1365 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1366 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# Keeps a YoutubeIE around because some Metacafe ids are YouTube mirrors
# (see the yt- check in _real_extract).
1369 def __init__(self, youtube_ie, downloader=None):
1370 InfoExtractor.__init__(self, downloader)
1371 self._youtube_ie = youtube_ie
# Class-level URL test used by the dispatcher to route URLs here.
1375 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1377 def report_disclaimer(self):
1378 """Report disclaimer retrieval."""
1379 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1381 def report_age_confirmation(self):
1382 """Report attempt to confirm age."""
1383 self._downloader.to_screen(u'[metacafe] Confirming age')
1385 def report_download_webpage(self, video_id):
1386 """Report webpage download."""
1387 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1389 def report_extraction(self, video_id):
1390 """Report information extraction."""
1391 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# One-time setup: fetch the family-filter disclaimer page, then POST the
# "over 18" form so filtered videos become visible (cookie side effect).
1393 def _real_initialize(self):
1394 # Retrieve disclaimer
1395 request = urllib2.Request(self._DISCLAIMER)
1397 self.report_disclaimer()
1398 disclaimer = urllib2.urlopen(request).read()
1399 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1400 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age by submitting the filter form.
1406 'submit': "Continue - I'm over 18",
1408 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1410 self.report_age_confirmation()
1411 disclaimer = urllib2.urlopen(request).read()
1412 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1413 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Extraction: delegate yt- ids to YouTube, otherwise scrape the watch page
# for the media URL (direct &mediaURL=, optionally signed with gdaKey, or
# embedded in the flashvars 'mediaData' blob), plus title and uploader.
1416 def _real_extract(self, url):
1417 # Extract id and simplified title from URL
1418 mobj = re.match(self._VALID_URL, url)
1420 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1423 video_id = mobj.group(1)
1425 # Check if video comes from YouTube
1426 mobj2 = re.match(r'^yt-(.*)$', video_id)
1427 if mobj2 is not None:
1428 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1431 # At this point we have a new video
1432 self._downloader.increment_downloads()
1434 simple_title = mobj.group(2).decode('utf-8')
1436 # Retrieve video webpage to extract further information
1437 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1439 self.report_download_webpage(video_id)
1440 webpage = urllib2.urlopen(request).read()
1441 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1442 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1445 # Extract URL, uploader and title from webpage
1446 self.report_extraction(video_id)
1447 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1448 if mobj is not None:
1449 mediaURL = urllib.unquote(mobj.group(1))
# Extension is taken from the URL's last three characters.
1450 video_extension = mediaURL[-3:]
1452 # Extract gdaKey if available
1453 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1455 video_url = mediaURL
1457 gdaKey = mobj.group(1)
# Signed URL: append the gda key as __gda__.
1458 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback: dig the media URL out of the flashvars value.
1460 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1462 self._downloader.trouble(u'ERROR: unable to extract media URL')
1464 vardict = parse_qs(mobj.group(1))
1465 if 'mediaData' not in vardict:
1466 self._downloader.trouble(u'ERROR: unable to extract media URL')
1468 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1470 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Unescape JSON-escaped slashes and sign with the accompanying key.
1472 mediaURL = mobj.group(1).replace('\\/', '/')
1473 video_extension = mediaURL[-3:]
1474 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
# Title comes from the page <title>, uploader from the "By:" link.
1476 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1478 self._downloader.trouble(u'ERROR: unable to extract title')
1480 video_title = mobj.group(1).decode('utf-8')
1481 video_title = sanitize_title(video_title)
1483 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1485 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1487 video_uploader = mobj.group(1)
1490 # Process video information
1491 self._downloader.process_info({
1492 'id': video_id.decode('utf-8'),
1493 'url': video_url.decode('utf-8'),
1494 'uploader': video_uploader.decode('utf-8'),
1495 'upload_date': u'NA',
1496 'title': video_title,
1497 'stitle': simple_title,
1498 'ext': video_extension.decode('utf-8'),
1502 except UnavailableVideoError:
1503 self._downloader.trouble(u'\nERROR: unable to download video')
1506 class DailymotionIE(InfoExtractor):
1507 """Information Extractor for Dailymotion"""
# Group 1 is the video id (before the first underscore), group 2 the slug.
1509 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1511 def __init__(self, downloader=None):
1512 InfoExtractor.__init__(self, downloader)
# Class-level URL test used by the dispatcher to route URLs here.
1516 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1518 def report_download_webpage(self, video_id):
1519 """Report webpage download."""
1520 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1522 def report_extraction(self, video_id):
1523 """Report information extraction."""
1524 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1526 def _real_initialize(self):
# Scrape the page's "sequence" flashvar for the SD stream URL, plus title
# and uploader; the family filter is disabled via a request cookie.
1529 def _real_extract(self, url):
1530 # Extract id and simplified title from URL
1531 mobj = re.match(self._VALID_URL, url)
1533 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1536 # At this point we have a new video
1537 self._downloader.increment_downloads()
1538 video_id = mobj.group(1)
1540 simple_title = mobj.group(2).decode('utf-8')
1541 video_extension = 'flv'
1543 # Retrieve video webpage to extract further information
1544 request = urllib2.Request(url)
# Disable the family filter so age-gated videos are served.
1545 request.add_header('Cookie', 'family_filter=off')
1547 self.report_download_webpage(video_id)
1548 webpage = urllib2.urlopen(request).read()
1549 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1550 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1553 # Extract URL, uploader and title from webpage
1554 self.report_extraction(video_id)
1555 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1557 self._downloader.trouble(u'ERROR: unable to extract media URL')
1559 sequence = urllib.unquote(mobj.group(1))
# The SD stream URL lives in the "sdURL" field of the sequence blob.
1560 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1562 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Strip the JSON escaping backslashes from the URL.
1564 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1566 # if needed add http://www.dailymotion.com/ if relative URL
1568 video_url = mediaURL
1570 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1572 self._downloader.trouble(u'ERROR: unable to extract title')
1574 video_title = mobj.group(1).decode('utf-8')
1575 video_title = sanitize_title(video_title)
1577 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1579 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1581 video_uploader = mobj.group(1)
1584 # Process video information
1585 self._downloader.process_info({
1586 'id': video_id.decode('utf-8'),
1587 'url': video_url.decode('utf-8'),
1588 'uploader': video_uploader.decode('utf-8'),
1589 'upload_date': u'NA',
1590 'title': video_title,
1591 'stitle': simple_title,
1592 'ext': video_extension.decode('utf-8'),
1596 except UnavailableVideoError:
1597 self._downloader.trouble(u'\nERROR: unable to download video')
1600 class GoogleIE(InfoExtractor):
1601 """Information extractor for video.google.com."""
# Group 1 is the docid query parameter.
1603 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1605 def __init__(self, downloader=None):
1606 InfoExtractor.__init__(self, downloader)
# Class-level URL test used by the dispatcher to route URLs here.
1610 return (re.match(GoogleIE._VALID_URL, url) is not None)
1612 def report_download_webpage(self, video_id):
1613 """Report webpage download."""
1614 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1616 def report_extraction(self, video_id):
1617 """Report information extraction."""
1618 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1620 def _real_initialize(self):
# Scrape the playback page for download_url (mp4) or the escaped videoUrl
# flashvar (flv), plus title/description; the thumbnail requires a separate
# search request and is only fetched when 'forcethumbnail' is set.
1623 def _real_extract(self, url):
1624 # Extract id from URL
1625 mobj = re.match(self._VALID_URL, url)
1627 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1630 # At this point we have a new video
1631 self._downloader.increment_downloads()
1632 video_id = mobj.group(1)
1634 video_extension = 'mp4'
1636 # Retrieve video webpage to extract further information
1637 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1639 self.report_download_webpage(video_id)
1640 webpage = urllib2.urlopen(request).read()
1641 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1642 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1645 # Extract URL, uploader, and title from webpage
1646 self.report_extraction(video_id)
1647 mobj = re.search(r"download_url:'([^']+)'", webpage)
# No direct mp4 download link: fall back to the flv stream URL.
1649 video_extension = 'flv'
1650 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1652 self._downloader.trouble(u'ERROR: unable to extract media URL')
1654 mediaURL = urllib.unquote(mobj.group(1))
# Undo the \xNN escaping used inside the page's JavaScript ('=' and '&').
1655 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1656 mediaURL = mediaURL.replace('\\x26', '\x26')
1658 video_url = mediaURL
1660 mobj = re.search(r'<title>(.*)</title>', webpage)
1662 self._downloader.trouble(u'ERROR: unable to extract title')
1664 video_title = mobj.group(1).decode('utf-8')
1665 video_title = sanitize_title(video_title)
# Collapse characters outside the simple set to underscores for filenames.
1666 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1668 # Extract video description
1669 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1671 self._downloader.trouble(u'ERROR: unable to extract video description')
1673 video_description = mobj.group(1).decode('utf-8')
1674 if not video_description:
1675 video_description = 'No description available.'
1677 # Extract video thumbnail
1678 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail only appears on a search-results page, so search for this
# docid on the video search endpoint.
1679 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1681 webpage = urllib2.urlopen(request).read()
1682 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1685 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1687 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1689 video_thumbnail = mobj.group(1)
1690 else: # we need something to pass to process_info
1691 video_thumbnail = ''
1694 # Process video information
1695 self._downloader.process_info({
1696 'id': video_id.decode('utf-8'),
1697 'url': video_url.decode('utf-8'),
1699 'upload_date': u'NA',
1700 'title': video_title,
1701 'stitle': simple_title,
1702 'ext': video_extension.decode('utf-8'),
1706 except UnavailableVideoError:
1707 self._downloader.trouble(u'\nERROR: unable to download video')
1710 class PhotobucketIE(InfoExtractor):
1711 """Information extractor for photobucket.com."""
# Group 1 is the .flv filename from the 'current' query parameter.
1713 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1715 def __init__(self, downloader=None):
1716 InfoExtractor.__init__(self, downloader)
# Class-level URL test used by the dispatcher to route URLs here.
1720 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1722 def report_download_webpage(self, video_id):
1723 """Report webpage download."""
1724 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1726 def report_extraction(self, video_id):
1727 """Report information extraction."""
1728 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1730 def _real_initialize(self):
# Scrape the page's video_src <link> for the media URL; title and uploader
# both come from the <title> tag.
1733 def _real_extract(self, url):
1734 # Extract id from URL
1735 mobj = re.match(self._VALID_URL, url)
1737 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1740 # At this point we have a new video
1741 self._downloader.increment_downloads()
1742 video_id = mobj.group(1)
1744 video_extension = 'flv'
1746 # Retrieve video webpage to extract further information
1747 request = urllib2.Request(url)
1749 self.report_download_webpage(video_id)
1750 webpage = urllib2.urlopen(request).read()
1751 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1752 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1755 # Extract URL, uploader, and title from webpage
1756 self.report_extraction(video_id)
1757 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1759 self._downloader.trouble(u'ERROR: unable to extract media URL')
1761 mediaURL = urllib.unquote(mobj.group(1))
1763 video_url = mediaURL
# The <title> carries both the video title (group 1) and uploader (group 2).
1765 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1767 self._downloader.trouble(u'ERROR: unable to extract title')
1769 video_title = mobj.group(1).decode('utf-8')
1770 video_title = sanitize_title(video_title)
# Collapse characters outside the simple set to underscores for filenames.
1771 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1773 video_uploader = mobj.group(2).decode('utf-8')
1776 # Process video information
1777 self._downloader.process_info({
1778 'id': video_id.decode('utf-8'),
1779 'url': video_url.decode('utf-8'),
1780 'uploader': video_uploader,
1781 'upload_date': u'NA',
1782 'title': video_title,
1783 'stitle': simple_title,
1784 'ext': video_extension.decode('utf-8'),
1788 except UnavailableVideoError:
1789 self._downloader.trouble(u'\nERROR: unable to download video')
1792 class YahooIE(InfoExtractor):
1793 """Information extractor for video.yahoo.com."""
1795 # _VALID_URL matches all Yahoo! Video URLs
1796 # _VPAGE_URL matches only the extractable '/watch/' URLs
1797 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1798 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1800 def __init__(self, downloader=None):
1801 InfoExtractor.__init__(self, downloader)
# Class-level URL test used by the dispatcher to route URLs here.
1805 return (re.match(YahooIE._VALID_URL, url) is not None)
1807 def report_download_webpage(self, video_id):
1808 """Report webpage download."""
1809 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1811 def report_extraction(self, video_id):
1812 """Report information extraction."""
1813 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1815 def _real_initialize(self):
# Two-step extraction: non-/watch/ URLs are first rewritten to the canonical
# /watch/ form (recursing once with new_video=False), then metadata is
# scraped from the page and the real media URL is fetched from the
# getPlaylistFOP playlist service.
1818 def _real_extract(self, url, new_video=True):
1819 # Extract ID from URL
1820 mobj = re.match(self._VALID_URL, url)
1822 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1825 # At this point we have a new video
1826 self._downloader.increment_downloads()
1827 video_id = mobj.group(2)
1828 video_extension = 'flv'
1830 # Rewrite valid but non-extractable URLs as
1831 # extractable English language /watch/ URLs
1832 if re.match(self._VPAGE_URL, url) is None:
1833 request = urllib2.Request(url)
1835 webpage = urllib2.urlopen(request).read()
1836 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1837 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1840 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1842 self._downloader.trouble(u'ERROR: Unable to extract id field')
1844 yahoo_id = mobj.group(1)
1846 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1848 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1850 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL built from vid/id.
1852 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1853 return self._real_extract(url, new_video=False)
1855 # Retrieve video webpage to extract further information
1856 request = urllib2.Request(url)
1858 self.report_download_webpage(video_id)
1859 webpage = urllib2.urlopen(request).read()
1860 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1861 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1864 # Extract uploader and title from webpage
1865 self.report_extraction(video_id)
1866 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1868 self._downloader.trouble(u'ERROR: unable to extract video title')
1870 video_title = mobj.group(1).decode('utf-8')
# Collapse characters outside the simple set to underscores for filenames.
1871 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1873 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1875 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the 'people'/'profile' path segment from the
# href, not the display name captured in group(2) — looks like a latent bug;
# confirm intent before relying on the 'uploader' field.
1877 video_uploader = mobj.group(1).decode('utf-8')
1879 # Extract video thumbnail
1880 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1882 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1884 video_thumbnail = mobj.group(1).decode('utf-8')
1886 # Extract video description
1887 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1889 self._downloader.trouble(u'ERROR: unable to extract video description')
1891 video_description = mobj.group(1).decode('utf-8')
1892 if not video_description:
1893 video_description = 'No description available.'
1895 # Extract video height and width
1896 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1898 self._downloader.trouble(u'ERROR: unable to extract video height')
1900 yv_video_height = mobj.group(1)
1902 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1904 self._downloader.trouble(u'ERROR: unable to extract video width')
1906 yv_video_width = mobj.group(1)
1908 # Retrieve video playlist to extract media URL
1909 # I'm not completely sure what all these options are, but we
1910 # seem to need most of them, otherwise the server sends a 401.
1911 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1912 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1913 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1914 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1915 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1917 self.report_download_webpage(video_id)
1918 webpage = urllib2.urlopen(request).read()
1919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1920 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1923 # Extract media URL from playlist XML
1924 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1926 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1928 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Resolve HTML entities (&amp; etc.) left in the playlist XML.
1929 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1932 # Process video information
1933 self._downloader.process_info({
1934 'id': video_id.decode('utf-8'),
1936 'uploader': video_uploader,
1937 'upload_date': u'NA',
1938 'title': video_title,
1939 'stitle': simple_title,
1940 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' appears twice in this literal; in a dict literal
# the later entry (the non-decoded value below) wins.
1941 'thumbnail': video_thumbnail.decode('utf-8'),
1942 'description': video_description,
1943 'thumbnail': video_thumbnail,
1946 except UnavailableVideoError:
1947 self._downloader.trouble(u'\nERROR: unable to download video')
1950 class VimeoIE(InfoExtractor):
1951 """Information extractor for vimeo.com."""
1953 # _VALID_URL matches Vimeo URLs
# Group 1 is the numeric clip id.
1954 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1956 def __init__(self, downloader=None):
1957 InfoExtractor.__init__(self, downloader)
# Class-level URL test used by the dispatcher to route URLs here.
1961 return (re.match(VimeoIE._VALID_URL, url) is not None)
1963 def report_download_webpage(self, video_id):
1964 """Report webpage download."""
1965 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1967 def report_extraction(self, video_id):
1968 """Report information extraction."""
1969 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1971 def _real_initialize(self):
# Extraction goes through Vimeo's "moogaloop" config endpoint rather than
# the HTML page: it exposes caption, uploader, thumbnail and the request
# signature needed to build the play URL.
1974 def _real_extract(self, url, new_video=True):
1975 # Extract ID from URL
1976 mobj = re.match(self._VALID_URL, url)
1978 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1981 # At this point we have a new video
1982 self._downloader.increment_downloads()
1983 video_id = mobj.group(1)
1985 # Retrieve video webpage to extract further information
1986 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1988 self.report_download_webpage(video_id)
1989 webpage = urllib2.urlopen(request).read()
1990 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1991 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1994 # Now we begin extracting as much information as we can from what we
1995 # retrieved. First we extract the information common to all extractors,
1996 # and latter we extract those that are Vimeo specific.
1997 self.report_extraction(video_id)
# Title comes from the <caption> element of the config response.
2000 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2002 self._downloader.trouble(u'ERROR: unable to extract video title')
2004 video_title = mobj.group(1).decode('utf-8')
# Collapse characters outside the simple set to underscores for filenames.
2005 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# Uploader is the path component of the uploader_url element.
2008 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2010 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2012 video_uploader = mobj.group(1).decode('utf-8')
2014 # Extract video thumbnail
2015 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2017 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2019 video_thumbnail = mobj.group(1).decode('utf-8')
2021 # # Extract video description
2022 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2024 # self._downloader.trouble(u'ERROR: unable to extract video description')
2026 # video_description = mobj.group(1).decode('utf-8')
2027 # if not video_description: video_description = 'No description available.'
# Placeholder while the description extraction above stays disabled.
2028 video_description = 'Foo.'
2030 # Vimeo specific: extract request signature
2031 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2033 self._downloader.trouble(u'ERROR: unable to extract request signature')
2035 sig = mobj.group(1).decode('utf-8')
2037 # Vimeo specific: Extract request signature expiration
2038 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2040 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2042 sig_exp = mobj.group(1).decode('utf-8')
# Play URL is built from clip id + signature + expiry, as moogaloop expects.
2044 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2047 # Process video information
2048 self._downloader.process_info({
2049 'id': video_id.decode('utf-8'),
2051 'uploader': video_uploader,
2052 'upload_date': u'NA',
2053 'title': video_title,
2054 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' each appear twice in this
# literal; in a dict literal the later entries win.
2056 'thumbnail': video_thumbnail.decode('utf-8'),
2057 'description': video_description,
2058 'thumbnail': video_thumbnail,
2059 'description': video_description,
2062 except UnavailableVideoError:
2063 self._downloader.trouble(u'ERROR: unable to download video')
2066 class GenericIE(InfoExtractor):
2067 """Generic last-resort information extractor."""
2069 def __init__(self, downloader=None):
2070 InfoExtractor.__init__(self, downloader)
2076 def report_download_webpage(self, video_id):
2077 """Report webpage download."""
# Warn loudly: this extractor only runs when no site-specific one matched.
2078 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2079 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2081 def report_extraction(self, video_id):
2082 """Report information extraction."""
2083 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2085 def _real_initialize(self):
2088 def _real_extract(self, url):
2089 # At this point we have a new video
2090 self._downloader.increment_downloads()
2092 video_id = url.split('/')[-1]
2093 request = urllib2.Request(url)
2095 self.report_download_webpage(video_id)
2096 webpage = urllib2.urlopen(request).read()
2097 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2098 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2100 except ValueError, err:
2101 # since this is the last-resort InfoExtractor, if
2102 # this error is thrown, it'll be thrown here
2103 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2106 self.report_extraction(video_id)
2107 # Start with something easy: JW Player in SWFObject
2108 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2110 # Broaden the search a little bit
2111 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2113 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2116 # It's possible that one of the regexes
2117 # matched, but returned an empty group:
2118 if mobj.group(1) is None:
2119 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2122 video_url = urllib.unquote(mobj.group(1))
2123 video_id = os.path.basename(video_url)
2125 # here's a fun little line of code for you:
2126 video_extension = os.path.splitext(video_id)[1][1:]
2127 video_id = os.path.splitext(video_id)[0]
2129 # it's tempting to parse this further, but you would
2130 # have to take into account all the variations like
2131 # Video Title - Site Name
2132 # Site Name | Video Title
2133 # Video Title - Tagline | Site Name
2134 # and so on and so forth; it's just not practical
2135 mobj = re.search(r'<title>(.*)</title>', webpage)
2137 self._downloader.trouble(u'ERROR: unable to extract title')
2139 video_title = mobj.group(1).decode('utf-8')
2140 video_title = sanitize_title(video_title)
2141 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2143 # video uploader is domain name
2144 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2146 self._downloader.trouble(u'ERROR: unable to extract title')
2148 video_uploader = mobj.group(1).decode('utf-8')
2151 # Process video information
2152 self._downloader.process_info({
2153 'id': video_id.decode('utf-8'),
2154 'url': video_url.decode('utf-8'),
2155 'uploader': video_uploader,
2156 'upload_date': u'NA',
2157 'title': video_title,
2158 'stitle': simple_title,
2159 'ext': video_extension.decode('utf-8'),
2163 except UnavailableVideoError, err:
2164 self._downloader.trouble(u'\nERROR: unable to download video')
2167 class YoutubeSearchIE(InfoExtractor):
2168 """Information Extractor for YouTube search queries."""
2169 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2170 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2171 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2172 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2174 _max_youtube_results = 1000
2176 def __init__(self, youtube_ie, downloader=None):
2177 InfoExtractor.__init__(self, downloader)
2178 self._youtube_ie = youtube_ie
2182 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2184 def report_download_page(self, query, pagenum):
2185 """Report attempt to download playlist page with given number."""
2186 query = query.decode(preferredencoding())
2187 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2189 def _real_initialize(self):
2190 self._youtube_ie.initialize()
2192 def _real_extract(self, query):
2193 mobj = re.match(self._VALID_QUERY, query)
2195 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2198 prefix, query = query.split(':')
2200 query = query.encode('utf-8')
2202 self._download_n_results(query, 1)
2204 elif prefix == 'all':
2205 self._download_n_results(query, self._max_youtube_results)
2211 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2213 elif n > self._max_youtube_results:
2214 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2215 n = self._max_youtube_results
2216 self._download_n_results(query, n)
2218 except ValueError: # parsing prefix as integer fails
2219 self._download_n_results(query, 1)
2222 def _download_n_results(self, query, n):
2223 """Downloads a specified number of results for a query"""
2226 already_seen = set()
2230 self.report_download_page(query, pagenum)
2231 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2232 request = urllib2.Request(result_url)
2234 page = urllib2.urlopen(request).read()
2235 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2236 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2239 # Extract video identifiers
2240 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2241 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2242 if video_id not in already_seen:
2243 video_ids.append(video_id)
2244 already_seen.add(video_id)
2245 if len(video_ids) == n:
2246 # Specified n videos reached
2247 for id in video_ids:
2248 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2251 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2252 for id in video_ids:
2253 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2256 pagenum = pagenum + 1
2259 class GoogleSearchIE(InfoExtractor):
2260 """Information Extractor for Google Video search queries."""
2261 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2262 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2263 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2264 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2266 _max_google_results = 1000
2268 def __init__(self, google_ie, downloader=None):
2269 InfoExtractor.__init__(self, downloader)
2270 self._google_ie = google_ie
2274 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2276 def report_download_page(self, query, pagenum):
2277 """Report attempt to download playlist page with given number."""
2278 query = query.decode(preferredencoding())
2279 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2281 def _real_initialize(self):
2282 self._google_ie.initialize()
2284 def _real_extract(self, query):
2285 mobj = re.match(self._VALID_QUERY, query)
2287 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2290 prefix, query = query.split(':')
2292 query = query.encode('utf-8')
2294 self._download_n_results(query, 1)
2296 elif prefix == 'all':
2297 self._download_n_results(query, self._max_google_results)
2303 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2305 elif n > self._max_google_results:
2306 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2307 n = self._max_google_results
2308 self._download_n_results(query, n)
2310 except ValueError: # parsing prefix as integer fails
2311 self._download_n_results(query, 1)
2314 def _download_n_results(self, query, n):
2315 """Downloads a specified number of results for a query"""
2318 already_seen = set()
2322 self.report_download_page(query, pagenum)
2323 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2324 request = urllib2.Request(result_url)
2326 page = urllib2.urlopen(request).read()
2327 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2328 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2331 # Extract video identifiers
2332 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2333 video_id = mobj.group(1)
2334 if video_id not in already_seen:
2335 video_ids.append(video_id)
2336 already_seen.add(video_id)
2337 if len(video_ids) == n:
2338 # Specified n videos reached
2339 for id in video_ids:
2340 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2343 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2344 for id in video_ids:
2345 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2348 pagenum = pagenum + 1
2351 class YahooSearchIE(InfoExtractor):
2352 """Information Extractor for Yahoo! Video search queries."""
2353 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2354 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2355 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2356 _MORE_PAGES_INDICATOR = r'\s*Next'
2358 _max_yahoo_results = 1000
2360 def __init__(self, yahoo_ie, downloader=None):
2361 InfoExtractor.__init__(self, downloader)
2362 self._yahoo_ie = yahoo_ie
2366 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2368 def report_download_page(self, query, pagenum):
2369 """Report attempt to download playlist page with given number."""
2370 query = query.decode(preferredencoding())
2371 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2373 def _real_initialize(self):
2374 self._yahoo_ie.initialize()
2376 def _real_extract(self, query):
2377 mobj = re.match(self._VALID_QUERY, query)
2379 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2382 prefix, query = query.split(':')
2384 query = query.encode('utf-8')
2386 self._download_n_results(query, 1)
2388 elif prefix == 'all':
2389 self._download_n_results(query, self._max_yahoo_results)
2395 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2397 elif n > self._max_yahoo_results:
2398 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2399 n = self._max_yahoo_results
2400 self._download_n_results(query, n)
2402 except ValueError: # parsing prefix as integer fails
2403 self._download_n_results(query, 1)
2406 def _download_n_results(self, query, n):
2407 """Downloads a specified number of results for a query"""
2410 already_seen = set()
2414 self.report_download_page(query, pagenum)
2415 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2416 request = urllib2.Request(result_url)
2418 page = urllib2.urlopen(request).read()
2419 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2420 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2423 # Extract video identifiers
2424 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2425 video_id = mobj.group(1)
2426 if video_id not in already_seen:
2427 video_ids.append(video_id)
2428 already_seen.add(video_id)
2429 if len(video_ids) == n:
2430 # Specified n videos reached
2431 for id in video_ids:
2432 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2435 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2436 for id in video_ids:
2437 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2440 pagenum = pagenum + 1
2443 class YoutubePlaylistIE(InfoExtractor):
2444 """Information Extractor for YouTube playlists."""
2446 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2447 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2448 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2449 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2452 def __init__(self, youtube_ie, downloader=None):
2453 InfoExtractor.__init__(self, downloader)
2454 self._youtube_ie = youtube_ie
2458 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2460 def report_download_page(self, playlist_id, pagenum):
2461 """Report attempt to download playlist page with given number."""
2462 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2464 def _real_initialize(self):
2465 self._youtube_ie.initialize()
2467 def _real_extract(self, url):
2468 # Extract playlist id
2469 mobj = re.match(self._VALID_URL, url)
2471 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2475 if mobj.group(3) is not None:
2476 self._youtube_ie.extract(mobj.group(3))
2479 # Download playlist pages
2480 # prefix is 'p' as default for playlists but there are other types that need extra care
2481 playlist_prefix = mobj.group(1)
2482 if playlist_prefix == 'a':
2483 playlist_access = 'artist'
2485 playlist_prefix = 'p'
2486 playlist_access = 'view_play_list'
2487 playlist_id = mobj.group(2)
2492 self.report_download_page(playlist_id, pagenum)
2493 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2495 page = urllib2.urlopen(request).read()
2496 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2497 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2500 # Extract video identifiers
2502 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2503 if mobj.group(1) not in ids_in_page:
2504 ids_in_page.append(mobj.group(1))
2505 video_ids.extend(ids_in_page)
2507 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2509 pagenum = pagenum + 1
2511 playliststart = self._downloader.params.get('playliststart', 1) - 1
2512 playlistend = self._downloader.params.get('playlistend', -1)
2513 video_ids = video_ids[playliststart:playlistend]
2515 for id in video_ids:
2516 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2520 class YoutubeUserIE(InfoExtractor):
2521 """Information Extractor for YouTube users."""
2523 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2524 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2525 _GDATA_PAGE_SIZE = 50
2526 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2527 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2530 def __init__(self, youtube_ie, downloader=None):
2531 InfoExtractor.__init__(self, downloader)
2532 self._youtube_ie = youtube_ie
2536 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2538 def report_download_page(self, username, start_index):
2539 """Report attempt to download user page."""
2540 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2541 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2543 def _real_initialize(self):
2544 self._youtube_ie.initialize()
2546 def _real_extract(self, url):
2548 mobj = re.match(self._VALID_URL, url)
2550 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2553 username = mobj.group(1)
2555 # Download video ids using YouTube Data API. Result size per
2556 # query is limited (currently to 50 videos) so we need to query
2557 # page by page until there are no video ids - it means we got
2564 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2565 self.report_download_page(username, start_index)
2567 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2570 page = urllib2.urlopen(request).read()
2571 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2572 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2575 # Extract video identifiers
2578 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2579 if mobj.group(1) not in ids_in_page:
2580 ids_in_page.append(mobj.group(1))
2582 video_ids.extend(ids_in_page)
2584 # A little optimization - if current page is not
2585 # "full", ie. does not contain PAGE_SIZE video ids then
2586 # we can assume that this page is the last one - there
2587 # are no more ids on further pages - no need to query
2590 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2595 all_ids_count = len(video_ids)
2596 playliststart = self._downloader.params.get('playliststart', 1) - 1
2597 playlistend = self._downloader.params.get('playlistend', -1)
2599 if playlistend == -1:
2600 video_ids = video_ids[playliststart:]
2602 video_ids = video_ids[playliststart:playlistend]
2604 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2605 (username, all_ids_count, len(video_ids)))
2607 for video_id in video_ids:
2608 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2611 class DepositFilesIE(InfoExtractor):
2612 """Information extractor for depositfiles.com"""
2614 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2616 def __init__(self, downloader=None):
2617 InfoExtractor.__init__(self, downloader)
2621 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2623 def report_download_webpage(self, file_id):
2624 """Report webpage download."""
2625 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2627 def report_extraction(self, file_id):
2628 """Report information extraction."""
2629 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2631 def _real_initialize(self):
2634 def _real_extract(self, url):
2635 # At this point we have a new file
2636 self._downloader.increment_downloads()
2638 file_id = url.split('/')[-1]
2639 # Rebuild url in english locale
2640 url = 'http://depositfiles.com/en/files/' + file_id
2642 # Retrieve file webpage with 'Free download' button pressed
2643 free_download_indication = { 'gateway_result' : '1' }
2644 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2646 self.report_download_webpage(file_id)
2647 webpage = urllib2.urlopen(request).read()
2648 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2649 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2652 # Search for the real file URL
2653 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2654 if (mobj is None) or (mobj.group(1) is None):
2655 # Try to figure out reason of the error.
2656 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2657 if (mobj is not None) and (mobj.group(1) is not None):
2658 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2659 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2661 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2664 file_url = mobj.group(1)
2665 file_extension = os.path.splitext(file_url)[1][1:]
2667 # Search for file title
2668 mobj = re.search(r'<b title="(.*?)">', webpage)
2670 self._downloader.trouble(u'ERROR: unable to extract title')
2672 file_title = mobj.group(1).decode('utf-8')
2675 # Process file information
2676 self._downloader.process_info({
2677 'id': file_id.decode('utf-8'),
2678 'url': file_url.decode('utf-8'),
2680 'upload_date': u'NA',
2681 'title': file_title,
2682 'stitle': file_title,
2683 'ext': file_extension.decode('utf-8'),
2687 except UnavailableVideoError, err:
2688 self._downloader.trouble(u'ERROR: unable to download file')
2691 class FacebookIE(InfoExtractor):
2692 """Information Extractor for Facebook"""
2694 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2695 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2696 _NETRC_MACHINE = 'facebook'
2697 _available_formats = ['highqual', 'lowqual']
2698 _video_extensions = {
2703 def __init__(self, downloader=None):
2704 InfoExtractor.__init__(self, downloader)
2708 return (re.match(FacebookIE._VALID_URL, url) is not None)
2710 def _reporter(self, message):
2711 """Add header and report message."""
2712 self._downloader.to_screen(u'[facebook] %s' % message)
2714 def report_login(self):
2715 """Report attempt to log in."""
2716 self._reporter(u'Logging in')
2718 def report_video_webpage_download(self, video_id):
2719 """Report attempt to download video webpage."""
2720 self._reporter(u'%s: Downloading video webpage' % video_id)
2722 def report_information_extraction(self, video_id):
2723 """Report attempt to extract video information."""
2724 self._reporter(u'%s: Extracting video information' % video_id)
2726 def _parse_page(self, video_webpage):
2727 """Extract video information from page"""
2729 data = {'title': r'class="video_title datawrap">(.*?)</',
2730 'description': r'<div class="datawrap">(.*?)</div>',
2731 'owner': r'\("video_owner_name", "(.*?)"\)',
2732 'upload_date': r'data-date="(.*?)"',
2733 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2736 for piece in data.keys():
2737 mobj = re.search(data[piece], video_webpage)
2738 if mobj is not None:
2739 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2743 for fmt in self._available_formats:
2744 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2745 if mobj is not None:
2746 # URL is in a Javascript segment inside an escaped Unicode format within
2747 # the generally utf-8 page
2748 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2749 video_info['video_urls'] = video_urls
2753 def _real_initialize(self):
2754 if self._downloader is None:
2759 downloader_params = self._downloader.params
2761 # Attempt to use provided username and password or .netrc data
2762 if downloader_params.get('username', None) is not None:
2763 useremail = downloader_params['username']
2764 password = downloader_params['password']
2765 elif downloader_params.get('usenetrc', False):
2767 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2768 if info is not None:
2772 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2773 except (IOError, netrc.NetrcParseError), err:
2774 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2777 if useremail is None:
2786 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2789 login_results = urllib2.urlopen(request).read()
2790 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2791 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2793 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2794 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2797 def _real_extract(self, url):
2798 mobj = re.match(self._VALID_URL, url)
2800 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2802 video_id = mobj.group('ID')
2805 self.report_video_webpage_download(video_id)
2806 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2808 page = urllib2.urlopen(request)
2809 video_webpage = page.read()
2810 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2811 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2814 # Start extracting information
2815 self.report_information_extraction(video_id)
2817 # Extract information
2818 video_info = self._parse_page(video_webpage)
2821 if 'owner' not in video_info:
2822 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2824 video_uploader = video_info['owner']
2827 if 'title' not in video_info:
2828 self._downloader.trouble(u'ERROR: unable to extract video title')
2830 video_title = video_info['title']
2831 video_title = video_title.decode('utf-8')
2832 video_title = sanitize_title(video_title)
2835 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2836 simple_title = simple_title.strip(ur'_')
2839 if 'thumbnail' not in video_info:
2840 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2841 video_thumbnail = ''
2843 video_thumbnail = video_info['thumbnail']
2847 if 'upload_date' in video_info:
2848 upload_time = video_info['upload_date']
2849 timetuple = email.utils.parsedate_tz(upload_time)
2850 if timetuple is not None:
2852 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2857 video_description = video_info.get('description', 'No description available.')
2859 url_map = video_info['video_urls']
2860 if len(url_map.keys()) > 0:
2861 # Decide which formats to download
2862 req_format = self._downloader.params.get('format', None)
2863 format_limit = self._downloader.params.get('format_limit', None)
2865 if format_limit is not None and format_limit in self._available_formats:
2866 format_list = self._available_formats[self._available_formats.index(format_limit):]
2868 format_list = self._available_formats
2869 existing_formats = [x for x in format_list if x in url_map]
2870 if len(existing_formats) == 0:
2871 self._downloader.trouble(u'ERROR: no known formats available for video')
2873 if req_format is None:
2874 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2875 elif req_format == '-1':
2876 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2879 if req_format not in url_map:
2880 self._downloader.trouble(u'ERROR: requested format not available')
2882 video_url_list = [(req_format, url_map[req_format])] # Specific format
2884 for format_param, video_real_url in video_url_list:
2886 # At this point we have a new video
2887 self._downloader.increment_downloads()
2890 video_extension = self._video_extensions.get(format_param, 'mp4')
2893 # Process video information
2894 self._downloader.process_info({
2895 'id': video_id.decode('utf-8'),
2896 'url': video_real_url.decode('utf-8'),
2897 'uploader': video_uploader.decode('utf-8'),
2898 'upload_date': upload_date,
2899 'title': video_title,
2900 'stitle': simple_title,
2901 'ext': video_extension.decode('utf-8'),
2902 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2903 'thumbnail': video_thumbnail.decode('utf-8'),
2904 'description': video_description.decode('utf-8'),
2907 except UnavailableVideoError, err:
2908 self._downloader.trouble(u'\nERROR: unable to download video')
2910 class BlipTVIE(InfoExtractor):
2911 """Information extractor for blip.tv"""
2913 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2914 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2918 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2920 def report_extraction(self, file_id):
2921 """Report information extraction."""
2922 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2924 def _simplify_title(self, title):
2925 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2926 res = res.strip(ur'_')
2929 def _real_extract(self, url):
2930 mobj = re.match(self._VALID_URL, url)
2932 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2939 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2940 request = urllib2.Request(json_url)
2941 self.report_extraction(mobj.group(1))
2943 json_code = urllib2.urlopen(request).read()
2944 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2945 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2948 json_data = json.loads(json_code)
2949 if 'Post' in json_data:
2950 data = json_data['Post']
2954 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2955 video_url = data['media']['url']
2956 umobj = re.match(self._URL_EXT, video_url)
2958 raise ValueError('Can not determine filename extension')
2959 ext = umobj.group(1)
2961 self._downloader.increment_downloads()
2964 'id': data['item_id'],
2966 'uploader': data['display_name'],
2967 'upload_date': upload_date,
2968 'title': data['title'],
2969 'stitle': self._simplify_title(data['title']),
2971 'format': data['media']['mimeType'],
2972 'thumbnail': data['thumbnailUrl'],
2973 'description': data['description'],
2974 'player_url': data['embedUrl']
2976 except (ValueError,KeyError), err:
2977 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2981 self._downloader.process_info(info)
2982 except UnavailableVideoError, err:
2983 self._downloader.trouble(u'\nERROR: unable to download video')
2986 class MyVideoIE(InfoExtractor):
2987 """Information Extractor for myvideo.de."""
2989 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2991 def __init__(self, downloader=None):
2992 InfoExtractor.__init__(self, downloader)
2996 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2998 def report_download_webpage(self, video_id):
2999 """Report webpage download."""
3000 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3002 def report_extraction(self, video_id):
3003 """Report information extraction."""
3004 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3006 def _real_initialize(self):
3009 def _real_extract(self,url):
3010 mobj = re.match(self._VALID_URL, url)
3012 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3015 video_id = mobj.group(1)
3016 simple_title = mobj.group(2).decode('utf-8')
3017 # should actually not be necessary
3018 simple_title = sanitize_title(simple_title)
3019 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3022 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3024 self.report_download_webpage(video_id)
3025 webpage = urllib2.urlopen(request).read()
3026 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3027 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3030 self.report_extraction(video_id)
3031 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3034 self._downloader.trouble(u'ERROR: unable to extract media URL')
3036 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3038 mobj = re.search('<title>([^<]+)</title>', webpage)
3040 self._downloader.trouble(u'ERROR: unable to extract title')
3043 video_title = mobj.group(1)
3044 video_title = sanitize_title(video_title)
3048 self._downloader.process_info({
3052 'upload_date': u'NA',
3053 'title': video_title,
3054 'stitle': simple_title,
3059 except UnavailableVideoError:
3060 self._downloader.trouble(u'\nERROR: Unable to download video')
3062 class ComedyCentralIE(InfoExtractor):
3063 """Information extractor for The Daily Show and Colbert Report """
3065 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3069 return (re.match(ComedyCentralIE._VALID_URL, url) is not None)
def report_extraction(self, episode_id):
    """Announce that information extraction has started for this episode."""
    message = u'[comedycentral] %s: Extracting information' % episode_id
    self._downloader.to_screen(message)
def report_config_download(self, episode_id):
    """Announce that the episode's configuration is being downloaded."""
    message = u'[comedycentral] %s: Downloading configuration' % episode_id
    self._downloader.to_screen(message)
def report_index_download(self, episode_id):
    """Announce that the show index is being downloaded."""
    message = u'[comedycentral] %s: Downloading show index' % episode_id
    self._downloader.to_screen(message)
def report_player_url(self, episode_id):
    """Announce that the player URL is being determined."""
    message = u'[comedycentral] %s: Determining player URL' % episode_id
    self._downloader.to_screen(message)
3083 def _simplify_title(self, title):
3084 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3085 res = res.strip(ur'_')
3088 def _real_extract(self, url):
3089 mobj = re.match(self._VALID_URL, url)
3091 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3094 if mobj.group('shortname'):
3095 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3096 url = 'http://www.thedailyshow.com/full-episodes/'
3098 url = 'http://www.colbertnation.com/full-episodes/'
3099 mobj = re.match(self._VALID_URL, url)
3100 assert mobj is not None
3102 dlNewest = not mobj.group('episode')
3104 epTitle = mobj.group('showname')
3106 epTitle = mobj.group('episode')
3108 req = urllib2.Request(url)
3109 self.report_extraction(epTitle)
3111 htmlHandle = urllib2.urlopen(req)
3112 html = htmlHandle.read()
3113 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3114 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3117 url = htmlHandle.geturl()
3118 mobj = re.match(self._VALID_URL, url)
3120 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3122 if mobj.group('episode') == '':
3123 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3125 epTitle = mobj.group('episode')
3127 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3128 if len(mMovieParams) == 0:
3129 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3132 playerUrl_raw = mMovieParams[0][0]
3133 self.report_player_url(epTitle)
3135 urlHandle = urllib2.urlopen(playerUrl_raw)
3136 playerUrl = urlHandle.geturl()
3137 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3138 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3141 uri = mMovieParams[0][1]
3142 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3143 self.report_index_download(epTitle)
3145 indexXml = urllib2.urlopen(indexUrl).read()
3146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3147 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3150 idoc = xml.etree.ElementTree.fromstring(indexXml)
3151 itemEls = idoc.findall('.//item')
3152 for itemEl in itemEls:
3153 mediaId = itemEl.findall('./guid')[0].text
3154 shortMediaId = mediaId.split(':')[-1]
3155 showId = mediaId.split(':')[-2].replace('.com', '')
3156 officialTitle = itemEl.findall('./title')[0].text
3157 officialDate = itemEl.findall('./pubDate')[0].text
3159 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3160 urllib.urlencode({'uri': mediaId}))
3161 configReq = urllib2.Request(configUrl)
3162 self.report_config_download(epTitle)
3164 configXml = urllib2.urlopen(configReq).read()
3165 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3166 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3169 cdoc = xml.etree.ElementTree.fromstring(configXml)
3171 for rendition in cdoc.findall('.//rendition'):
3172 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3176 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3179 # For now, just pick the highest bitrate
3180 format,video_url = turls[-1]
3182 self._downloader.increment_downloads()
3184 effTitle = showId + '-' + epTitle
3189 'upload_date': officialDate,
3191 'stitle': self._simplify_title(effTitle),
3195 'description': officialTitle,
3196 'player_url': playerUrl
3200 self._downloader.process_info(info)
3201 except UnavailableVideoError, err:
3202 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Extractor for escapistmagazine.com video pages.
# NOTE(review): interleaved source lines are missing from this excerpt
# ('return' after trouble() calls, 'try:' lines, the info-dict opener);
# comments describe only the visible statements.
3206 class EscapistIE(InfoExtractor):
3207 """Information extractor for The Escapist """
# Named groups: 'showname' (series slug) and 'episode' (video id/slug).
3209 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
# (body of suitable(): URL is handled iff it matches _VALID_URL)
3213 return (re.match(EscapistIE._VALID_URL, url) is not None)
# --- status reporting helpers ---
3215 def report_extraction(self, showName):
3216 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3218 def report_config_download(self, showName):
3219 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Collapse characters outside simple_title_chars into '_' and strip the
# result (filesystem-safe title).
3221 def _simplify_title(self, title):
3222 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3223 res = res.strip(ur'_')
3226 def _real_extract(self, url):
# Used below to unescape HTML entities found in meta-tag attributes.
3227 htmlParser = HTMLParser.HTMLParser()
3229 mobj = re.match(self._VALID_URL, url)
3231 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3233 showName = mobj.group('showname')
3234 videoId = mobj.group('episode')
3236 self.report_extraction(showName)
3238 webPage = urllib2.urlopen(url).read()
3239 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3240 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape description, thumbnail and Flash player URL from meta tags;
# the player URL embeds a percent-encoded 'config=' parameter pointing
# at the playlist configuration.
3243 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3244 description = htmlParser.unescape(descMatch.group(1))
3245 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3246 imgUrl = htmlParser.unescape(imgMatch.group(1))
3247 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3248 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3249 configUrlMatch = re.search('config=(.*)$', playerUrl)
3250 configUrl = urllib2.unquote(configUrlMatch.group(1))
3252 self.report_config_download(showName)
3254 configJSON = urllib2.urlopen(configUrl).read()
3255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3256 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3259 # Technically, it's JavaScript, not JSON
# Crude JS->JSON fixup: swap single quotes for double quotes so the
# stdlib parser accepts it (breaks if values contain quotes).
3260 configJSON = configJSON.replace("'", '"')
3263 config = json.loads(configJSON)
3264 except (ValueError,), err:
3265 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] is taken as the actual video entry -- presumably entry 0
# is an ad or intro; cannot confirm from this excerpt.
3268 playlist = config['playlist']
3269 videoUrl = playlist[1]['url']
3271 self._downloader.increment_downloads()
# Info dict fields (opener line missing from excerpt), then hand off to
# the downloader; failures are reported, not raised.
3275 'uploader': showName,
3276 'upload_date': None,
3278 'stitle': self._simplify_title(showName),
3281 'thumbnail': imgUrl,
3282 'description': description,
3283 'player_url': playerUrl,
3287 self._downloader.process_info(info)
3288 except UnavailableVideoError, err:
3289 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# Abstract base for post-download processing steps (see FFmpegExtractAudioPP
# below for a concrete subclass). Subclasses override run().
3293 class PostProcessor(object):
3294 """Post Processor class.
3296 PostProcessor objects can be added to downloaders with their
3297 add_post_processor() method. When the downloader has finished a
3298 successful download, it will take its internal chain of PostProcessors
3299 and start calling the run() method on each one of them, first with
3300 an initial argument and then with the returned value of the previous
3303 The chain will be stopped if one of them ever returns None or the end
3304 of the chain is reached.
3306 PostProcessor objects follow a "mutual registration" process similar
3307 to InfoExtractor objects.
# The owning FileDownloader; may be None until set_downloader() is called.
3312 def __init__(self, downloader=None):
3313 self._downloader = downloader
3315 def set_downloader(self, downloader):
3316 """Sets the downloader for this PP."""
3317 self._downloader = downloader
3319 def run(self, information):
3320 """Run the PostProcessor.
3322 The "information" argument is a dictionary like the ones
3323 composed by InfoExtractors. The only difference is that this
3324 one has an extra field called "filepath" that points to the
3327 When this method returns None, the postprocessing chain is
3328 stopped. However, this method may return an information
3329 dictionary that will be passed to the next postprocessing
3330 object in the chain. It can be the one it received after
3331 changing some fields.
3333 In addition, this method may raise a PostProcessingError
3334 exception that will be taken into account by the downloader
# Default implementation: pass the info dict through unchanged.
3337 return information # by default, do nothing
# Post-processor that converts a downloaded video file to audio-only using
# the external ffmpeg/ffprobe binaries.
# NOTE(review): several lines ('return' statements, '@staticmethod'
# decorators, 'else:' branches) are missing from this excerpt.
3340 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best' (keep source audio when aac/mp3), 'aac' or 'mp3'.
3342 def __init__(self, downloader=None, preferredcodec=None):
3343 PostProcessor.__init__(self, downloader)
3344 if preferredcodec is None:
3345 preferredcodec = 'best'
3346 self._preferredcodec = preferredcodec
# Probe the file with ffprobe and return the name of its audio codec
# (the codec_name= line preceding a codec_type=audio stanza).
3349 def get_audio_codec(path):
3351 cmd = ['ffprobe', '-show_streams', '--', path]
# stderr is discarded; stdout is parsed line by line below.
3352 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3353 output = handle.communicate()[0]
3354 if handle.wait() != 0:
# IOError/OSError here means ffprobe is missing or not runnable.
3356 except (IOError, OSError):
3359 for line in output.split('\n'):
3360 if line.startswith('codec_name='):
3361 audio_codec = line.split('=')[1].strip()
3362 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to transcode/remux 'path' into 'out_path' with the given
# audio codec and extra options; video is dropped ('-vn').
3367 def run_ffmpeg(path, out_path, codec, more_opts):
3369 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3370 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3372 except (IOError, OSError):
3375 def run(self, information):
3376 path = information['filepath']
3378 filecodec = self.get_audio_codec(path)
3379 if filecodec is None:
3380 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Codec selection: if the source audio already matches the preference
# (or preference is 'best') and is aac/mp3, copy it losslessly;
# otherwise transcode to the preferred codec at 128k.
3384 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3385 if filecodec == 'aac' or filecodec == 'mp3':
3386 # Lossless if possible
3388 extension = filecodec
3389 if filecodec == 'aac':
# Raw AAC needs an ADTS container to be playable standalone.
3390 more_opts = ['-f', 'adts']
3393 acodec = 'libmp3lame'
3395 more_opts = ['-ab', '128k']
3397 # We convert the audio (lossy)
3398 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3399 extension = self._preferredcodec
3400 more_opts = ['-ab', '128k']
3401 if self._preferredcodec == 'aac':
3402 more_opts += ['-f', 'adts']
# Write alongside the source, swapping the extension.
3404 (prefix, ext) = os.path.splitext(path)
3405 new_path = prefix + '.' + extension
3406 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3407 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3410 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Best-effort removal of the original video file; failure only warns.
3415 except (IOError, OSError):
3416 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Propagate the new path so later processors see the audio file.
3419 information['filepath'] = new_path
3423 def updateSelf(downloader, filename):
3424 ''' Update the program file with the latest version from the repository '''
3425 # Note: downloader only used for options
# Bail out early if we cannot rewrite our own script file.
3426 if not os.access(filename, os.W_OK):
3427 sys.exit('ERROR: no write permissions on %s' % filename)
3429 downloader.to_screen('Updating to latest version...')
# Fetch the replacement script from UPDATE_URL (try/finally lines are
# missing from this excerpt).
3433 urlh = urllib.urlopen(UPDATE_URL)
3434 newcontent = urlh.read()
3437 except (IOError, OSError), err:
3438 sys.exit('ERROR: unable to download latest version')
# Overwrite this script in binary mode with the downloaded content.
3441 outf = open(filename, 'wb')
3443 outf.write(newcontent)
3446 except (IOError, OSError), err:
3447 sys.exit('ERROR: unable to overwrite current version')
3449 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Custom optparse option formatter: joins the first short and first long
# form with ', ' and appends the metavar for value-taking options.
# NOTE(review): the 'opts = []' initialisation line is missing from this
# excerpt.
3456 def _format_option_string(option):
3457 ''' ('-o', '--option') -> -o, --format METAVAR'''
3461 if option._short_opts: opts.append(option._short_opts[0])
3462 if option._long_opts: opts.append(option._long_opts[0])
# Separator only needed when both a short and a long form were added.
3463 if len(opts) > 1: opts.insert(1, ', ')
3465 if option.takes_value(): opts.append(' %s' % option.metavar)
3467 return "".join(opts)
# Determine the terminal width: prefer the COLUMNS environment variable,
# otherwise ask 'stty size' (rows cols) and take the second field.
# NOTE(review): the 'try:'/fallback-return lines are missing from this
# excerpt.
3469 def _find_term_columns():
3470 columns = os.environ.get('COLUMNS', None)
3475 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3476 out,err = sp.communicate()
3477 return int(out.split()[1])
# Remainder of parseOpts(): builds the optparse parser, declares all option
# groups and options, and returns (parser, opts, args).
# Wide help column so long option strings don't wrap their help text.
3483 max_help_position = 80
3485 # No need to wrap help messages if we're on a wide console
3486 columns = _find_term_columns()
3487 if columns: max_width = columns
# Install the custom option-string formatter defined above.
3489 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3490 fmt.format_option_strings = _format_option_string
3493 'version' : __version__,
3495 'usage' : '%prog [options] url [url...]',
3496 'conflict_handler' : 'resolve',
3499 parser = optparse.OptionParser(**kw)
# One OptionGroup per help-screen section.
3502 general = optparse.OptionGroup(parser, 'General Options')
3503 selection = optparse.OptionGroup(parser, 'Video Selection')
3504 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3505 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3506 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3507 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3508 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3510 general.add_option('-h', '--help',
3511 action='help', help='print this help text and exit')
3512 general.add_option('-v', '--version',
3513 action='version', help='print program version and exit')
3514 general.add_option('-U', '--update',
3515 action='store_true', dest='update_self', help='update this program to latest version')
3516 general.add_option('-i', '--ignore-errors',
3517 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3518 general.add_option('-r', '--rate-limit',
3519 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3520 general.add_option('-R', '--retries',
3521 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3522 general.add_option('--dump-user-agent',
3523 action='store_true', dest='dump_user_agent',
3524 help='display the current browser identification', default=False)
# Playlist range selection and title-based filters.
3526 selection.add_option('--playlist-start',
3527 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3528 selection.add_option('--playlist-end',
3529 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3530 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3531 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3533 authentication.add_option('-u', '--username',
3534 dest='username', metavar='USERNAME', help='account username')
3535 authentication.add_option('-p', '--password',
3536 dest='password', metavar='PASSWORD', help='account password')
3537 authentication.add_option('-n', '--netrc',
3538 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3541 video_format.add_option('-f', '--format',
3542 action='store', dest='format', metavar='FORMAT', help='video format code')
3543 video_format.add_option('--all-formats',
3544 action='store_const', dest='format', help='download all available video formats', const='-1')
3545 video_format.add_option('--max-quality',
3546 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# Simulation flags: each implies quiet + no download (enforced later in
# the FileDownloader configuration).
3549 verbosity.add_option('-q', '--quiet',
3550 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3551 verbosity.add_option('-s', '--simulate',
3552 action='store_true', dest='simulate', help='do not download video', default=False)
3553 verbosity.add_option('-g', '--get-url',
3554 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3555 verbosity.add_option('-e', '--get-title',
3556 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3557 verbosity.add_option('--get-thumbnail',
3558 action='store_true', dest='getthumbnail',
3559 help='simulate, quiet but print thumbnail URL', default=False)
3560 verbosity.add_option('--get-description',
3561 action='store_true', dest='getdescription',
3562 help='simulate, quiet but print video description', default=False)
3563 verbosity.add_option('--get-filename',
3564 action='store_true', dest='getfilename',
3565 help='simulate, quiet but print output filename', default=False)
3566 verbosity.add_option('--no-progress',
3567 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3568 verbosity.add_option('--console-title',
3569 action='store_true', dest='consoletitle',
3570 help='display progress in console titlebar', default=False)
3573 filesystem.add_option('-t', '--title',
3574 action='store_true', dest='usetitle', help='use title in file name', default=False)
3575 filesystem.add_option('-l', '--literal',
3576 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3577 filesystem.add_option('-A', '--auto-number',
3578 action='store_true', dest='autonumber',
3579 help='number downloaded files starting from 00000', default=False)
3580 filesystem.add_option('-o', '--output',
3581 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3582 filesystem.add_option('-a', '--batch-file',
3583 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3584 filesystem.add_option('-w', '--no-overwrites',
3585 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3586 filesystem.add_option('-c', '--continue',
3587 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3588 filesystem.add_option('--cookies',
3589 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3590 filesystem.add_option('--no-part',
3591 action='store_true', dest='nopart', help='do not use .part files', default=False)
3592 filesystem.add_option('--no-mtime',
3593 action='store_false', dest='updatetime',
3594 help='do not use the Last-modified header to set the file modification time', default=True)
3595 filesystem.add_option('--write-description',
3596 action='store_true', dest='writedescription',
3597 help='write video description to a .description file', default=False)
3598 filesystem.add_option('--write-info-json',
3599 action='store_true', dest='writeinfojson',
3600 help='write video metadata to a .info.json file', default=False)
3603 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3604 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3605 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3606 help='"best", "aac" or "mp3"; best by default')
# Registration order determines the order groups appear in --help.
3609 parser.add_option_group(general)
3610 parser.add_option_group(selection)
3611 parser.add_option_group(filesystem)
3612 parser.add_option_group(verbosity)
3613 parser.add_option_group(video_format)
3614 parser.add_option_group(authentication)
3615 parser.add_option_group(postproc)
3617 opts, args = parser.parse_args()
3619 return parser, opts, args
# Body of the program entry point: parse options, validate them, build the
# FileDownloader with all extractors/post-processors, then download.
# NOTE(review): several lines ('try:'/'else:'/'sys.exit(...)' and some list
# entries) are missing from this excerpt.
3622 parser, opts, args = parseOpts()
3624 # Open appropriate CookieJar
# In-memory jar by default; a Mozilla-format file jar when --cookies is
# given (loaded only if the file already exists and is readable).
3625 if opts.cookiefile is None:
3626 jar = cookielib.CookieJar()
3629 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3630 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3632 except (IOError, OSError), err:
3633 sys.exit(u'ERROR: unable to open cookie file')
3636 if opts.dump_user_agent:
3637 print std_headers['User-Agent']
3640 # General configuration
# Install a global urllib2 opener with proxy + cookie handling.
3641 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3642 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3643 urllib2.install_opener(opener)
3644 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3646 # Batch file verification
# Read extra URLs from --batch-file ('-' means stdin); blank lines and
# lines starting with '#', '/' or ';' are treated as comments.
3648 if opts.batchfile is not None:
3650 if opts.batchfile == '-':
3653 batchfd = open(opts.batchfile, 'r')
3654 batchurls = batchfd.readlines()
3655 batchurls = [x.strip() for x in batchurls]
3656 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3658 sys.exit(u'ERROR: batch file could not be read')
3659 all_urls = batchurls + args
3661 # Conflicting, missing and erroneous options
3662 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3663 parser.error(u'using .netrc conflicts with giving username/password')
3664 if opts.password is not None and opts.username is None:
3665 parser.error(u'account username missing')
3666 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3667 parser.error(u'using output template conflicts with using title, literal title or auto number')
3668 if opts.usetitle and opts.useliteral:
3669 parser.error(u'using title conflicts with using literal title')
# Prompt interactively for a password when only a username was given.
3670 if opts.username is not None and opts.password is None:
3671 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalise string option values into numbers, rejecting bad input.
3672 if opts.ratelimit is not None:
3673 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3674 if numeric_limit is None:
3675 parser.error(u'invalid rate limit specified')
3676 opts.ratelimit = numeric_limit
3677 if opts.retries is not None:
3679 opts.retries = long(opts.retries)
3680 except (TypeError, ValueError), err:
3681 parser.error(u'invalid retry count specified')
3683 opts.playliststart = int(opts.playliststart)
3684 if opts.playliststart <= 0:
3685 raise ValueError(u'Playlist start must be positive')
3686 except (TypeError, ValueError), err:
3687 parser.error(u'invalid playlist start number specified')
3689 opts.playlistend = int(opts.playlistend)
# -1 means "until the end"; otherwise the end must be a positive number
# not before the start.
3690 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3691 raise ValueError(u'Playlist end must be greater than playlist start')
3692 except (TypeError, ValueError), err:
3693 parser.error(u'invalid playlist end number specified')
3694 if opts.extractaudio:
3695 if opts.audioformat not in ['best', 'aac', 'mp3']:
3696 parser.error(u'invalid audio format specified')
3698 # Information extractors
# Some extractors wrap others (e.g. MetacafeIE defers YouTube links to
# YoutubeIE), hence the shared instances; list order is match priority.
3699 youtube_ie = YoutubeIE()
3700 google_ie = GoogleIE()
3701 yahoo_ie = YahooIE()
3702 extractors = [ # Order does matter
3704 MetacafeIE(youtube_ie),
3706 YoutubePlaylistIE(youtube_ie),
3707 YoutubeUserIE(youtube_ie),
3708 YoutubeSearchIE(youtube_ie),
3710 GoogleSearchIE(google_ie),
3713 YahooSearchIE(yahoo_ie),
# File downloader: any --get-* flag implies both quiet and simulate.
3726 fd = FileDownloader({
3727 'usenetrc': opts.usenetrc,
3728 'username': opts.username,
3729 'password': opts.password,
3730 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3731 'forceurl': opts.geturl,
3732 'forcetitle': opts.gettitle,
3733 'forcethumbnail': opts.getthumbnail,
3734 'forcedescription': opts.getdescription,
3735 'forcefilename': opts.getfilename,
3736 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3737 'format': opts.format,
3738 'format_limit': opts.format_limit,
# Output template: explicit -o wins; otherwise pick a default template
# based on the --all-formats/--title/--literal/--auto-number combination,
# falling back to '%(id)s.%(ext)s'.
3739 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3740 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3741 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3742 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3743 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3744 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3745 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3746 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3747 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3748 or u'%(id)s.%(ext)s'),
3749 'ignoreerrors': opts.ignoreerrors,
3750 'ratelimit': opts.ratelimit,
3751 'nooverwrites': opts.nooverwrites,
3752 'retries': opts.retries,
3753 'continuedl': opts.continue_dl,
3754 'noprogress': opts.noprogress,
3755 'playliststart': opts.playliststart,
3756 'playlistend': opts.playlistend,
# Writing output to stdout ('-o -') forces logging to stderr.
3757 'logtostderr': opts.outtmpl == '-',
3758 'consoletitle': opts.consoletitle,
3759 'nopart': opts.nopart,
3760 'updatetime': opts.updatetime,
3761 'writedescription': opts.writedescription,
3762 'writeinfojson': opts.writeinfojson,
3763 'matchtitle': opts.matchtitle,
3764 'rejecttitle': opts.rejecttitle,
3766 for extractor in extractors:
3767 fd.add_info_extractor(extractor)
3770 if opts.extractaudio:
3771 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3774 if opts.update_self:
3775 updateSelf(fd, sys.argv[0])
# -U with no URLs is a valid invocation; anything else needs >= 1 URL.
3778 if len(all_urls) < 1:
3779 if not opts.update_self:
3780 parser.error(u'you must provide at least one URL')
3783 retcode = fd.download(all_urls)
3785 # Dump cookie jar if requested
3786 if opts.cookiefile is not None:
3789 except (IOError, OSError), err:
3790 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run the program and map known exceptions to exit
# messages. NOTE(review): the 'try:' line and the call into the main routine
# are missing from this excerpt.
3795 if __name__ == '__main__':
# DownloadError is already reported by the downloader, so exit silently.
3798 except DownloadError:
3800 except SameFileError:
3801 sys.exit(u'ERROR: fixed output name but more than one file to download')
3802 except KeyboardInterrupt:
3803 sys.exit(u'\nERROR: Interrupted by user')
3805 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: