2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.16'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
			# --- trivialjson fallback parser, used when the `json` module is
			# --- unavailable (Python < 2.6).
			# NOTE(review): this excerpt is fragmentary; elided source lines
			# are marked with [...] comments below.
			def raiseError(msg, i):
				# Abort parsing with a ValueError pinpointing the failing
				# offset and echoing the unparsed remainder of the input.
				raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
			def skipSpace(i, expectMore=True):
				# Advance past JSON whitespace; when expectMore is set,
				# running off the end of the input is an error.
				while i < len(s) and s[i] in ' \t\r\n':
					# [...] excerpt elides lines here (loop body / end-of-input guard)
					raiseError('Premature end', i)
			def decodeEscape(match):
				# Decode a single backslash escape from a JSON string.
				# [...] excerpt elides lines here (simple escape table; \uXXXX length check)
					return unichr(int(esc[1:5], 16))
				if len(esc) == 5+6 and esc[5:7] == '\\u':
					# Surrogate pair: combine the high and low halves into a
					# single non-BMP code point.
					hi = int(esc[1:5], 16)
					low = int(esc[7:11], 16)
					return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
				raise ValueError('Unknown escape ' + str(esc))
			# [...] excerpt elides lines here (parseString header: scan for the
			# closing quote, counting the backslashes that precede it)
				while s[e-bslashes-1] == '\\':
					# [...] excerpt elides lines here
				if bslashes % 2 == 1:
					# [...] excerpt elides lines here (quote was escaped: keep scanning)
				# Substitute every escape sequence in the raw string body; the
				# pattern also matches whole surrogate-pair escapes at once.
				rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
				stri = rexp.sub(decodeEscape, s[i:e])
			# [...] excerpt elides lines here (parseObj header)
				if s[i] == '}': # Empty dictionary
					# [...] excerpt elides lines here (key/value loop header)
					raiseError('Expected a string object key', i)
				i,key = parseString(i)
				# [...] excerpt elides lines here (whitespace skip)
				if i >= len(s) or s[i] != ':':
					raiseError('Expected a colon', i)
				# [...] excerpt elides lines here (value parse / loop tail)
					raiseError('Expected comma or closing curly brace', i)
			# [...] excerpt elides lines here (parseArray header)
				if s[i] == ']': # Empty array
					# [...] excerpt elides lines here (element parse)
				i = skipSpace(i) # Raise exception if premature end
				# [...] excerpt elides lines here
					raiseError('Expected a comma or closing bracket', i)
			def parseDiscrete(i):
				# Parse the bare literals true / false / null.
				for k,v in {'true': True, 'false': False, 'null': None}.items():
					if s.startswith(k, i):
						# [...] excerpt elides lines here (return matched value)
				raiseError('Not a boolean (or null)', i)
			# [...] excerpt elides lines here (parseNumber header)
				mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
				# [...] excerpt elides lines here (no-match guard)
					raiseError('Not a number', i)
				# A decimal point or exponent marker means float, else int.
				if '.' in nums or 'e' in nums or 'E' in nums:
					return (i+len(nums), float(nums))
				return (i+len(nums), int(nums))
			# Dispatch table: the first character of a value selects its
			# parser; anything unrecognized is treated as a number.
			CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
			# [...] excerpt elides lines here (top-level parse driver)
				i,res = CHARMAP.get(s[i], parseNumber)(i)
				i = skipSpace(i, False)
			# [...] excerpt elides lines here (trailing-garbage check)
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original routed this value through a one-shot infinite generator
	# and fetched it with .next(); that indirection added nothing and the
	# .next() call is not forward-compatible, so compute and return the
	# encoding directly instead.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually works before trusting it.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	# [...] excerpt elides lines here (rest of docstring)

	It returns the tuple (stream, definitive_file_name).
	# [...] excerpt elides lines here (docstring close; try; the special
	# case that maps filename u'-' onto stdout)
		if sys.platform == 'win32':
			# [...] excerpt elides a line here (msvcrt import)
			# Switch stdout to binary mode on Windows so video data is not
			# newline-mangled.
			msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
		return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp"""
	# parsedate_tz returns None for unparseable input; propagate that.
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is None:
		return None
	return email.utils.mktime_tz(timetuple)
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when they are not configured to
	ignore download errors; carries the relevant error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when more than one of the requested
	downloads would end up in the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised from a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video was requested in a format that is not available
	for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when the file they downloaded is
	smaller than the size the server announced, which indicates the
	connection was probably interrupted.
	"""
	# Both attributes are byte counts, used by callers to build the
	# error report.
	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:
	# [...] excerpt elides a line here
	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	# [...] excerpt elides lines here (docstring close; deflate() header)
			return zlib.decompress(data, -zlib.MAX_WBITS)
		# Fallback for servers that send zlib-wrapped deflate data instead
		# of a raw deflate stream.
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Older addinfourl implementations lack the `code` constructor
		# argument (no getcode attribute); emulate it there.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		# [...] excerpt elides lines here (set code attribute; return)

	def http_request(self, req):
		# Install the standard headers on every outgoing request.
		for h in std_headers:
			# [...] excerpt elides lines here (replace pre-existing value)
			req.add_header(h, std_headers[h])
		# The private no-compression marker also strips Accept-encoding so
		# the server sends an identity body; the marker itself must not
		# reach the wire.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		# [...] excerpt elides lines here (return req)

	def http_response(self, req, resp):
		# [...] excerpt elides lines here (old_resp keeps the original)
		# gzip-encoded body: wrap the payload in a GzipFile and rebuild the
		# response object around it, preserving headers/url/code/msg.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate-encoded body: same rebuild via the deflate() helper.
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	# [...] excerpt elides lines here ("Available options:" heading)
	username: Username for authentication purposes.
	password: Password for authentication purposes.
	usenetrc: Use netrc for authentication instead.
	quiet: Do not print messages to stdout.
	forceurl: Force printing final URL.
	forcetitle: Force printing title.
	forcethumbnail: Force printing thumbnail URL.
	forcedescription: Force printing description.
	forcefilename: Force printing final filename.
	simulate: Do not download the video files.
	format: Video format code.
	format_limit: Highest quality format to try.
	outtmpl: Template for output names.
	ignoreerrors: Do not stop on download errors.
	ratelimit: Download speed limit, in bytes/sec.
	nooverwrites: Prevent overwriting files.
	retries: Number of times to retry for HTTP error 5xx
	continuedl: Try to continue downloads if possible.
	noprogress: Do not print the progress bar.
	playliststart: Playlist item to start at.
	playlistend: Playlist item to end at.
	matchtitle: Download only matching titles.
	rejecttitle: Reject downloads for matching titles.
	logtostderr: Log messages to stderr instead of stdout.
	consoletitle: Display progress in console window's titlebar.
	nopart: Do not use temporary .part files.
	updatetime: Use the Last-modified header to set output file timestamps.
	writedescription: Write the video description to a .description file
	writeinfojson: Write the video description to a .info.json file
	# [...] excerpt elides lines here (docstring close; other class attrs)

	# Exit status of the last batch, and running count of downloaded
	# files (the count feeds the %(autonumber)s output-template field).
	_download_retcode = None
	_num_downloads = None
	# [...] excerpt elides lines here

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		# [...] excerpt elides lines here (extractor/postprocessor lists)
		self._download_retcode = 0
		self._num_downloads = 0
		# The logtostderr option selects which stream screen output uses.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		# [...] excerpt elides lines here (params attribute assignment)
	# NOTE(review): this excerpt is fragmentary; the @staticmethod
	# decorators and several statements below are elided and marked [...].
	def format_bytes(bytes):
		# Render a byte count as e.g. '1.23M' using 1024-based suffixes.
		# [...] excerpt elides lines here (None / zero handling)
		if type(bytes) is str:
			# [...] excerpt elides lines here (string-to-float conversion)
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024 ** exponent)
		return '%.2f%s' % (converted, suffix)

	def calc_percent(byte_counter, data_len):
		# Format download progress as a fixed-width percentage string.
		# [...] excerpt elides lines here (unknown-length guard)
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	def calc_eta(start, now, total, current):
		# Estimate remaining time from the bytes transferred so far.
		# [...] excerpt elides lines here (elapsed-time computation, guards)
		if current == 0 or dif < 0.001: # One millisecond
			# [...] excerpt elides lines here (placeholder return)
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		# [...] excerpt elides lines here (display cap)
		return '%02d:%02d' % (eta_mins, eta_secs)

	def calc_speed(start, now, bytes):
		# Format the average transfer speed, '---b/s' when unknown.
		# [...] excerpt elides lines here (elapsed-time computation)
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	def best_block_size(elapsed_time, bytes):
		# Adapt the next read size to the observed rate, clamped to
		# [new_min, new_max].
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			# [...] excerpt elides lines here (fast path)
		rate = bytes / elapsed_time
		# [...] excerpt elides lines here (rate-based selection)

	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		# [...] excerpt elides lines here (no-match guard)
		number = float(matchobj.group(1))
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		# [...] excerpt elides lines here (append to internal list)
		# Registration is mutual: the IE learns its downloader here.
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		# [...] excerpt elides lines here (append to internal chain)
		pp.set_downloader(self)

	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		# [...] excerpt elides lines here (try)
		if not self.params.get('quiet', False):
			terminator = [u'\n', u''][skip_eol]
			print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				# [...] excerpt elides lines here (re-raise)

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			# [...] excerpt elides lines here (early return)
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# "Fixed" means no %(field)s interpolation appears in outtmpl.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		# [...] excerpt elides lines here (docstring close)
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			# [...] excerpt elides lines here (return; take current time)
		elapsed = now - start_time
		# [...] excerpt elides lines here (non-positive elapsed guard)
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough to bring the average back under the
			# configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def temp_name(self, filename):
		"""Returns a temporary filename for the given filename."""
		if self.params.get('nopart', False) or filename == u'-' or \
				(os.path.exists(filename) and not os.path.isfile(filename)):
			# [...] excerpt elides lines here (use the name unchanged)
		return filename + u'.part'

	def undo_temp_name(self, filename):
		# Strip the '.part' suffix that temp_name() appended.
		if filename.endswith(u'.part'):
			return filename[:-len(u'.part')]
		# [...] excerpt elides lines here (return unchanged name)

	def try_rename(self, old_filename, new_filename):
		# Best-effort rename: failures are reported via trouble(), not
		# raised directly.
		# [...] excerpt elides lines here (try)
		if old_filename == new_filename:
			# [...] excerpt elides lines here (nothing to do)
		os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')

	def try_utime(self, filename, last_modified_hdr):
		"""Try to set the last-modified time of the given file."""
		if last_modified_hdr is None:
			# [...] excerpt elides lines here (return)
		if not os.path.isfile(filename):
			# [...] excerpt elides lines here (return)
		timestr = last_modified_hdr
		# [...] excerpt elides lines here (guards)
		filetime = timeconvert(timestr)
		# [...] excerpt elides lines here (None guard; try)
		os.utime(filename, (time.time(), filetime))
		# [...] excerpt elides lines here (swallow failure; return value)

	def report_writedescription(self, descfn):
		""" Report that the description file is being written """
		self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)

	def report_writeinfojson(self, infofn):
		""" Report that the metadata file has been written """
		self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			# [...] excerpt elides lines here (return)
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		# [...] excerpt elides lines here (try)
		self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a filename-free message for unencodable names.
			self.to_screen(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		# [...] excerpt elides lines here (non-quiet completion output)

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def prepare_filename(self, info_dict):
		"""Generate the output filename."""
		# [...] excerpt elides lines here (try)
		template_dict = dict(info_dict)
		# Extra template fields available beyond the info_dict keys.
		template_dict['epoch'] = unicode(long(time.time()))
		template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
		filename = self.params['outtmpl'] % template_dict
		# [...] excerpt elides lines here (return filename)
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			# [...] excerpt elides lines here (error return)
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		filename = self.prepare_filename(info_dict)
		# [...] excerpt elides lines here

		# Forced printings: honor the force* options before anything else.
		if self.params.get('forcetitle', False):
			print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceurl', False):
			print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
			print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcedescription', False) and 'description' in info_dict:
			print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forcefilename', False) and filename is not None:
			print filename.encode(preferredencoding(), 'xmlcharrefreplace')
		if self.params.get('forceformat', False):
			print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# [...] excerpt elides lines here (return; filename None guard)

		# Title filtering against the matchtitle / rejecttitle patterns.
		matchtitle=self.params.get('matchtitle',False)
		rejecttitle=self.params.get('rejecttitle',False)
		title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
		if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
			# [...] excerpt elides lines here (skip this video)
		if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
			self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
			# [...] excerpt elides lines here (skip this video)

		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			# [...] excerpt elides lines here (return; try)

		# Make sure the destination directory exists.
		dn = os.path.dirname(filename)
		if dn != '' and not os.path.exists(dn):
			# [...] excerpt elides lines here (create it)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directory ' + unicode(err))
			# [...] excerpt elides lines here (abort)

		if self.params.get('writedescription', False):
			# [...] excerpt elides lines here (try)
			descfn = filename + '.description'
			self.report_writedescription(descfn)
			descfile = open(descfn, 'wb')
			# [...] excerpt elides lines here (try)
			descfile.write(info_dict['description'].encode('utf-8'))
			# [...] excerpt elides lines here (close the file)
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write description file ' + descfn)
				# [...] excerpt elides lines here (abort)

		if self.params.get('writeinfojson', False):
			infofn = filename + '.info.json'
			self.report_writeinfojson(infofn)
			# [...] excerpt elides lines here (probe the json module)
			except (NameError,AttributeError):
				self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
				# [...] excerpt elides lines here (abort; try)
			infof = open(infofn, 'wb')
			# [...] excerpt elides lines here (try)
			json.dump(info_dict, infof)
			# [...] excerpt elides lines here (close the file)
			except (OSError, IOError):
				self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
				# [...] excerpt elides lines here (abort)

		if not self.params.get('skip_download', False):
			# [...] excerpt elides lines here (try)
			success,add_data = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
			info_dict.update(add_data)
			except (OSError, IOError), err:
				raise UnavailableVideoError
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self.trouble(u'ERROR: unable to download video data: %s' % str(err))
				# [...] excerpt elides lines here (abort)
			except (ContentTooShortError, ), err:
				self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
				# [...] excerpt elides lines here (abort)

			# [...] excerpt elides lines here (on success, run the
			# postprocessing chain; try)
			self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				# [...] excerpt elides lines here (abort)

	def download(self, url_list):
		"""Download a given list of URLs."""
		# With a fixed template, several URLs would collide on one file.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		# [...] excerpt elides lines here (iterate over url_list and the
		# registered InfoExtractors)
			suitable_found = False
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					# [...] excerpt elides lines here (continue)

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				# [...] excerpt elides lines here (run the extractor)

				# Suitable InfoExtractor had been found; go to next URL
				# [...] excerpt elides lines here (break)

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode
	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		# [...] excerpt elides lines here (copy of ie_info)
		info['filepath'] = filename
		# [...] excerpt elides lines here (run each postprocessor in turn)

	def _download_with_rtmpdump(self, filename, url, player_url):
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		# [...] excerpt elides lines here (try)
		subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			# [...] excerpt elides lines here (failure return)

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# [...] excerpt elides lines here (no progress: stop)
			# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
			if prevsize == cursize and retval == 2 and cursize > 1024:
				self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
				# [...] excerpt elides lines here (treat as success; stop)
		# [...] excerpt elides lines here (success path when retval == 0)
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			# [...] excerpt elides lines here (return True; else branch)
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			# [...] excerpt elides lines here (failure return)

	def _do_download(self, filename, url, player_url):
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			# [...] excerpt elides lines here (early return)

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		# [...] excerpt elides lines here (output stream starts unset)

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		# basic_request is kept without the Range header so it can probe
		# the full length when a resume attempt fails with 416.
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		# [...] excerpt elides lines here (else zero; default open mode)
		if self.params.get('continuedl', False):
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			# [...] excerpt elides lines here (append mode / reset)

		# [...] excerpt elides lines here (retry counter init)
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			# [...] excerpt elides lines here (try)
			data = urllib2.urlopen(request)
			# [...] excerpt elides lines here (success: leave the loop)
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					# [...] excerpt elides lines here (re-raise)
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					# [...] excerpt elides lines here (try)
					# Open the connection again without the range header
					data = urllib2.urlopen(basic_request)
					content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							# [...] excerpt elides lines here (re-raise)
					# [...] excerpt elides lines here (else branch)
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							# [...] excerpt elides lines here (success return)
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							# [...] excerpt elides lines here (reset open
							# mode; retry bookkeeping)
			self.report_retry(count, retries)

		# [...] excerpt elides lines here (retries exhausted)
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			# [...] excerpt elides lines here (failure return)

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		# [...] excerpt elides lines here (block size / timer init; loop)
			data_block = data.read(block_size)
			# [...] excerpt elides lines here (timing)
			if len(data_block) == 0:
				# [...] excerpt elides lines here (end of stream)
			byte_counter += len(data_block)

			# Open file just in time
			# [...] excerpt elides lines here (only on first block; try)
			(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
			assert stream is not None
			# sanitize_open may have altered the name, so recompute the
			# final filename from the (possibly changed) temp name.
			filename = self.undo_temp_name(tmpfilename)
			self.report_destination(filename)
			except (OSError, IOError), err:
				self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
				# [...] excerpt elides lines here (failure return; try)
			stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				# [...] excerpt elides lines here (failure return)
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		# [...] excerpt elides lines here (never opening the stream means
		# no data at all arrived)
			self.trouble(u'\nERROR: Did not get any data blocks')
			# [...] excerpt elides lines here (failure return; close/report)
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		# [...] excerpt elides lines here (filetime defaults to None)
		if self.params.get('updatetime', True):
			filetime = self.try_utime(filename, data.info().get('last-modified', None))

		return True, {'filetime': filetime}
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id: Video identifier.
	url: Final video URL.
	uploader: Nickname of the video uploader.
	title: Literal title.
	stitle: Simplified title.
	ext: Video filename extension.
	format: Video format.
	player_url: SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail: Full URL to a video thumbnail image.
	description: One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods and define a _VALID_URL regexp.
	Probably, they should also be added to the list of extractors.
	# [...] excerpt elides lines here (docstring close; class attributes)

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		# [...] excerpt elides lines here (reset the ready flag)
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Receives a URL and returns True if suitable for this IE."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# [...] excerpt elides lines here (runs only once, guarded by a flag)
		self._real_initialize()
		# [...] excerpt elides lines here (mark as initialized)

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		# [...] excerpt elides lines here (ensure initialized first)
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		# No-op in the base class.

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		# No-op in the base class.
# NOTE(review): lines in this class carry fused original line numbers and the
# numbering jumps (1088→1094, 1133→1138, …): try:/return/else: lines, blank
# lines and dict entries are elided from this listing. Code kept byte-identical;
# only comments added.
1078 class YoutubeIE(InfoExtractor):
1079 """Information extractor for youtube.com."""
# Matches youtu.be short links, /v|embed|e/ paths and watch?v= URLs; the
# video id is captured by group 2 (see _real_extract). Uses a conditional
# group `(?(1)...)` so trailing junk is only allowed when a scheme/host matched.
1081 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1082 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1083 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1084 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name looked up in ~/.netrc when --netrc is used (see _real_initialize).
1085 _NETRC_MACHINE = 'youtube'
1086 # Listed in order of quality
1087 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
# itag -> file extension map; remaining entries and the closing brace are
# elided from this listing.
1088 _video_extensions = {
1094 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1098 IE_NAME = u'youtube'
# --- status-reporting helpers: thin wrappers over FileDownloader.to_screen ---
1100 def report_lang(self):
1101 """Report attempt to set language."""
1102 self._downloader.to_screen(u'[youtube] Setting language')
1104 def report_login(self):
1105 """Report attempt to log in."""
1106 self._downloader.to_screen(u'[youtube] Logging in')
1108 def report_age_confirmation(self):
1109 """Report attempt to confirm age."""
1110 self._downloader.to_screen(u'[youtube] Confirming age')
1112 def report_video_webpage_download(self, video_id):
1113 """Report attempt to download video webpage."""
1114 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1116 def report_video_info_webpage_download(self, video_id):
1117 """Report attempt to download video info webpage."""
1118 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1120 def report_information_extraction(self, video_id):
1121 """Report attempt to extract video information."""
1122 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1124 def report_unavailable_format(self, video_id, format):
# NOTE(review): docstring looks copy-pasted — the method reports an
# *unavailable format*, not an extracted URL.
1125 """Report extracted video URL."""
1126 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1128 def report_rtmp_download(self):
1129 """Indicate the download will use the RTMP protocol."""
1130 self._downloader.to_screen(u'[youtube] RTMP download detected')
# One-time setup: resolve credentials (CLI options, then .netrc), force the
# site language to English so regexes match, log in, and confirm age.
# All failures here are soft (warnings), except age confirmation.
1132 def _real_initialize(self):
1133 if self._downloader is None:
1138 downloader_params = self._downloader.params
1140 # Attempt to use provided username and password or .netrc data
1141 if downloader_params.get('username', None) is not None:
1142 username = downloader_params['username']
1143 password = downloader_params['password']
1144 elif downloader_params.get('usenetrc', False):
1146 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1147 if info is not None:
1151 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1152 except (IOError, netrc.NetrcParseError), err:
1153 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language cookie so scraped pages are English (best effort).
1157 request = urllib2.Request(self._LANG_URL)
1160 urllib2.urlopen(request).read()
1161 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1162 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1165 # No authentication to be performed
1166 if username is None:
# Log in by POSTing the signup form; remaining form fields elided here.
1171 'current_form': 'loginForm',
1173 'action_login': 'Log In',
1174 'username': username,
1175 'password': password,
1177 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1180 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
1181 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1182 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1184 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1185 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age (hard error on failure, unlike the warnings above).
1191 'action_confirm': 'Confirm',
1193 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1195 self.report_age_confirmation()
1196 age_results = urllib2.urlopen(request).read()
1197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1198 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Main extraction: watch page -> get_video_info -> metadata -> format
# selection -> process_info() per chosen format.
1201 def _real_extract(self, url):
1202 # Extract video id from URL
1203 mobj = re.match(self._VALID_URL, url)
1205 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 2 of _VALID_URL is the 11-char-style video id.
1207 video_id = mobj.group(2)
1210 self.report_video_webpage_download(video_id)
1211 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1213 video_webpage = urllib2.urlopen(request).read()
1214 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1215 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1218 # Attempt to extract SWF player URL
# The URL appears JSON-escaped in the page (http:\/\/...), hence the unescape.
1219 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1220 if mobj is not None:
1221 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several &el= variants of get_video_info until one yields a 'token'.
1226 self.report_video_info_webpage_download(video_id)
1227 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1228 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1229 % (video_id, el_type))
1230 request = urllib2.Request(video_info_url)
1232 video_info_webpage = urllib2.urlopen(request).read()
1233 video_info = parse_qs(video_info_webpage)
1234 if 'token' in video_info:
1236 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1237 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1239 if 'token' not in video_info:
1240 if 'reason' in video_info:
1241 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1243 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1246 # Start extracting information
1247 self.report_information_extraction(video_id)
# uploader
1250 if 'author' not in video_info:
1251 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1253 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
1256 if 'title' not in video_info:
1257 self._downloader.trouble(u'ERROR: unable to extract video title')
1259 video_title = urllib.unquote_plus(video_info['title'][0])
1260 video_title = video_title.decode('utf-8')
1261 video_title = sanitize_title(video_title)
# Collapse any run of non-ASCII-alphanumerics to a single underscore.
1264 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1265 simple_title = simple_title.strip(ur'_')
# thumbnail (optional — missing thumbnail is only a warning)
1268 if 'thumbnail_url' not in video_info:
1269 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1270 video_thumbnail = ''
1271 else: # don't panic if we can't find it
1272 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page, normalized, then tried against
# several locale-ish date formats.
1276 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1277 if mobj is not None:
1278 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1279 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1280 for expression in format_expressions:
1282 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: only fetched when the user asked for it; prefers the lxml
# parse of the page, falling back to the <meta> tag.
1290 video_description = u'No description available.'
1291 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1292 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1293 if mobj is not None:
1294 video_description = mobj.group(1).decode('utf-8')
1296 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1297 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1298 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1299 # TODO use another parser
# token (value assigned but unused in the visible lines — TODO confirm use)
1302 video_token = urllib.unquote_plus(video_info['token'][0])
1304 # Decide which formats to download
1305 req_format = self._downloader.params.get('format', None)
# RTMP streams carry no itag: (None, url) pair signals that downstream.
1307 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1308 self.report_rtmp_download()
1309 video_url_list = [(None, video_info['conn'][0])]
1310 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1311 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1312 url_data = [parse_qs(uds) for uds in url_data_strs]
1313 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1314 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
# --format-limit caps quality by truncating the ordered format list.
1316 format_limit = self._downloader.params.get('format_limit', None)
1317 if format_limit is not None and format_limit in self._available_formats:
1318 format_list = self._available_formats[self._available_formats.index(format_limit):]
1320 format_list = self._available_formats
1321 existing_formats = [x for x in format_list if x in url_map]
1322 if len(existing_formats) == 0:
1323 self._downloader.trouble(u'ERROR: no known formats available for video')
1325 if req_format is None or req_format == 'best':
1326 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1327 elif req_format == 'worst':
# NOTE(review): existing_formats[-1] would be the idiomatic spelling.
1328 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1329 elif req_format in ('-1', 'all'):
1330 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1332 # Specific formats. We pick the first in a slash-delimeted sequence.
1333 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1334 req_formats = req_format.split('/')
1335 video_url_list = None
1336 for rf in req_formats:
1338 video_url_list = [(rf, url_map[rf])]
1340 if video_url_list is None:
1341 self._downloader.trouble(u'ERROR: requested format not available')
1344 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1347 for format_param, video_real_url in video_url_list:
1348 # At this point we have a new video
1349 self._downloader.increment_downloads()
# Extension for the final filename; unknown itags default to flv.
1352 video_extension = self._video_extensions.get(format_param, 'flv')
1355 # Process video information
1356 self._downloader.process_info({
1357 'id': video_id.decode('utf-8'),
1358 'url': video_real_url.decode('utf-8'),
1359 'uploader': video_uploader.decode('utf-8'),
1360 'upload_date': upload_date,
1361 'title': video_title,
1362 'stitle': simple_title,
1363 'ext': video_extension.decode('utf-8'),
# Pre-Python-2.5 `cond and a or b` idiom; safe only because u'NA' is truthy.
1364 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1365 'thumbnail': video_thumbnail.decode('utf-8'),
1366 'description': video_description,
1367 'player_url': player_url,
1369 except UnavailableVideoError, err:
1370 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing carries fused original line numbers and elides
# intermediate lines (try:/return/if mobj is None:, form fields, blank lines).
# Code kept byte-identical; only comments added.
1373 class MetacafeIE(InfoExtractor):
1374 """Information Extractor for metacafe.com."""
# Group 1: video id (may be the "yt-<id>" YouTube passthrough form);
# group 2: URL slug reused as the simplified title.
1376 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1377 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1378 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1380 IE_NAME = u'metacafe'
# Takes a YoutubeIE so yt-* ids can be delegated (see _real_extract).
1382 def __init__(self, youtube_ie, downloader=None):
1383 InfoExtractor.__init__(self, downloader)
1384 self._youtube_ie = youtube_ie
# --- status-reporting helpers ---
1386 def report_disclaimer(self):
1387 """Report disclaimer retrieval."""
1388 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1390 def report_age_confirmation(self):
1391 """Report attempt to confirm age."""
1392 self._downloader.to_screen(u'[metacafe] Confirming age')
1394 def report_download_webpage(self, video_id):
1395 """Report webpage download."""
1396 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1398 def report_extraction(self, video_id):
1399 """Report information extraction."""
1400 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# One-time setup: fetch the disclaimer page then POST the family-filter
# form to disable it for this session.
1402 def _real_initialize(self):
1403 # Retrieve disclaimer
1404 request = urllib2.Request(self._DISCLAIMER)
1406 self.report_disclaimer()
1407 disclaimer = urllib2.urlopen(request).read()
1408 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1409 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Remaining disclaimer_form fields elided from this listing.
1415 'submit': "Continue - I'm over 18",
1417 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1419 self.report_age_confirmation()
1420 disclaimer = urllib2.urlopen(request).read()
1421 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1422 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1425 def _real_extract(self, url):
1426 # Extract id and simplified title from URL
1427 mobj = re.match(self._VALID_URL, url)
1429 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1432 video_id = mobj.group(1)
1434 # Check if video comes from YouTube
# "yt-<id>" ids are hosted on YouTube — delegate to the YouTube extractor.
1435 mobj2 = re.match(r'^yt-(.*)$', video_id)
1436 if mobj2 is not None:
1437 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1440 # At this point we have a new video
1441 self._downloader.increment_downloads()
1443 simple_title = mobj.group(2).decode('utf-8')
1445 # Retrieve video webpage to extract further information
1446 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1448 self.report_download_webpage(video_id)
1449 webpage = urllib2.urlopen(request).read()
1450 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message is missing "to" ("unable retrieve"); runtime string,
# so not fixable in a comments-only edit.
1451 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1454 # Extract URL, uploader and title from webpage
1455 self.report_extraction(video_id)
# Legacy page layout: direct &mediaURL=, optionally signed with &gdaKey=.
1456 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1457 if mobj is not None:
1458 mediaURL = urllib.unquote(mobj.group(1))
1459 video_extension = mediaURL[-3:]
1461 # Extract gdaKey if available
1462 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1464 video_url = mediaURL
1466 gdaKey = mobj.group(1)
1467 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Newer layout: media URL + key live in the flashvars "mediaData" JSON blob.
1469 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1471 self._downloader.trouble(u'ERROR: unable to extract media URL')
1473 vardict = parse_qs(mobj.group(1))
1474 if 'mediaData' not in vardict:
1475 self._downloader.trouble(u'ERROR: unable to extract media URL')
1477 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1479 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape JSON "\/" sequences in the URL.
1481 mediaURL = mobj.group(1).replace('\\/', '/')
1482 video_extension = mediaURL[-3:]
1483 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1485 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1487 self._downloader.trouble(u'ERROR: unable to extract title')
1489 video_title = mobj.group(1).decode('utf-8')
1490 video_title = sanitize_title(video_title)
1492 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1494 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1496 video_uploader = mobj.group(1)
1499 # Process video information
1500 self._downloader.process_info({
1501 'id': video_id.decode('utf-8'),
1502 'url': video_url.decode('utf-8'),
1503 'uploader': video_uploader.decode('utf-8'),
1504 'upload_date': u'NA',
1505 'title': video_title,
1506 'stitle': simple_title,
1507 'ext': video_extension.decode('utf-8'),
1511 except UnavailableVideoError:
1512 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing carries fused original line numbers and elides
# intermediate lines (try:/return/if mobj is None:). Code kept byte-identical.
1515 class DailymotionIE(InfoExtractor):
1516 """Information Extractor for Dailymotion"""
# Group 1: video id (part before the first '_'); group 2: title slug.
1518 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1519 IE_NAME = u'dailymotion'
1521 def __init__(self, downloader=None):
1522 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
1524 def report_download_webpage(self, video_id):
1525 """Report webpage download."""
1526 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1528 def report_extraction(self, video_id):
1529 """Report information extraction."""
1530 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No setup needed (body elided in this listing).
1532 def _real_initialize(self):
1535 def _real_extract(self, url):
1536 # Extract id and simplified title from URL
1537 mobj = re.match(self._VALID_URL, url)
1539 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1542 # At this point we have a new video
1543 self._downloader.increment_downloads()
1544 video_id = mobj.group(1)
# URL slug doubles as the simplified title.
1546 simple_title = mobj.group(2).decode('utf-8')
1547 video_extension = 'flv'
1549 # Retrieve video webpage to extract further information
1550 request = urllib2.Request(url)
# Disable the family filter so age-restricted pages are served.
1551 request.add_header('Cookie', 'family_filter=off')
1553 self.report_download_webpage(video_id)
1554 webpage = urllib2.urlopen(request).read()
1555 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message is missing "to" ("unable retrieve"); runtime string.
1556 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1559 # Extract URL, uploader and title from webpage
1560 self.report_extraction(video_id)
# The player's "sequence" flashvar holds a URL-encoded JSON blob with sdURL.
1561 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1563 self._downloader.trouble(u'ERROR: unable to extract media URL')
1565 sequence = urllib.unquote(mobj.group(1))
1566 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1568 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Strip JSON backslash escapes.
1570 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1572 # if needed add http://www.dailymotion.com/ if relative URL
1574 video_url = mediaURL
1576 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1578 self._downloader.trouble(u'ERROR: unable to extract title')
1580 video_title = mobj.group(1).decode('utf-8')
1581 video_title = sanitize_title(video_title)
1583 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1585 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1587 video_uploader = mobj.group(1)
1590 # Process video information
1591 self._downloader.process_info({
1592 'id': video_id.decode('utf-8'),
1593 'url': video_url.decode('utf-8'),
1594 'uploader': video_uploader.decode('utf-8'),
1595 'upload_date': u'NA',
1596 'title': video_title,
1597 'stitle': simple_title,
1598 'ext': video_extension.decode('utf-8'),
1602 except UnavailableVideoError:
1603 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing carries fused original line numbers and elides
# intermediate lines. Code kept byte-identical; only comments added.
1606 class GoogleIE(InfoExtractor):
1607 """Information extractor for video.google.com."""
# Group 1: the (possibly negative) docid query value.
1609 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1610 IE_NAME = u'video.google'
1612 def __init__(self, downloader=None):
1613 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
1615 def report_download_webpage(self, video_id):
1616 """Report webpage download."""
1617 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1619 def report_extraction(self, video_id):
1620 """Report information extraction."""
1621 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No setup needed (body elided in this listing).
1623 def _real_initialize(self):
1626 def _real_extract(self, url):
1627 # Extract id from URL
1628 mobj = re.match(self._VALID_URL, url)
1630 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1633 # At this point we have a new video
1634 self._downloader.increment_downloads()
1635 video_id = mobj.group(1)
1637 video_extension = 'mp4'
1639 # Retrieve video webpage to extract further information
1640 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1642 self.report_download_webpage(video_id)
1643 webpage = urllib2.urlopen(request).read()
1644 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1645 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1648 # Extract URL, uploader, and title from webpage
1649 self.report_extraction(video_id)
# Prefer the direct mp4 download_url; otherwise fall back to the flv
# stream URL embedded with \x3d/\x26 escapes.
1650 mobj = re.search(r"download_url:'([^']+)'", webpage)
1652 video_extension = 'flv'
1653 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1655 self._downloader.trouble(u'ERROR: unable to extract media URL')
1657 mediaURL = urllib.unquote(mobj.group(1))
# Turn literal "\x3d"/"\x26" sequences into '=' and '&'.
1658 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1659 mediaURL = mediaURL.replace('\\x26', '\x26')
1661 video_url = mediaURL
1663 mobj = re.search(r'<title>(.*)</title>', webpage)
1665 self._downloader.trouble(u'ERROR: unable to extract title')
1667 video_title = mobj.group(1).decode('utf-8')
1668 video_title = sanitize_title(video_title)
1669 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1671 # Extract video description
1672 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1674 self._downloader.trouble(u'ERROR: unable to extract video description')
1676 video_description = mobj.group(1).decode('utf-8')
1677 if not video_description:
1678 video_description = 'No description available.'
1680 # Extract video thumbnail
# Thumbnail requires a second (search-results) page fetch, so it is only
# done when the user explicitly asked for it.
1681 if self._downloader.params.get('forcethumbnail', False):
1682 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1684 webpage = urllib2.urlopen(request).read()
1685 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1686 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1688 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1690 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1692 video_thumbnail = mobj.group(1)
1693 else: # we need something to pass to process_info
1694 video_thumbnail = ''
1697 # Process video information
1698 self._downloader.process_info({
1699 'id': video_id.decode('utf-8'),
1700 'url': video_url.decode('utf-8'),
1702 'upload_date': u'NA',
1703 'title': video_title,
1704 'stitle': simple_title,
1705 'ext': video_extension.decode('utf-8'),
1709 except UnavailableVideoError:
1710 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing carries fused original line numbers and elides
# intermediate lines. Code kept byte-identical; only comments added.
1713 class PhotobucketIE(InfoExtractor):
1714 """Information extractor for photobucket.com."""
# Group 1: the .flv filename from the ?current= query parameter.
1716 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1717 IE_NAME = u'photobucket'
1719 def __init__(self, downloader=None):
1720 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
1722 def report_download_webpage(self, video_id):
1723 """Report webpage download."""
1724 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1726 def report_extraction(self, video_id):
1727 """Report information extraction."""
1728 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No setup needed (body elided in this listing).
1730 def _real_initialize(self):
1733 def _real_extract(self, url):
1734 # Extract id from URL
1735 mobj = re.match(self._VALID_URL, url)
1737 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1740 # At this point we have a new video
1741 self._downloader.increment_downloads()
1742 video_id = mobj.group(1)
1744 video_extension = 'flv'
1746 # Retrieve video webpage to extract further information
1747 request = urllib2.Request(url)
1749 self.report_download_webpage(video_id)
1750 webpage = urllib2.urlopen(request).read()
1751 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1752 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1755 # Extract URL, uploader, and title from webpage
1756 self.report_extraction(video_id)
# The media URL is the file= argument of the video_src <link> tag.
1757 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1759 self._downloader.trouble(u'ERROR: unable to extract media URL')
1761 mediaURL = urllib.unquote(mobj.group(1))
1763 video_url = mediaURL
# <title> carries both the title (group 1) and the uploader (group 2).
1765 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1767 self._downloader.trouble(u'ERROR: unable to extract title')
1769 video_title = mobj.group(1).decode('utf-8')
1770 video_title = sanitize_title(video_title)
1771 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1773 video_uploader = mobj.group(2).decode('utf-8')
1776 # Process video information
1777 self._downloader.process_info({
1778 'id': video_id.decode('utf-8'),
1779 'url': video_url.decode('utf-8'),
1780 'uploader': video_uploader,
1781 'upload_date': u'NA',
1782 'title': video_title,
1783 'stitle': simple_title,
1784 'ext': video_extension.decode('utf-8'),
1788 except UnavailableVideoError:
1789 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing carries fused original line numbers and elides
# intermediate lines. Code kept byte-identical; only comments added.
1792 class YahooIE(InfoExtractor):
1793 """Information extractor for video.yahoo.com."""
1795 # _VALID_URL matches all Yahoo! Video URLs
1796 # _VPAGE_URL matches only the extractable '/watch/' URLs
1797 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1798 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1799 IE_NAME = u'video.yahoo'
1801 def __init__(self, downloader=None):
1802 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
1804 def report_download_webpage(self, video_id):
1805 """Report webpage download."""
1806 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1808 def report_extraction(self, video_id):
1809 """Report information extraction."""
1810 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No setup needed (body elided in this listing).
1812 def _real_initialize(self):
# new_video=False marks the recursive second pass after URL rewriting, so
# the rewritten /watch/ URL is not counted as another video.
1815 def _real_extract(self, url, new_video=True):
1816 # Extract ID from URL
1817 mobj = re.match(self._VALID_URL, url)
1819 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1822 # At this point we have a new video
1823 self._downloader.increment_downloads()
1824 video_id = mobj.group(2)
1825 video_extension = 'flv'
1827 # Rewrite valid but non-extractable URLs as
1828 # extractable English language /watch/ URLs
1829 if re.match(self._VPAGE_URL, url) is None:
1830 request = urllib2.Request(url)
1832 webpage = urllib2.urlopen(request).read()
1833 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1834 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1837 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1839 self._downloader.trouble(u'ERROR: Unable to extract id field')
1841 yahoo_id = mobj.group(1)
1843 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1845 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1847 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL.
1849 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1850 return self._real_extract(url, new_video=False)
1852 # Retrieve video webpage to extract further information
1853 request = urllib2.Request(url)
1855 self.report_download_webpage(video_id)
1856 webpage = urllib2.urlopen(request).read()
1857 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1858 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1861 # Extract uploader and title from webpage
1862 self.report_extraction(video_id)
1863 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1865 self._downloader.trouble(u'ERROR: unable to extract video title')
1867 video_title = mobj.group(1).decode('utf-8')
1868 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1870 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1872 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) of the regex above captures the literal
# "people"/"profile" path segment; the uploader name is group(2).
# Looks like a bug — confirm against live pages before fixing.
1874 video_uploader = mobj.group(1).decode('utf-8')
1876 # Extract video thumbnail
1877 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1879 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1881 video_thumbnail = mobj.group(1).decode('utf-8')
1883 # Extract video description
1884 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1886 self._downloader.trouble(u'ERROR: unable to extract video description')
1888 video_description = mobj.group(1).decode('utf-8')
1889 if not video_description:
1890 video_description = 'No description available.'
1892 # Extract video height and width
1893 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1895 self._downloader.trouble(u'ERROR: unable to extract video height')
1897 yv_video_height = mobj.group(1)
1899 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1901 self._downloader.trouble(u'ERROR: unable to extract video width')
1903 yv_video_width = mobj.group(1)
1905 # Retrieve video playlist to extract media URL
1906 # I'm not completely sure what all these options are, but we
1907 # seem to need most of them, otherwise the server sends a 401.
1908 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1909 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1910 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1911 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1912 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1914 self.report_download_webpage(video_id)
1915 webpage = urllib2.urlopen(request).read()
1916 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1917 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1920 # Extract media URL from playlist XML
1921 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1923 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1925 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1926 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1929 # Process video information
1930 self._downloader.process_info({
1931 'id': video_id.decode('utf-8'),
1933 'uploader': video_uploader,
1934 'upload_date': u'NA',
1935 'title': video_title,
1936 'stitle': simple_title,
1937 'ext': video_extension.decode('utf-8'),
# NOTE(review): duplicate 'thumbnail' key — the second one (line 1940,
# without .decode) silently wins; also video_thumbnail was already decoded
# at line 1881, so the .decode here would double-decode. Needs a code fix.
1938 'thumbnail': video_thumbnail.decode('utf-8'),
1939 'description': video_description,
1940 'thumbnail': video_thumbnail,
1943 except UnavailableVideoError:
1944 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing carries fused original line numbers and elides
# intermediate lines. Code kept byte-identical; only comments added.
1947 class VimeoIE(InfoExtractor):
1948 """Information extractor for vimeo.com."""
1950 # _VALID_URL matches Vimeo URLs
# NOTE(review): the dot in `(?:(?:www|player).)` is unescaped, so it matches
# any character after www/player — probably meant `\.`.
1951 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1954 def __init__(self, downloader=None):
1955 InfoExtractor.__init__(self, downloader)
# --- status-reporting helpers ---
1957 def report_download_webpage(self, video_id):
1958 """Report webpage download."""
1959 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1961 def report_extraction(self, video_id):
1962 """Report information extraction."""
1963 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# No setup needed (body elided in this listing).
1965 def _real_initialize(self):
1968 def _real_extract(self, url, new_video=True):
1969 # Extract ID from URL
1970 mobj = re.match(self._VALID_URL, url)
1972 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1975 # At this point we have a new video
1976 self._downloader.increment_downloads()
1977 video_id = mobj.group(1)
1979 # Retrieve video webpage to extract further information
# The moogaloop/load endpoint returns XML metadata for the clip.
1980 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1982 self.report_download_webpage(video_id)
1983 webpage = urllib2.urlopen(request).read()
1984 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1985 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1988 # Now we begin extracting as much information as we can from what we
1989 # retrieved. First we extract the information common to all extractors,
1990 # and latter we extract those that are Vimeo specific.
1991 self.report_extraction(video_id)
1994 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1996 self._downloader.trouble(u'ERROR: unable to extract video title')
1998 video_title = mobj.group(1).decode('utf-8')
1999 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2002 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2004 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2006 video_uploader = mobj.group(1).decode('utf-8')
2008 # Extract video thumbnail
2009 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2011 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2013 video_thumbnail = mobj.group(1).decode('utf-8')
2015 # # Extract video description
2016 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2018 # self._downloader.trouble(u'ERROR: unable to extract video description')
2020 # video_description = mobj.group(1).decode('utf-8')
2021 # if not video_description: video_description = 'No description available.'
# NOTE(review): placeholder left in while the block above is commented out;
# every Vimeo download reports the literal description "Foo.".
2022 video_description = 'Foo.'
2024 # Vimeo specific: extract request signature
2025 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2027 self._downloader.trouble(u'ERROR: unable to extract request signature')
2029 sig = mobj.group(1).decode('utf-8')
2031 # Vimeo specific: Extract request signature expiration
2032 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2034 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2036 sig_exp = mobj.group(1).decode('utf-8')
# Signed play URL built from clip id + signature + expiry.
2038 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2041 # Process video information
2042 self._downloader.process_info({
2043 'id': video_id.decode('utf-8'),
2045 'uploader': video_uploader,
2046 'upload_date': u'NA',
2047 'title': video_title,
2048 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' each appear twice below; the
# later entries win, and video_thumbnail was already decoded at line 2013,
# so the .decode here would double-decode. Needs a code fix.
2050 'thumbnail': video_thumbnail.decode('utf-8'),
2051 'description': video_description,
2052 'thumbnail': video_thumbnail,
2053 'description': video_description,
2056 except UnavailableVideoError:
# NOTE(review): other extractors prefix this message with u'\n'.
2057 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): this listing omits some original lines (e.g. the `if mobj is None:`
# guards and `return`/`try:` lines implied by the dangling `except` clauses below) —
# comments here describe only what is visible.
2060 class GenericIE(InfoExtractor):
2061 """Generic last-resort information extractor."""
2064 IE_NAME = u'generic'
2066 def __init__(self, downloader=None):
2067 InfoExtractor.__init__(self, downloader)
2069 def report_download_webpage(self, video_id):
2070 """Report webpage download."""
# Warn loudly: reaching this extractor means no site-specific IE matched.
2071 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2072 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2074 def report_extraction(self, video_id):
2075 """Report information extraction."""
2076 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2078 def _real_initialize(self):
2081 def _real_extract(self, url):
2082 # At this point we have a new video
2083 self._downloader.increment_downloads()
# Provisional id: last URL path component (re-derived below from the media URL).
2085 video_id = url.split('/')[-1]
2086 request = urllib2.Request(url)
2088 self.report_download_webpage(video_id)
2089 webpage = urllib2.urlopen(request).read()
2090 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2091 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2093 except ValueError, err:
2094 # since this is the last-resort InfoExtractor, if
2095 # this error is thrown, it'll be thrown here
2096 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2099 self.report_extraction(video_id)
2100 # Start with something easy: JW Player in SWFObject
2101 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2103 # Broaden the search a little bit
2104 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2106 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2109 # It's possible that one of the regexes
2110 # matched, but returned an empty group:
2111 if mobj.group(1) is None:
2112 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2115 video_url = urllib.unquote(mobj.group(1))
2116 video_id = os.path.basename(video_url)
2118 # here's a fun little line of code for you:
2119 video_extension = os.path.splitext(video_id)[1][1:]
2120 video_id = os.path.splitext(video_id)[0]
2122 # it's tempting to parse this further, but you would
2123 # have to take into account all the variations like
2124 # Video Title - Site Name
2125 # Site Name | Video Title
2126 # Video Title - Tagline | Site Name
2127 # and so on and so forth; it's just not practical
2128 mobj = re.search(r'<title>(.*)</title>', webpage)
2130 self._downloader.trouble(u'ERROR: unable to extract title')
2132 video_title = mobj.group(1).decode('utf-8')
2133 video_title = sanitize_title(video_title)
2134 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2136 # video uploader is domain name
2137 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this failure path extracts the *uploader* (domain), but the
# message still says "unable to extract title" — looks like a copy/paste slip.
2139 self._downloader.trouble(u'ERROR: unable to extract title')
2141 video_uploader = mobj.group(1).decode('utf-8')
2144 # Process video information
2145 self._downloader.process_info({
2146 'id': video_id.decode('utf-8'),
2147 'url': video_url.decode('utf-8'),
2148 'uploader': video_uploader,
2149 'upload_date': u'NA',
2150 'title': video_title,
2151 'stitle': simple_title,
2152 'ext': video_extension.decode('utf-8'),
2156 except UnavailableVideoError, err:
2157 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing has gaps (missing guards/returns between the numbered lines);
# comments describe only the visible logic.
2160 class YoutubeSearchIE(InfoExtractor):
2161 """Information Extractor for YouTube search queries."""
2162 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2163 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2164 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2165 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on results; 'all' and oversized numeric prefixes clamp to this.
2167 _max_youtube_results = 1000
2168 IE_NAME = u'youtube:search'
2170 def __init__(self, youtube_ie, downloader=None):
2171 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to the injected YoutubeIE instance.
2172 self._youtube_ie = youtube_ie
2174 def report_download_page(self, query, pagenum):
2175 """Report attempt to download playlist page with given number."""
2176 query = query.decode(preferredencoding())
2177 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2179 def _real_initialize(self):
2180 self._youtube_ie.initialize()
2182 def _real_extract(self, query):
2183 mobj = re.match(self._VALID_URL, query)
2185 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN:terms" into prefix ("", "all", or a number) and the terms.
2188 prefix, query = query.split(':')
2190 query = query.encode('utf-8')
2192 self._download_n_results(query, 1)
2194 elif prefix == 'all':
2195 self._download_n_results(query, self._max_youtube_results)
2201 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2203 elif n > self._max_youtube_results:
2204 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2205 n = self._max_youtube_results
2206 self._download_n_results(query, n)
2208 except ValueError: # parsing prefix as integer fails
2209 self._download_n_results(query, 1)
2212 def _download_n_results(self, query, n):
2213 """Downloads a specified number of results for a query"""
2216 already_seen = set()
2220 self.report_download_page(query, pagenum)
2221 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2222 request = urllib2.Request(result_url)
2224 page = urllib2.urlopen(request).read()
2225 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2226 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2229 # Extract video identifiers
2230 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# NOTE(review): fragile scraping — slices the matched href text and splits on '='
# assuming exactly 'href="/watch?v=ID"'; breaks if YouTube adds query params.
2231 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2232 if video_id not in already_seen:
2233 video_ids.append(video_id)
2234 already_seen.add(video_id)
2235 if len(video_ids) == n:
2236 # Specified n videos reached
2237 for id in video_ids:
2238 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link on the page means this was the last results page.
2241 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2242 for id in video_ids:
2243 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2246 pagenum = pagenum + 1
# Mirrors YoutubeSearchIE's structure for Google Video search ("gvsearchN:terms").
# NOTE(review): listing has gaps (missing guards/returns between numbered lines).
2249 class GoogleSearchIE(InfoExtractor):
2250 """Information Extractor for Google Video search queries."""
2251 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2252 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2253 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2254 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2256 _max_google_results = 1000
2257 IE_NAME = u'video.google:search'
2259 def __init__(self, google_ie, downloader=None):
2260 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to the injected GoogleIE instance.
2261 self._google_ie = google_ie
2263 def report_download_page(self, query, pagenum):
2264 """Report attempt to download playlist page with given number."""
2265 query = query.decode(preferredencoding())
2266 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2268 def _real_initialize(self):
2269 self._google_ie.initialize()
2271 def _real_extract(self, query):
2272 mobj = re.match(self._VALID_URL, query)
2274 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2277 prefix, query = query.split(':')
2279 query = query.encode('utf-8')
2281 self._download_n_results(query, 1)
2283 elif prefix == 'all':
2284 self._download_n_results(query, self._max_google_results)
2290 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2292 elif n > self._max_google_results:
2293 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2294 n = self._max_google_results
2295 self._download_n_results(query, n)
2297 except ValueError: # parsing prefix as integer fails
2298 self._download_n_results(query, 1)
2301 def _download_n_results(self, query, n):
2302 """Downloads a specified number of results for a query"""
2305 already_seen = set()
2309 self.report_download_page(query, pagenum)
2310 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2311 request = urllib2.Request(result_url)
2313 page = urllib2.urlopen(request).read()
2314 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2315 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2318 # Extract video identifiers
# Cleaner than YoutubeSearchIE: the docid is captured directly by group(1).
2319 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2320 video_id = mobj.group(1)
2321 if video_id not in already_seen:
2322 video_ids.append(video_id)
2323 already_seen.add(video_id)
2324 if len(video_ids) == n:
2325 # Specified n videos reached
2326 for id in video_ids:
2327 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2330 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2331 for id in video_ids:
2332 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2335 pagenum = pagenum + 1
# Third copy of the search-IE pattern, for Yahoo! Video ("yvsearchN:terms").
# NOTE(review): listing has gaps (missing guards/returns between numbered lines).
2338 class YahooSearchIE(InfoExtractor):
2339 """Information Extractor for Yahoo! Video search queries."""
2340 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2341 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2342 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2343 _MORE_PAGES_INDICATOR = r'\s*Next'
2345 _max_yahoo_results = 1000
2346 IE_NAME = u'video.yahoo:search'
2348 def __init__(self, yahoo_ie, downloader=None):
2349 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to the injected YahooIE instance.
2350 self._yahoo_ie = yahoo_ie
2352 def report_download_page(self, query, pagenum):
2353 """Report attempt to download playlist page with given number."""
2354 query = query.decode(preferredencoding())
2355 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2357 def _real_initialize(self):
2358 self._yahoo_ie.initialize()
2360 def _real_extract(self, query):
2361 mobj = re.match(self._VALID_URL, query)
2363 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2366 prefix, query = query.split(':')
2368 query = query.encode('utf-8')
2370 self._download_n_results(query, 1)
2372 elif prefix == 'all':
2373 self._download_n_results(query, self._max_yahoo_results)
2379 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2381 elif n > self._max_yahoo_results:
2382 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2383 n = self._max_yahoo_results
2384 self._download_n_results(query, n)
2386 except ValueError: # parsing prefix as integer fails
2387 self._download_n_results(query, 1)
2390 def _download_n_results(self, query, n):
2391 """Downloads a specified number of results for a query"""
2394 already_seen = set()
2398 self.report_download_page(query, pagenum)
# NOTE(review): template's second slot is 'o=%s' (an offset?) but receives the
# raw page number here — verify against Yahoo's paging parameter semantics.
2399 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2400 request = urllib2.Request(result_url)
2402 page = urllib2.urlopen(request).read()
2403 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2404 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2407 # Extract video identifiers
2408 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2409 video_id = mobj.group(1)
2410 if video_id not in already_seen:
2411 video_ids.append(video_id)
2412 already_seen.add(video_id)
2413 if len(video_ids) == n:
2414 # Specified n videos reached
2415 for id in video_ids:
2416 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2419 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2420 for id in video_ids:
2421 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2424 pagenum = pagenum + 1
# NOTE(review): listing has gaps (missing guards/returns between numbered lines).
2427 class YoutubePlaylistIE(InfoExtractor):
2428 """Information Extractor for YouTube playlists."""
# NOTE(review): 'youtube.com' has an unescaped dot here (matches any char) —
# harmless in practice but inconsistent with escaped patterns elsewhere.
2430 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2431 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2432 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2433 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2435 IE_NAME = u'youtube:playlist'
2437 def __init__(self, youtube_ie, downloader=None):
2438 InfoExtractor.__init__(self, downloader)
2439 self._youtube_ie = youtube_ie
2441 def report_download_page(self, playlist_id, pagenum):
2442 """Report attempt to download playlist page with given number."""
2443 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2445 def _real_initialize(self):
2446 self._youtube_ie.initialize()
2448 def _real_extract(self, url):
2449 # Extract playlist id
2450 mobj = re.match(self._VALID_URL, url)
2452 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video-in-playlist URL: hand the video id straight to YoutubeIE.
2456 if mobj.group(3) is not None:
2457 self._youtube_ie.extract(mobj.group(3))
2460 # Download playlist pages
2461 # prefix is 'p' as default for playlists but there are other types that need extra care
2462 playlist_prefix = mobj.group(1)
2463 if playlist_prefix == 'a':
2464 playlist_access = 'artist'
2466 playlist_prefix = 'p'
2467 playlist_access = 'view_play_list'
2468 playlist_id = mobj.group(2)
2473 self.report_download_page(playlist_id, pagenum)
2474 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2476 page = urllib2.urlopen(request).read()
2477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2481 # Extract video identifiers
2483 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2484 if mobj.group(1) not in ids_in_page:
2485 ids_in_page.append(mobj.group(1))
2486 video_ids.extend(ids_in_page)
2488 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2490 pagenum = pagenum + 1
2492 playliststart = self._downloader.params.get('playliststart', 1) - 1
2493 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): unlike YoutubeUserIE below, playlistend == -1 is not special-cased
# here, so the default slice [start:-1] silently drops the final playlist entry —
# verify intended behavior.
2494 video_ids = video_ids[playliststart:playlistend]
2496 for id in video_ids:
2497 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): listing has gaps (missing guards/returns between numbered lines).
2501 class YoutubeUserIE(InfoExtractor):
2502 """Information Extractor for YouTube users."""
# NOTE(review): unescaped dot in 'youtube.com' here too.
2504 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2505 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData API caps results per request; pages are fetched until a short page.
2506 _GDATA_PAGE_SIZE = 50
2507 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2508 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2510 IE_NAME = u'youtube:user'
2512 def __init__(self, youtube_ie, downloader=None):
2513 InfoExtractor.__init__(self, downloader)
2514 self._youtube_ie = youtube_ie
2516 def report_download_page(self, username, start_index):
2517 """Report attempt to download user page."""
2518 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2519 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2521 def _real_initialize(self):
2522 self._youtube_ie.initialize()
2524 def _real_extract(self, url):
2526 mobj = re.match(self._VALID_URL, url)
2528 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2531 username = mobj.group(1)
2533 # Download video ids using YouTube Data API. Result size per
2534 # query is limited (currently to 50 videos) so we need to query
2535 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2542 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2543 self.report_download_page(username, start_index)
2545 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2548 page = urllib2.urlopen(request).read()
2549 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2550 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2553 # Extract video identifiers
2556 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2557 if mobj.group(1) not in ids_in_page:
2558 ids_in_page.append(mobj.group(1))
2560 video_ids.extend(ids_in_page)
2562 # A little optimization - if current page is not
2563 # "full", ie. does not contain PAGE_SIZE video ids then
2564 # we can assume that this page is the last one - there
2565 # are no more ids on further pages - no need to query
2568 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2573 all_ids_count = len(video_ids)
2574 playliststart = self._downloader.params.get('playliststart', 1) - 1
2575 playlistend = self._downloader.params.get('playlistend', -1)
# -1 sentinel means "to the end" and is special-cased so the last id is kept.
2577 if playlistend == -1:
2578 video_ids = video_ids[playliststart:]
2580 video_ids = video_ids[playliststart:playlistend]
# NOTE(review): plain str literal here, unlike the u'...' used everywhere else.
2582 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2583 (username, all_ids_count, len(video_ids)))
2585 for video_id in video_ids:
2586 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# NOTE(review): listing has gaps (missing guards/returns between numbered lines).
2589 class DepositFilesIE(InfoExtractor):
2590 """Information extractor for depositfiles.com"""
# NOTE(review): unescaped dots in 'depositfiles.com'; the (?#locale) comment group
# documents that '../' skips a two-letter locale path segment.
2592 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2593 IE_NAME = u'DepositFiles'
2595 def __init__(self, downloader=None):
2596 InfoExtractor.__init__(self, downloader)
2598 def report_download_webpage(self, file_id):
2599 """Report webpage download."""
2600 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2602 def report_extraction(self, file_id):
2603 """Report information extraction."""
2604 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2606 def _real_initialize(self):
2609 def _real_extract(self, url):
2610 # At this point we have a new file
2611 self._downloader.increment_downloads()
2613 file_id = url.split('/')[-1]
2614 # Rebuild url in english locale
2615 url = 'http://depositfiles.com/en/files/' + file_id
2617 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
2618 free_download_indication = { 'gateway_result' : '1' }
2619 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2621 self.report_download_webpage(file_id)
2622 webpage = urllib2.urlopen(request).read()
2623 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2624 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2627 # Search for the real file URL
2628 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2629 if (mobj is None) or (mobj.group(1) is None):
2630 # Try to figure out reason of the error.
2631 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2632 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site's "Attention..." message for a one-line error.
# NOTE(review): non-raw '\s+' pattern — works, but r'\s+' is the convention.
2633 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2634 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2636 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2639 file_url = mobj.group(1)
2640 file_extension = os.path.splitext(file_url)[1][1:]
2642 # Search for file title
2643 mobj = re.search(r'<b title="(.*?)">', webpage)
2645 self._downloader.trouble(u'ERROR: unable to extract title')
2647 file_title = mobj.group(1).decode('utf-8')
2650 # Process file information
2651 self._downloader.process_info({
2652 'id': file_id.decode('utf-8'),
2653 'url': file_url.decode('utf-8'),
2655 'upload_date': u'NA',
2656 'title': file_title,
# Raw title reused as 'stitle' — no simple_title_chars sanitization here,
# unlike the video extractors.
2657 'stitle': file_title,
2658 'ext': file_extension.decode('utf-8'),
2662 except UnavailableVideoError, err:
2663 self._downloader.trouble(u'ERROR: unable to download file')
# NOTE(review): listing has gaps (e.g. the login-form construction around line 2758
# and parts of _real_initialize are missing); comments describe only visible code.
2666 class FacebookIE(InfoExtractor):
2667 """Information Extractor for Facebook"""
# NOTE(review): unescaped dots in 'facebook.com' and 'video.php'.
2669 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2670 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2671 _NETRC_MACHINE = 'facebook'
# Ordered best-first; format selection below indexes into this list.
2672 _available_formats = ['highqual', 'lowqual']
2673 _video_extensions = {
2677 IE_NAME = u'facebook'
2679 def __init__(self, downloader=None):
2680 InfoExtractor.__init__(self, downloader)
2682 def _reporter(self, message):
2683 """Add header and report message."""
2684 self._downloader.to_screen(u'[facebook] %s' % message)
2686 def report_login(self):
2687 """Report attempt to log in."""
2688 self._reporter(u'Logging in')
2690 def report_video_webpage_download(self, video_id):
2691 """Report attempt to download video webpage."""
2692 self._reporter(u'%s: Downloading video webpage' % video_id)
2694 def report_information_extraction(self, video_id):
2695 """Report attempt to extract video information."""
2696 self._reporter(u'%s: Extracting video information' % video_id)
2698 def _parse_page(self, video_webpage):
2699 """Extract video information from page"""
# Field-name -> scraping regex; missing fields are simply absent from the result.
2701 data = {'title': r'class="video_title datawrap">(.*?)</',
2702 'description': r'<div class="datawrap">(.*?)</div>',
2703 'owner': r'\("video_owner_name", "(.*?)"\)',
2704 'upload_date': r'data-date="(.*?)"',
2705 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2708 for piece in data.keys():
2709 mobj = re.search(data[piece], video_webpage)
2710 if mobj is not None:
# Values are JS-escaped inside the page; undo unicode-escape then URL-quoting.
2711 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2715 for fmt in self._available_formats:
2716 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2717 if mobj is not None:
2718 # URL is in a Javascript segment inside an escaped Unicode format within
2719 # the generally utf-8 page
2720 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2721 video_info['video_urls'] = video_urls
2725 def _real_initialize(self):
2726 if self._downloader is None:
2731 downloader_params = self._downloader.params
2733 # Attempt to use provided username and password or .netrc data
2734 if downloader_params.get('username', None) is not None:
2735 useremail = downloader_params['username']
2736 password = downloader_params['password']
2737 elif downloader_params.get('usenetrc', False):
2739 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2740 if info is not None:
2744 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2745 except (IOError, netrc.NetrcParseError), err:
# .netrc problems are non-fatal: warn and continue without credentials.
2746 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2749 if useremail is None:
2758 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2761 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2762 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2763 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2765 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2766 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2769 def _real_extract(self, url):
2770 mobj = re.match(self._VALID_URL, url)
2772 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2774 video_id = mobj.group('ID')
2777 self.report_video_webpage_download(video_id)
2778 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2780 page = urllib2.urlopen(request)
2781 video_webpage = page.read()
2782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2783 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2786 # Start extracting information
2787 self.report_information_extraction(video_id)
2789 # Extract information
2790 video_info = self._parse_page(video_webpage)
2793 if 'owner' not in video_info:
2794 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2796 video_uploader = video_info['owner']
2799 if 'title' not in video_info:
2800 self._downloader.trouble(u'ERROR: unable to extract video title')
2802 video_title = video_info['title']
2803 video_title = video_title.decode('utf-8')
2804 video_title = sanitize_title(video_title)
2807 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2808 simple_title = simple_title.strip(ur'_')
# Missing thumbnail is only a warning; extraction continues with ''.
2811 if 'thumbnail' not in video_info:
2812 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2813 video_thumbnail = ''
2815 video_thumbnail = video_info['thumbnail']
2819 if 'upload_date' in video_info:
2820 upload_time = video_info['upload_date']
# Parse RFC-2822-style date; parsedate_tz returns a 10-tuple, first 9 elements
# form a time-tuple for strftime.
2821 timetuple = email.utils.parsedate_tz(upload_time)
2822 if timetuple is not None:
2824 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2829 video_description = video_info.get('description', 'No description available.')
2831 url_map = video_info['video_urls']
2832 if len(url_map.keys()) > 0:
2833 # Decide which formats to download
2834 req_format = self._downloader.params.get('format', None)
2835 format_limit = self._downloader.params.get('format_limit', None)
# format_limit trims the candidate list to formats at or below the limit.
2837 if format_limit is not None and format_limit in self._available_formats:
2838 format_list = self._available_formats[self._available_formats.index(format_limit):]
2840 format_list = self._available_formats
2841 existing_formats = [x for x in format_list if x in url_map]
2842 if len(existing_formats) == 0:
2843 self._downloader.trouble(u'ERROR: no known formats available for video')
2845 if req_format is None:
2846 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2847 elif req_format == 'worst':
2848 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2849 elif req_format == '-1':
2850 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2853 if req_format not in url_map:
2854 self._downloader.trouble(u'ERROR: requested format not available')
2856 video_url_list = [(req_format, url_map[req_format])] # Specific format
2858 for format_param, video_real_url in video_url_list:
2860 # At this point we have a new video
2861 self._downloader.increment_downloads()
2864 video_extension = self._video_extensions.get(format_param, 'mp4')
2867 # Process video information
2868 self._downloader.process_info({
2869 'id': video_id.decode('utf-8'),
2870 'url': video_real_url.decode('utf-8'),
2871 'uploader': video_uploader.decode('utf-8'),
2872 'upload_date': upload_date,
2873 'title': video_title,
2874 'stitle': simple_title,
2875 'ext': video_extension.decode('utf-8'),
2876 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2877 'thumbnail': video_thumbnail.decode('utf-8'),
2878 'description': video_description.decode('utf-8'),
2881 except UnavailableVideoError, err:
2882 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing has gaps (missing guards/returns between numbered lines).
2884 class BlipTVIE(InfoExtractor):
2885 """Information extractor for blip.tv"""
2887 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Extracts the filename extension from the media URL's final dot-suffix.
2888 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2889 IE_NAME = u'blip.tv'
2891 def report_extraction(self, file_id):
2892 """Report information extraction."""
2893 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2895 def _simplify_title(self, title):
2896 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2897 res = res.strip(ur'_')
2900 def _real_extract(self, url):
2901 mobj = re.match(self._VALID_URL, url)
2903 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv's JSON skin for machine-readable metadata instead of scraping HTML.
2910 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2911 request = urllib2.Request(json_url)
2912 self.report_extraction(mobj.group(1))
2914 json_code = urllib2.urlopen(request).read()
2915 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2916 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2919 json_data = json.loads(json_code)
2920 if 'Post' in json_data:
2921 data = json_data['Post']
# NOTE(review): '%H:%M%p' mixes 24-hour %H with AM/PM %p (normally paired with
# %I) — verify against blip.tv's actual datestamp format.
2925 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2926 video_url = data['media']['url']
2927 umobj = re.match(self._URL_EXT, video_url)
2929 raise ValueError('Can not determine filename extension')
2930 ext = umobj.group(1)
2932 self._downloader.increment_downloads()
2935 'id': data['item_id'],
2937 'uploader': data['display_name'],
2938 'upload_date': upload_date,
2939 'title': data['title'],
2940 'stitle': self._simplify_title(data['title']),
2942 'format': data['media']['mimeType'],
2943 'thumbnail': data['thumbnailUrl'],
2944 'description': data['description'],
2945 'player_url': data['embedUrl']
# KeyError from missing JSON fields is folded into the same parse error.
2947 except (ValueError,KeyError), err:
2948 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2952 self._downloader.process_info(info)
2953 except UnavailableVideoError, err:
2954 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): listing has gaps (missing guards/returns between numbered lines).
2957 class MyVideoIE(InfoExtractor):
2958 """Information Extractor for myvideo.de."""
2960 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2961 IE_NAME = u'myvideo'
2963 def __init__(self, downloader=None):
2964 InfoExtractor.__init__(self, downloader)
2966 def report_download_webpage(self, video_id):
2967 """Report webpage download."""
2968 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2970 def report_extraction(self, video_id):
2971 """Report information extraction."""
2972 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2974 def _real_initialize(self):
2977 def _real_extract(self,url):
2978 mobj = re.match(self._VALID_URL, url)
# BUG(review): `self._download` is a typo for `self._downloader` — this line
# would raise AttributeError if the URL ever fails to match.
2980 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2983 video_id = mobj.group(1)
# The URL slug (group 2) doubles as the pre-simplified title.
2984 simple_title = mobj.group(2).decode('utf-8')
2985 # should actually not be necessary
2986 simple_title = sanitize_title(simple_title)
2987 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
2990 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2992 self.report_download_webpage(video_id)
2993 webpage = urllib2.urlopen(request).read()
2994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2995 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2998 self.report_extraction(video_id)
# The thumbnail link embeds the media base URL; the .flv is derived from it below.
2999 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3002 self._downloader.trouble(u'ERROR: unable to extract media URL')
3004 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3006 mobj = re.search('<title>([^<]+)</title>', webpage)
3008 self._downloader.trouble(u'ERROR: unable to extract title')
3011 video_title = mobj.group(1)
3012 video_title = sanitize_title(video_title)
3016 self._downloader.process_info({
3020 'upload_date': u'NA',
3021 'title': video_title,
3022 'stitle': simple_title,
3027 except UnavailableVideoError:
3028 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Matches either a ":tds"/":colbert"-style shortcut or a full-episodes
    # page URL on thedailyshow.com / colbertnation.com.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        # Progress message: starting metadata extraction for this episode.
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        # Progress message: fetching the per-media configuration XML.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        # Progress message: fetching the show's MRSS index feed.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        # Progress message: resolving the Flash player URL (needed for rtmpdump).
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _simplify_title(self, title):
        # Collapse every run of characters outside the "simple" set into '_',
        # then trim leading/trailing underscores.
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')
        # NOTE(review): the `return res` line is elided from this excerpt.

    def _real_extract(self, url):
        # Validate and normalize the URL; bail out on no match.
        mobj = re.match(self._VALID_URL, url)
        # (guard `if mobj is None:` elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Expand ":tds"-style shortcuts into a real full-episodes URL and
        # re-match so the named groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = 'http://www.thedailyshow.com/full-episodes/'
            # (else branch elided in this excerpt)
                url = 'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode in the URL means "download the newest one".
        dlNewest = not mobj.group('episode')
        # (if/else on dlNewest elided in this excerpt)
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        # (enclosing `try:` elided in this excerpt)
            htmlHandle = urllib2.urlopen(req)
            html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

        # When downloading the newest episode we followed a redirect; the
        # final URL must now carry a concrete episode component.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            # (guard `if mobj is None:` elided in this excerpt)
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # The page embeds the Flash player via a <param name="movie"> whose
        # value both names the player SWF and carries the mediaGen URI.
        mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        # Resolve the player URL through its redirects (rtmpdump needs the
        # final player location as --swfUrl).
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # (enclosing `try:` elided in this excerpt)
            urlHandle = urllib2.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        # Fetch the MRSS index listing every media item of this episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # (enclosing `try:` elided in this excerpt)
            indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like "...:<show>.com:...:<shortid>"; derive ids from it.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-item configuration XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            # (enclosing `try:` elided in this excerpt)
                configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, url) pairs for every rendition offered.
            # NOTE(review): the `turls = []` initializer is elided from this excerpt.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # (turls.append(finfo) elided in this excerpt)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + '-' + epTitle
            # (info dict opener and several keys elided in this excerpt)
                'upload_date': officialDate,
                'title': effTitle,
                'stitle': self._simplify_title(effTitle),
                'description': officialTitle,
                'player_url': playerUrl
            # (enclosing `try:` elided in this excerpt)
                self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # Matches escapistmagazine.com "videos/view/<show>/<episode>" pages.
    _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        # Progress message: starting metadata extraction for this show.
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        # Progress message: fetching the player configuration JSON.
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _simplify_title(self, title):
        # Collapse every run of characters outside the "simple" set into '_',
        # then trim leading/trailing underscores.
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')
        # NOTE(review): the `return res` line is elided from this excerpt.

    def _real_extract(self, url):
        # Used to decode HTML entities found in the page's meta tags.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        # (guard `if mobj is None:` elided in this excerpt)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # (enclosing `try:` elided in this excerpt)
            webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Pull description, thumbnail and player URL from the page's
        # <meta> tags (OpenGraph properties), unescaping HTML entities.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = htmlParser.unescape(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = htmlParser.unescape(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
        # The player URL carries the (percent-encoded) config URL in its
        # "config=" query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # (enclosing `try:` elided in this excerpt)
            configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        # Single-quoted strings are legal JS but not JSON, so normalize them.
        configJSON = configJSON.replace("'", '"')
        # (enclosing `try:` elided in this excerpt)
            config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        # The second playlist entry holds the actual video URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        self._downloader.increment_downloads()
        # (info dict opener and several keys elided in this excerpt)
            'uploader': showName,
            'upload_date': None,
            'stitle': self._simplify_title(showName),
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        # (enclosing `try:` elided in this excerpt)
            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class PostProcessor(object):
    """Base class for post-processing steps.

    A PostProcessor is attached to a downloader through its
    add_post_processor() method ("mutual registration", like
    InfoExtractor objects). After every successful download the
    downloader runs its chain of processors: each one's run() receives
    the information dict and its return value is fed to the next
    processor. Returning None stops the chain.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this processor belongs to."""
        self._downloader = downloader

    def run(self, information):
        """Process one finished download.

        ``information`` is an InfoExtractor-style dictionary with an
        extra 'filepath' key pointing at the downloaded file. Return a
        (possibly modified) dictionary to pass down the chain, or None
        to stop post-processing. Implementations may raise
        PostProcessingError to signal failure to the downloader.
        """
        # Base implementation is a no-op: hand the data through untouched.
        return information
class FFmpegExtractAudioPP(PostProcessor):
    # Post-processor that extracts the audio track of a downloaded video
    # into a standalone audio file using ffmpeg/ffprobe.

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        # 'best' means: keep the source codec losslessly when possible.
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        # ffmpeg bitrate spec (e.g. '128K'); may be None.
        self._preferredquality = preferredquality
        # Whether to keep the original video file after extraction.
        self._keepvideo = keepvideo

    # NOTE(review): a @staticmethod decorator appears to be elided here —
    # the method takes no `self`.
    def get_audio_codec(path):
        # Probe the file with ffprobe and return the name of its audio
        # codec, or None if probing fails.
        # (enclosing `try:` elided in this excerpt)
            cmd = ['ffprobe', '-show_streams', '--', path]
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                # (failure `return` elided in this excerpt)
        except (IOError, OSError):
            # (failure `return` elided in this excerpt)
        # Scan ffprobe's stream dump: remember the last codec_name seen and
        # report it once the matching codec_type=audio line appears.
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                # (`return audio_codec` elided in this excerpt)

    # NOTE(review): a @staticmethod decorator appears to be elided here too.
    def run_ffmpeg(path, out_path, codec, more_opts):
        # Invoke ffmpeg to transcode/copy the audio stream of `path` into
        # `out_path` with the given codec and extra options.
        # (enclosing `try:` elided in this excerpt)
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
            # (success/failure `return` elided in this excerpt)
        except (IOError, OSError):
            # (failure `return` elided in this excerpt)

    def run(self, information):
        # Main entry point: probe the downloaded file, pick codec/options,
        # run ffmpeg, then fix up timestamps and clean up the source file.
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            # (early `return` elided in this excerpt)

        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible
                # (acodec assignment elided in this excerpt)
                extension = filecodec
                if filecodec == 'aac':
                    # Raw AAC needs an ADTS container to be playable standalone.
                    more_opts = ['-f', 'adts']
            # (else branch header elided in this excerpt)
                acodec = 'libmp3lame'
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        # (else branch header elided in this excerpt)
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        # Output file: same basename, new audio extension.
        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)
        # (guard on `status` elided in this excerpt)
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            # (enclosing `try:` elided in this excerpt)
                os.utime(new_path, (time.time(), information['filetime']))
            # (matching `except` elided in this excerpt)
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            # (try/os.remove elided in this excerpt)
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')

        # Point later post-processors at the extracted audio file.
        information['filepath'] = new_path
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository '''
    # Note: downloader only used for options
    # Refuse to proceed if we cannot rewrite our own file in place.
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen('Updating to latest version...')

    # Fetch the newest script from UPDATE_URL.
    # (enclosing `try:` elided in this excerpt)
        urlh = urllib.urlopen(UPDATE_URL)
        newcontent = urlh.read()
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to download latest version')

    # Overwrite the running script with the downloaded content.
    # (enclosing `try:` elided in this excerpt)
        outf = open(filename, 'wb')
        outf.write(newcontent)
    except (IOError, OSError), err:
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
def _format_option_string(option):
    ''' ('-o', '--option') -> -o, --format METAVAR'''
    # Build the help-column text for one optparse option: short form,
    # long form, and metavar when the option takes a value.
    # NOTE(review): the `opts = []` initializer is elided from this excerpt.
    if option._short_opts: opts.append(option._short_opts[0])
    if option._long_opts: opts.append(option._long_opts[0])
    # Separate short and long forms when both are present.
    if len(opts) > 1: opts.insert(1, ', ')
    if option.takes_value(): opts.append(' %s' % option.metavar)
    return "".join(opts)
def _find_term_columns():
    # Determine the terminal width: prefer the COLUMNS environment
    # variable, otherwise ask `stty size`.
    columns = os.environ.get('COLUMNS', None)
    # (COLUMNS fast path and enclosing `try:` elided in this excerpt)
        sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out,err = sp.communicate()
        # `stty size` prints "rows cols"; the second field is the width.
        return int(out.split()[1])
# ---- parseOpts body (the enclosing `def parseOpts():` is elided) ----
# Build the optparse parser: formatter, option groups, all options.
max_help_position = 80

# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns

fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
fmt.format_option_strings = _format_option_string

# OptionParser keyword arguments (dict opener elided in this excerpt).
    'version' : __version__,
    'usage' : '%prog [options] url [url...]',
    'conflict_handler' : 'resolve',
parser = optparse.OptionParser(**kw)

# Option groups, one per functional area.
general = optparse.OptionGroup(parser, 'General Options')
selection = optparse.OptionGroup(parser, 'Video Selection')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

general.add_option('-h', '--help',
        action='help', help='print this help text and exit')
general.add_option('-v', '--version',
        action='version', help='print program version and exit')
general.add_option('-U', '--update',
        action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--dump-user-agent',
        action='store_true', dest='dump_user_agent',
        help='display the current browser identification', default=False)
general.add_option('--list-extractors',
        action='store_true', dest='list_extractors',
        help='List all supported extractors and the URLs they would handle', default=False)

selection.add_option('--playlist-start',
        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
selection.add_option('--playlist-end',
        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')

authentication.add_option('-u', '--username',
        dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
        dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

video_format.add_option('-f', '--format',
        action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
        action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--max-quality',
        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')

verbosity.add_option('-q', '--quiet',
        action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
        action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
        action='store_true', dest='skip_download', help='do not download the video', default=False)
verbosity.add_option('-g', '--get-url',
        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
        action='store_true', dest='getthumbnail',
        help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
        action='store_true', dest='getdescription',
        help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
        action='store_true', dest='getfilename',
        help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--get-format',
        action='store_true', dest='getformat',
        help='simulate, quiet but print output format', default=False)
verbosity.add_option('--no-progress',
        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
        action='store_true', dest='consoletitle',
        help='display progress in console titlebar', default=False)

filesystem.add_option('-t', '--title',
        action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
        action='store_true', dest='autonumber',
        help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
        dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
filesystem.add_option('--cookies',
        dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
filesystem.add_option('--no-part',
        action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
        action='store_false', dest='updatetime',
        help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
        action='store_true', dest='writedescription',
        help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
        action='store_true', dest='writeinfojson',
        help='write video metadata to a .info.json file', default=False)

postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
        help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
        help='"best", "aac" or "mp3"; best by default')
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
        help='ffmpeg audio bitrate specification, 128k by default')
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
        help='keeps the video file on disk after the post-processing; the video is erased by default')

# Register the groups on the parser and run it.
parser.add_option_group(general)
parser.add_option_group(selection)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)

opts, args = parser.parse_args()

return parser, opts, args
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # These extractors are shared: search/playlist extractors delegate the
    # per-video work to the corresponding base extractor instance.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()
    # (the `return [` opener and several list entries are elided in this excerpt)
        MetacafeIE(youtube_ie),
        YoutubePlaylistIE(youtube_ie),
        YoutubeUserIE(youtube_ie),
        YoutubeSearchIE(youtube_ie),
        GoogleSearchIE(google_ie),
        YahooSearchIE(yahoo_ie),
# ---- main() body (the enclosing `def main():` is elided) ----
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
    jar = cookielib.CookieJar()
# (else branch and enclosing `try:` elided in this excerpt)
    jar = cookielib.MozillaCookieJar(opts.cookiefile)
    # Load existing cookies only when the file is present and readable.
    if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
except (IOError, OSError), err:
    sys.exit(u'ERROR: unable to open cookie file')

# --dump-user-agent: print the UA string we send and stop.
if opts.dump_user_agent:
    print std_headers['User-Agent']

# Batch file verification
if opts.batchfile is not None:
    # (enclosing `try:` elided in this excerpt)
        if opts.batchfile == '-':
            # ('-' means read URLs from stdin; else branch elided)
            batchfd = open(opts.batchfile, 'r')
        batchurls = batchfd.readlines()
        batchurls = [x.strip() for x in batchurls]
        # Skip blank lines and comment lines starting with '#', '/' or ';'.
        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
    # (matching `except` elided in this excerpt)
        sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

extractors = gen_extractors()

# --list-extractors: show each extractor and which given URLs it handles.
if opts.list_extractors:
    for ie in extractors:
        matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
        all_urls = filter(lambda url: url not in matchedUrls, all_urls)
        for mu in matchedUrls:

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
    parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
    parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
    parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
    parser.error(u'using title conflicts with using literal title')
# Prompt for the password when only a username was given.
if opts.username is not None and opts.password is None:
    opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
    numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
    if numeric_limit is None:
        parser.error(u'invalid rate limit specified')
    opts.ratelimit = numeric_limit
if opts.retries is not None:
    # (enclosing `try:` elided in this excerpt)
        opts.retries = long(opts.retries)
    except (TypeError, ValueError), err:
        parser.error(u'invalid retry count specified')
# (enclosing `try:` elided in this excerpt)
    opts.playliststart = int(opts.playliststart)
    if opts.playliststart <= 0:
        raise ValueError(u'Playlist start must be positive')
except (TypeError, ValueError), err:
    parser.error(u'invalid playlist start number specified')
# (enclosing `try:` elided in this excerpt)
    opts.playlistend = int(opts.playlistend)
    # -1 is the sentinel for "until the end of the playlist".
    if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
        raise ValueError(u'Playlist end must be greater than playlist start')
except (TypeError, ValueError), err:
    parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
    if opts.audioformat not in ['best', 'aac', 'mp3']:
        parser.error(u'invalid audio format specified')

# Build the FileDownloader from the parsed options. Any of the --get-*
# flags implies quiet simulation (print the value, skip the download).
fd = FileDownloader({
    'usenetrc': opts.usenetrc,
    'username': opts.username,
    'password': opts.password,
    'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
    'forceurl': opts.geturl,
    'forcetitle': opts.gettitle,
    'forcethumbnail': opts.getthumbnail,
    'forcedescription': opts.getdescription,
    'forcefilename': opts.getfilename,
    'forceformat': opts.getformat,
    'simulate': opts.simulate,
    'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
    'format': opts.format,
    'format_limit': opts.format_limit,
    # Pick the output template: an explicit -o wins; otherwise derive one
    # from the format/title/autonumber flags, falling back to '%(id)s.%(ext)s'.
    'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
        or u'%(id)s.%(ext)s'),
    'ignoreerrors': opts.ignoreerrors,
    'ratelimit': opts.ratelimit,
    'nooverwrites': opts.nooverwrites,
    'retries': opts.retries,
    'continuedl': opts.continue_dl,
    'noprogress': opts.noprogress,
    'playliststart': opts.playliststart,
    'playlistend': opts.playlistend,
    # Writing the video to stdout means log messages must go to stderr.
    'logtostderr': opts.outtmpl == '-',
    'consoletitle': opts.consoletitle,
    'nopart': opts.nopart,
    'updatetime': opts.updatetime,
    'writedescription': opts.writedescription,
    'writeinfojson': opts.writeinfojson,
    'matchtitle': opts.matchtitle,
    'rejecttitle': opts.rejecttitle,
for extractor in extractors:
    fd.add_info_extractor(extractor)

# Register post-processors.
if opts.extractaudio:
    fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

# Self-update when requested.
if opts.update_self:
    updateSelf(fd, sys.argv[0])

# Without URLs there is nothing to do (unless we only self-updated).
if len(all_urls) < 1:
    if not opts.update_self:
        parser.error(u'you must provide at least one URL')

retcode = fd.download(all_urls)

# Dump cookie jar if requested
if opts.cookiefile is not None:
    # (try/jar.save() elided in this excerpt)
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to save cookie jar')
if __name__ == '__main__':
    # Script entry point: run main() and translate known exceptions into
    # exit codes / messages. (try/main() call elided in this excerpt.)
    except DownloadError:
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')

# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: