2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
19 __license__ = 'Public Domain'
20 __version__ = '2011.10.19'
22 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
51 except ImportError: # Python 2.4
54 import cStringIO as StringIO
58 # parse_qs was moved from the cgi module to the urlparse module recently.
60 from urlparse import parse_qs
62 from cgi import parse_qs
70 import xml.etree.ElementTree
71 except ImportError: # Python<2.5: Not officially supported, but let it slip
72 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
75 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
76 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
77 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
78 'Accept-Encoding': 'gzip, deflate',
79 'Accept-Language': 'en-us,en;q=0.5',
82 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
86 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
92 def raiseError(msg, i):
93 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
94 def skipSpace(i, expectMore=True):
95 while i < len(s) and s[i] in ' \t\r\n':
99 raiseError('Premature end', i)
101 def decodeEscape(match):
117 return unichr(int(esc[1:5], 16))
118 if len(esc) == 5+6 and esc[5:7] == '\\u':
119 hi = int(esc[1:5], 16)
120 low = int(esc[7:11], 16)
121 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
122 raise ValueError('Unknown escape ' + str(esc))
129 while s[e-bslashes-1] == '\\':
131 if bslashes % 2 == 1:
135 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
136 stri = rexp.sub(decodeEscape, s[i:e])
142 if s[i] == '}': # Empty dictionary
146 raiseError('Expected a string object key', i)
147 i,key = parseString(i)
149 if i >= len(s) or s[i] != ':':
150 raiseError('Expected a colon', i)
157 raiseError('Expected comma or closing curly brace', i)
162 if s[i] == ']': # Empty array
167 i = skipSpace(i) # Raise exception if premature end
171 raiseError('Expected a comma or closing bracket', i)
173 def parseDiscrete(i):
174 for k,v in {'true': True, 'false': False, 'null': None}.items():
175 if s.startswith(k, i):
177 raiseError('Not a boolean (or null)', i)
179 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
181 raiseError('Not a number', i)
183 if '.' in nums or 'e' in nums or 'E' in nums:
184 return (i+len(nums), float(nums))
185 return (i+len(nums), int(nums))
186 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
189 i,res = CHARMAP.get(s[i], parseNumber)(i)
190 i = skipSpace(i, False)
194 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
197 def preferredencoding():
198 """Get preferred encoding.
200 Returns the best encoding scheme for the system, based on
201 locale.getpreferredencoding() and some further tweaks.
203 def yield_preferredencoding():
205 pref = locale.getpreferredencoding()
211 return yield_preferredencoding().next()
214 def htmlentity_transform(matchobj):
215 """Transforms an HTML entity to a Unicode character.
217 This function receives a match object and is intended to be used with
218 the re.sub() function.
220 entity = matchobj.group(1)
222 # Known non-numeric HTML entity
223 if entity in htmlentitydefs.name2codepoint:
224 return unichr(htmlentitydefs.name2codepoint[entity])
227 mobj = re.match(ur'(?u)#(x?\d+)', entity)
229 numstr = mobj.group(1)
230 if numstr.startswith(u'x'):
232 numstr = u'0%s' % numstr
235 return unichr(long(numstr, base))
237 # Unknown entity in name, return its literal representation
238 return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Return *utitle* with HTML entities decoded and os.sep masked as '%'.

    The decoded title is safe to splice into an output filename template.
    """
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    return decoded.replace(unicode(os.sep), u'%')
247 def sanitize_open(filename, open_mode):
248 """Try to open the given filename, and slightly tweak it if this fails.
250 Attempts to open the given filename. If this fails, it tries to change
251 the filename slightly, step by step, until it's either able to open it
252 or it fails and raises a final exception, like the standard open()
255 It returns the tuple (stream, definitive_file_name).
259 if sys.platform == 'win32':
261 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
262 return (sys.stdout, filename)
263 stream = open(filename, open_mode)
264 return (stream, filename)
265 except (IOError, OSError), err:
266 # In case of error, try to remove win32 forbidden chars
267 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
269 # An exception here should be caught in the caller
270 stream = open(filename, open_mode)
271 return (stream, filename)
274 def timeconvert(timestr):
275 """Convert RFC 2822 defined time string into system timestamp"""
277 timetuple = email.utils.parsedate_tz(timestr)
278 if timetuple is not None:
279 timestamp = email.utils.mktime_tz(timetuple)
def _simplify_title(title):
    """Collapse each run of filename-unsafe characters in *title* into '_'."""
    unsafe_chars = re.compile(r'[^\w\d_\-]+')
    return unsafe_chars.sub(u'_', title)
285 class DownloadError(Exception):
286 """Download Error exception.
288 This exception may be thrown by FileDownloader objects if they are not
289 configured to continue on errors. They will contain the appropriate
295 class SameFileError(Exception):
296 """Same File exception.
298 This exception will be thrown by FileDownloader objects if they detect
299 multiple files would have to be downloaded to the same file on disk.
304 class PostProcessingError(Exception):
305 """Post Processing exception.
307 This exception may be raised by PostProcessor's .run() method to
308 indicate an error in the postprocessing task.
313 class UnavailableVideoError(Exception):
314 """Unavailable Format exception.
316 This exception will be thrown when a video is requested
317 in a format that is not available for that video.
322 class ContentTooShortError(Exception):
323 """Content Too Short exception.
325 This exception may be raised by FileDownloader objects when a file they
326 download is too small for what the server announced first, indicating
327 the connection was probably interrupted.
333 def __init__(self, downloaded, expected):
334 self.downloaded = downloaded
335 self.expected = expected
338 class YoutubeDLHandler(urllib2.HTTPHandler):
339 """Handler for HTTP requests and responses.
341 This class, when installed with an OpenerDirector, automatically adds
342 the standard headers to every HTTP request and handles gzipped and
343 deflated responses from web servers. If compression is to be avoided in
344 a particular request, the original request in the program code only has
345 to include the HTTP header "Youtubedl-No-Compression", which will be
346 removed before making the real request.
348 Part of this code was copied from:
350 http://techknack.net/python-urllib2-handlers/
352 Andrew Rowls, the author of that code, agreed to release it to the
359 return zlib.decompress(data, -zlib.MAX_WBITS)
361 return zlib.decompress(data)
364 def addinfourl_wrapper(stream, headers, url, code):
365 if hasattr(urllib2.addinfourl, 'getcode'):
366 return urllib2.addinfourl(stream, headers, url, code)
367 ret = urllib2.addinfourl(stream, headers, url)
371 def http_request(self, req):
372 for h in std_headers:
375 req.add_header(h, std_headers[h])
376 if 'Youtubedl-no-compression' in req.headers:
377 if 'Accept-encoding' in req.headers:
378 del req.headers['Accept-encoding']
379 del req.headers['Youtubedl-no-compression']
382 def http_response(self, req, resp):
385 if resp.headers.get('Content-encoding', '') == 'gzip':
386 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
387 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
388 resp.msg = old_resp.msg
390 if resp.headers.get('Content-encoding', '') == 'deflate':
391 gz = StringIO.StringIO(self.deflate(resp.read()))
392 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
393 resp.msg = old_resp.msg
397 class FileDownloader(object):
398 """File Downloader class.
400 File downloader objects are the ones responsible of downloading the
401 actual video file and writing it to disk if the user has requested
402 it, among some other tasks. In most cases there should be one per
403 program. As, given a video URL, the downloader doesn't know how to
404 extract all the needed information, task that InfoExtractors do, it
405 has to pass the URL to one of them.
407 For this, file downloader objects have a method that allows
408 InfoExtractors to be registered in a given order. When it is passed
409 a URL, the file downloader handles it to the first InfoExtractor it
410 finds that reports being able to handle it. The InfoExtractor extracts
411 all the information about the video or videos the URL refers to, and
412 asks the FileDownloader to process the video information, possibly
413 downloading the video.
415 File downloaders accept a lot of parameters. In order not to saturate
416 the object constructor with arguments, it receives a dictionary of
417 options instead. These options are available through the params
418 attribute for the InfoExtractors to use. The FileDownloader also
419 registers itself as the downloader in charge for the InfoExtractors
420 that are added to it, so this is a "mutual registration".
424 username: Username for authentication purposes.
425 password: Password for authentication purposes.
426 usenetrc: Use netrc for authentication instead.
427 quiet: Do not print messages to stdout.
428 forceurl: Force printing final URL.
429 forcetitle: Force printing title.
430 forcethumbnail: Force printing thumbnail URL.
431 forcedescription: Force printing description.
432 forcefilename: Force printing final filename.
433 simulate: Do not download the video files.
434 format: Video format code.
435 format_limit: Highest quality format to try.
436 outtmpl: Template for output names.
437 ignoreerrors: Do not stop on download errors.
438 ratelimit: Download speed limit, in bytes/sec.
439 nooverwrites: Prevent overwriting files.
440 retries: Number of times to retry for HTTP error 5xx
441 continuedl: Try to continue downloads if possible.
442 noprogress: Do not print the progress bar.
443 playliststart: Playlist item to start at.
444 playlistend: Playlist item to end at.
445 matchtitle: Download only matching titles.
446 rejecttitle: Reject downloads for matching titles.
447 logtostderr: Log messages to stderr instead of stdout.
448 consoletitle: Display progress in console window's titlebar.
449 nopart: Do not use temporary .part files.
450 updatetime: Use the Last-modified header to set output file timestamps.
451 writedescription: Write the video description to a .description file
452 writeinfojson: Write the video description to a .info.json file
458 _download_retcode = None
459 _num_downloads = None
462 def __init__(self, params):
463 """Create a FileDownloader object with the given options."""
466 self._download_retcode = 0
467 self._num_downloads = 0
468 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
472 def format_bytes(bytes):
475 if type(bytes) is str:
480 exponent = long(math.log(bytes, 1024.0))
481 suffix = 'bkMGTPEZY'[exponent]
482 converted = float(bytes) / float(1024 ** exponent)
483 return '%.2f%s' % (converted, suffix)
486 def calc_percent(byte_counter, data_len):
489 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
492 def calc_eta(start, now, total, current):
496 if current == 0 or dif < 0.001: # One millisecond
498 rate = float(current) / dif
499 eta = long((float(total) - float(current)) / rate)
500 (eta_mins, eta_secs) = divmod(eta, 60)
503 return '%02d:%02d' % (eta_mins, eta_secs)
506 def calc_speed(start, now, bytes):
508 if bytes == 0 or dif < 0.001: # One millisecond
509 return '%10s' % '---b/s'
510 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
513 def best_block_size(elapsed_time, bytes):
514 new_min = max(bytes / 2.0, 1.0)
515 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
516 if elapsed_time < 0.001:
518 rate = bytes / elapsed_time
526 def parse_bytes(bytestr):
527 """Parse a string indicating a byte quantity into a long integer."""
528 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
531 number = float(matchobj.group(1))
532 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
533 return long(round(number * multiplier))
535 def add_info_extractor(self, ie):
536 """Add an InfoExtractor object to the end of the list."""
538 ie.set_downloader(self)
540 def add_post_processor(self, pp):
541 """Add a PostProcessor object to the end of the chain."""
543 pp.set_downloader(self)
545 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
546 """Print message to stdout if not in quiet mode."""
548 if not self.params.get('quiet', False):
549 terminator = [u'\n', u''][skip_eol]
550 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
551 self._screen_file.flush()
552 except (UnicodeEncodeError), err:
553 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Write *message* (newline-terminated) to stderr in the preferred encoding."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
560 def to_cons_title(self, message):
561 """Set console/terminal window title to message."""
562 if not self.params.get('consoletitle', False):
564 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
565 # c_wchar_p() might not be necessary if `message` is
566 # already of type unicode()
567 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
568 elif 'TERM' in os.environ:
569 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
def fixed_template(self):
    """Return True when the output template contains no %(field)s placeholders."""
    placeholder = re.search(r'(?u)%\(.+?\)s', self.params['outtmpl'])
    return placeholder is None
575 def trouble(self, message=None):
576 """Determine action to take when a download problem appears.
578 Depending on if the downloader has been configured to ignore
579 download errors or not, this method may throw an exception or
580 not when errors are found, after printing the message.
582 if message is not None:
583 self.to_stderr(message)
584 if not self.params.get('ignoreerrors', False):
585 raise DownloadError(message)
586 self._download_retcode = 1
588 def slow_down(self, start_time, byte_counter):
589 """Sleep if the download speed is over the rate limit."""
590 rate_limit = self.params.get('ratelimit', None)
591 if rate_limit is None or byte_counter == 0:
594 elapsed = now - start_time
597 speed = float(byte_counter) / elapsed
598 if speed > rate_limit:
599 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
601 def temp_name(self, filename):
602 """Returns a temporary filename for the given filename."""
603 if self.params.get('nopart', False) or filename == u'-' or \
604 (os.path.exists(filename) and not os.path.isfile(filename)):
606 return filename + u'.part'
608 def undo_temp_name(self, filename):
609 if filename.endswith(u'.part'):
610 return filename[:-len(u'.part')]
613 def try_rename(self, old_filename, new_filename):
615 if old_filename == new_filename:
617 os.rename(old_filename, new_filename)
618 except (IOError, OSError), err:
619 self.trouble(u'ERROR: unable to rename file')
621 def try_utime(self, filename, last_modified_hdr):
622 """Try to set the last-modified time of the given file."""
623 if last_modified_hdr is None:
625 if not os.path.isfile(filename):
627 timestr = last_modified_hdr
630 filetime = timeconvert(timestr)
634 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Log that the video description is being written to *descfn*."""
    message = u'[info] Writing video description to: %s' % descfn
    self.to_screen(message, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """Log that the JSON metadata file *infofn* is being written."""
    message = u'[info] Video description metadata as JSON to: %s' % infofn
    self.to_screen(message, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Log the destination *filename* of the download."""
    message = u'[download] Destination: %s' % filename
    self.to_screen(message, ignore_encoding_errors=True)
651 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
652 """Report download progress."""
653 if self.params.get('noprogress', False):
655 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
656 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
657 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
658 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Log that the download resumes at byte offset *resume_len*."""
    message = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(message)
def report_retry(self, count, retries):
    """Log retry attempt *count* of *retries* after an HTTP 5xx error."""
    message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(message)
668 def report_file_already_downloaded(self, file_name):
669 """Report file has already been fully downloaded."""
671 self.to_screen(u'[download] %s has already been downloaded' % file_name)
672 except (UnicodeEncodeError), err:
673 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Log that resuming the download was impossible."""
    message = u'[download] Unable to resume'
    self.to_screen(message)
679 def report_finish(self):
680 """Report download finished."""
681 if self.params.get('noprogress', False):
682 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Advance the ordinal used to number each downloaded file."""
    self._num_downloads = self._num_downloads + 1
690 def prepare_filename(self, info_dict):
691 """Generate the output filename."""
693 template_dict = dict(info_dict)
694 template_dict['epoch'] = unicode(long(time.time()))
695 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
696 filename = self.params['outtmpl'] % template_dict
698 except (ValueError, KeyError), err:
699 self.trouble(u'ERROR: invalid system charset or erroneous output template')
702 def process_info(self, info_dict):
703 """Process a single dictionary returned by an InfoExtractor."""
704 filename = self.prepare_filename(info_dict)
707 if self.params.get('forcetitle', False):
708 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forceurl', False):
710 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
712 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
713 if self.params.get('forcedescription', False) and 'description' in info_dict:
714 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
715 if self.params.get('forcefilename', False) and filename is not None:
716 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
717 if self.params.get('forceformat', False):
718 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
720 # Do nothing else if in simulate mode
721 if self.params.get('simulate', False):
727 matchtitle=self.params.get('matchtitle',False)
728 rejecttitle=self.params.get('rejecttitle',False)
729 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
730 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
731 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
733 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
734 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
737 if self.params.get('nooverwrites', False) and os.path.exists(filename):
738 self.to_stderr(u'WARNING: file exists and will be skipped')
742 dn = os.path.dirname(filename)
743 if dn != '' and not os.path.exists(dn):
745 except (OSError, IOError), err:
746 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
749 if self.params.get('writedescription', False):
751 descfn = filename + '.description'
752 self.report_writedescription(descfn)
753 descfile = open(descfn, 'wb')
755 descfile.write(info_dict['description'].encode('utf-8'))
758 except (OSError, IOError):
759 self.trouble(u'ERROR: Cannot write description file ' + descfn)
762 if self.params.get('writeinfojson', False):
763 infofn = filename + '.info.json'
764 self.report_writeinfojson(infofn)
767 except (NameError,AttributeError):
768 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
771 infof = open(infofn, 'wb')
773 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
774 json.dump(json_info_dict, infof)
777 except (OSError, IOError):
778 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
781 if not self.params.get('skip_download', False):
783 success = self._do_download(filename, info_dict)
784 except (OSError, IOError), err:
785 raise UnavailableVideoError
786 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
787 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
789 except (ContentTooShortError, ), err:
790 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
795 self.post_process(filename, info_dict)
796 except (PostProcessingError), err:
797 self.trouble(u'ERROR: postprocessing: %s' % str(err))
800 def download(self, url_list):
801 """Download a given list of URLs."""
802 if len(url_list) > 1 and self.fixed_template():
803 raise SameFileError(self.params['outtmpl'])
806 suitable_found = False
808 # Go to next InfoExtractor if not suitable
809 if not ie.suitable(url):
812 # Suitable InfoExtractor found
813 suitable_found = True
815 # Extract information from URL and process it
818 # Suitable InfoExtractor had been found; go to next URL
821 if not suitable_found:
822 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
824 return self._download_retcode
826 def post_process(self, filename, ie_info):
827 """Run the postprocessing chain on the given file."""
829 info['filepath'] = filename
835 def _download_with_rtmpdump(self, filename, url, player_url):
836 self.report_destination(filename)
837 tmpfilename = self.temp_name(filename)
839 # Check for rtmpdump first
841 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
842 except (OSError, IOError):
843 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
846 # Download using rtmpdump. rtmpdump returns exit code 2 when
847 # the connection was interrumpted and resuming appears to be
848 # possible. This is part of rtmpdump's normal usage, AFAIK.
849 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
850 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
851 while retval == 2 or retval == 1:
852 prevsize = os.path.getsize(tmpfilename)
853 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
854 time.sleep(5.0) # This seems to be needed
855 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
856 cursize = os.path.getsize(tmpfilename)
857 if prevsize == cursize and retval == 1:
859 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
860 if prevsize == cursize and retval == 2 and cursize > 1024:
861 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
865 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
866 self.try_rename(tmpfilename, filename)
869 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
872 def _do_download(self, filename, info_dict):
873 url = info_dict['url']
874 player_url = info_dict.get('player_url', None)
876 # Check file already present
877 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
878 self.report_file_already_downloaded(filename)
881 # Attempt to download using rtmpdump
882 if url.startswith('rtmp'):
883 return self._download_with_rtmpdump(filename, url, player_url)
885 tmpfilename = self.temp_name(filename)
888 # Do not include the Accept-Encoding header
889 headers = {'Youtubedl-no-compression': 'True'}
890 basic_request = urllib2.Request(url, None, headers)
891 request = urllib2.Request(url, None, headers)
893 # Establish possible resume length
894 if os.path.isfile(tmpfilename):
895 resume_len = os.path.getsize(tmpfilename)
901 if self.params.get('continuedl', False):
902 self.report_resuming_byte(resume_len)
903 request.add_header('Range','bytes=%d-' % resume_len)
909 retries = self.params.get('retries', 0)
910 while count <= retries:
911 # Establish connection
913 if count == 0 and 'urlhandle' in info_dict:
914 data = info_dict['urlhandle']
915 data = urllib2.urlopen(request)
917 except (urllib2.HTTPError, ), err:
918 if (err.code < 500 or err.code >= 600) and err.code != 416:
919 # Unexpected HTTP error
921 elif err.code == 416:
922 # Unable to resume (requested range not satisfiable)
924 # Open the connection again without the range header
925 data = urllib2.urlopen(basic_request)
926 content_length = data.info()['Content-Length']
927 except (urllib2.HTTPError, ), err:
928 if err.code < 500 or err.code >= 600:
931 # Examine the reported length
932 if (content_length is not None and
933 (resume_len - 100 < long(content_length) < resume_len + 100)):
934 # The file had already been fully downloaded.
935 # Explanation to the above condition: in issue #175 it was revealed that
936 # YouTube sometimes adds or removes a few bytes from the end of the file,
937 # changing the file size slightly and causing problems for some users. So
938 # I decided to implement a suggested change and consider the file
939 # completely downloaded if the file size differs less than 100 bytes from
940 # the one in the hard drive.
941 self.report_file_already_downloaded(filename)
942 self.try_rename(tmpfilename, filename)
945 # The length does not match, we start the download over
946 self.report_unable_to_resume()
952 self.report_retry(count, retries)
955 self.trouble(u'ERROR: giving up after %s retries' % retries)
958 data_len = data.info().get('Content-length', None)
959 if data_len is not None:
960 data_len = long(data_len) + resume_len
961 data_len_str = self.format_bytes(data_len)
962 byte_counter = 0 + resume_len
968 data_block = data.read(block_size)
970 if len(data_block) == 0:
972 byte_counter += len(data_block)
974 # Open file just in time
977 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
978 assert stream is not None
979 filename = self.undo_temp_name(tmpfilename)
980 self.report_destination(filename)
981 except (OSError, IOError), err:
982 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
985 stream.write(data_block)
986 except (IOError, OSError), err:
987 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
989 block_size = self.best_block_size(after - before, len(data_block))
992 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
994 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
996 percent_str = self.calc_percent(byte_counter, data_len)
997 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
998 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1001 self.slow_down(start, byte_counter - resume_len)
1004 self.trouble(u'\nERROR: Did not get any data blocks')
1007 self.report_finish()
1008 if data_len is not None and byte_counter != data_len:
1009 raise ContentTooShortError(byte_counter, long(data_len))
1010 self.try_rename(tmpfilename, filename)
1012 # Update file modification time
1013 if self.params.get('updatetime', True):
1014 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1019 class InfoExtractor(object):
1020 """Information Extractor class.
1022 Information extractors are the classes that, given a URL, extract
1023 information from the video (or videos) the URL refers to. This
1024 information includes the real video URL, the video title and simplified
1025 title, author and others. The information is stored in a dictionary
1026 which is then passed to the FileDownloader. The FileDownloader
1027 processes this information possibly downloading the video to the file
1028 system, among other possible outcomes. The dictionaries must include
1029 the following fields:
1031 id: Video identifier.
1032 url: Final video URL.
1033 uploader: Nickname of the video uploader.
1034 title: Literal title.
1035 stitle: Simplified title.
1036 ext: Video filename extension.
1037 format: Video format.
1038 player_url: SWF Player URL (may be None).
1040 The following fields are optional. Their primary purpose is to allow
1041 youtube-dl to serve as the backend for a video search function, such
1042 as the one in youtube2mp3. They are only used when their respective
1043 forced printing functions are called:
1045 thumbnail: Full URL to a video thumbnail image.
1046 description: One-line video description.
1048 Subclasses of this one should re-define the _real_initialize() and
1049 _real_extract() methods and define a _VALID_URL regexp.
1050 Probably, they should also be added to the list of extractors.
1056 def __init__(self, downloader=None):
1057 """Constructor. Receives an optional downloader."""
1059 self.set_downloader(downloader)
def suitable(self, url):
    """Return True when *url* matches this extractor's _VALID_URL pattern."""
    match = re.match(self._VALID_URL, url)
    return match is not None
1065 def initialize(self):
1066 """Initializes an instance (authentication, etc)."""
1068 self._real_initialize()
1071 def extract(self, url):
1072 """Extracts URL information and returns it in list of dicts."""
1074 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* as the FileDownloader this extractor reports to."""
    self._downloader = downloader
1080 def _real_initialize(self):
1081 """Real initialization process. Redefine in subclasses."""
1084 def _real_extract(self, url):
1085 """Real extraction process. Redefine in subclasses."""
1089 class YoutubeIE(InfoExtractor):
1090 """Information extractor for youtube.com."""
1092 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1093 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1094 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1095 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1096 _NETRC_MACHINE = 'youtube'
1097 # Listed in order of quality
1098 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1099 _video_extensions = {
1105 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1110 _video_dimensions = {
1125 IE_NAME = u'youtube'
def report_lang(self):
    """Log the 'Setting language' status line."""
    message = u'[youtube] Setting language'
    self._downloader.to_screen(message)
def report_login(self):
    """Log the 'Logging in' status line."""
    message = u'[youtube] Logging in'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Log the 'Confirming age' status line."""
    message = u'[youtube] Confirming age'
    self._downloader.to_screen(message)
1139 def report_video_webpage_download(self, video_id):
1140 """Report attempt to download video webpage."""
1141 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1143 def report_video_info_webpage_download(self, video_id):
1144 """Report attempt to download video info webpage."""
1145 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1147 def report_information_extraction(self, video_id):
1148 """Report attempt to extract video information."""
1149 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1155 def report_rtmp_download(self):
1156 """Indicate the download will use the RTMP protocol."""
1157 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _print_formats(self, formats):
        # Print one line per itag: "<itag>\t:\t<ext>\t[dimensions]".
        print 'Available formats:'
        # NOTE(review): the "for x in formats:" loop header is elided in this chunk.
        print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
    def _real_initialize(self):
        """Set language to English, then log in and confirm age when credentials exist.

        Credentials come from downloader params or, failing that, from ~/.netrc
        under the _NETRC_MACHINE entry. All network failures only warn except the
        age confirmation, which is fatal (trouble()).

        NOTE(review): this chunk elides several lines (early returns,
        username/password initialization, try: openers, else: branches and the
        form-dict openers); the tokens below match the source as-is.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # netrc lookup; the surrounding try: opener is elided in this chunk.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language preference (best effort — failure only warns).
        request = urllib2.Request(self._LANG_URL)
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        if username is None:

        # Login form fields (dict opener elided in this chunk).
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        login_results = urllib2.urlopen(request).read()
        # If the response still contains the login form, authentication failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Age-gate confirmation (form-dict opener elided in this chunk).
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # Unlike the steps above, failure here is treated as fatal.
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Download the watch page and get_video_info, pick format(s), and hand
        each resulting (format, URL) pair to the downloader via process_info().

        NOTE(review): this chunk elides many lines (if-None guards, try: openers,
        else: branches, returns, parts of the process_info dict); the tokens
        below match the source as-is.
        """
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            # (elided "if mobj is None:" guard)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (has_verified=1 skips some interstitials).
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JSON-style backslash escaping.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' contexts until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                # Surface YouTube's own explanation when it gives one.
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader nickname.
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # Title (percent-decoded, then UTF-8 decoded and sanitized).
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: collapse runs of non-safe chars to underscores.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Thumbnail (optional).
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scrape the human-readable date and normalize to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description: only fetched when the user asked for it.
        video_description = u'No description available.'
        if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1).decode('utf-8')
            # NOTE(review): lxml branch — presumably guarded by an elided else/try.
            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
            # TODO use another parser

        # Token from get_video_info.
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams carry no itag; format is None.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            # itag -> stream URL for every advertised format.
            url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

            # Honor --max-quality by truncating the preference list.
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            # (elided else:) neither 'conn' nor a stream map was present.
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension chosen from the itag map; flv is the fallback.
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Captures the numeric video id (group 1) and the URL slug (group 2).
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint POSTed to in order to disable the family filter.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
    def __init__(self, youtube_ie, downloader=None):
        # youtube_ie is kept so yt-prefixed Metacafe ids can be delegated
        # to the YouTube extractor (see _real_extract).
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie
1421 def report_disclaimer(self):
1422 """Report disclaimer retrieval."""
1423 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1425 def report_age_confirmation(self):
1426 """Report attempt to confirm age."""
1427 self._downloader.to_screen(u'[metacafe] Confirming age')
1429 def report_download_webpage(self, video_id):
1430 """Report webpage download."""
1431 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1433 def report_extraction(self, video_id):
1434 """Report information extraction."""
1435 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page, then POST past the family filter.

        NOTE(review): try: openers, returns and the disclaimer_form dict opener
        are elided in this chunk; tokens below match the source as-is.
        """
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age — POST the filter form (dict opener elided).
        'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        """Extract the media URL and metadata from a Metacafe watch page.

        yt-prefixed ids are delegated to the YouTube extractor. Two page layouts
        are handled: a direct mediaURL (optionally signed with gdaKey) and a
        flashvars blob whose mediaData JSON carries URL and key.

        NOTE(review): if-None guards, try: openers, else: branches and returns
        are elided in this chunk; tokens below match the source as-is.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Layout 1: direct mediaURL; extension is taken from the URL tail.
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Layout 2 (elided else:): parse the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Unescape JSON-escaped slashes and sign the URL with the key.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Captures the video id (group 1, up to the first underscore) and the
    # title slug (group 2).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'
    def __init__(self, downloader=None):
        # Plain pass-through constructor; all state lives in the base class.
        InfoExtractor.__init__(self, downloader)
1559 def report_download_webpage(self, video_id):
1560 """Report webpage download."""
1561 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1563 def report_extraction(self, video_id):
1564 """Report information extraction."""
1565 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the SD media URL and metadata from a Dailymotion page.

        The family filter is disabled via cookie; the stream URL comes from the
        page's "sequence" flashvar (sdURL field).

        NOTE(review): if-None guards, try: openers and returns are elided in
        this chunk; tokens below match the source as-is.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        # Bypass the family filter so restricted videos are still served.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        sequence = urllib.unquote(mobj.group(1))
        mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Strip the JSON backslash escaping from the URL.
        mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Matches videoplay URLs on all regional Google Video domains;
    # group 1 is the docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'
    def __init__(self, downloader=None):
        # Plain pass-through constructor; all state lives in the base class.
        InfoExtractor.__init__(self, downloader)
1647 def report_download_webpage(self, video_id):
1648 """Report webpage download."""
1649 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1651 def report_extraction(self, video_id):
1652 """Report information extraction."""
1653 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract media URL and metadata from a Google Video page.

        Prefers the mp4 download_url; falls back to the flv videoUrl embedded
        with \\x-escaping. Thumbnail is only fetched when --force-thumbnail.

        NOTE(review): if-None guards, try: openers, else: branches and returns
        are elided in this chunk; tokens below match the source as-is.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback branch (elided if/else): no mp4 download_url, use flv.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the \xNN escaping used inside the page's JavaScript.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # Thumbnail lives on the search results page, keyed by |docid|.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 is the .flv filename from the "current" query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
    def __init__(self, downloader=None):
        # Plain pass-through constructor; all state lives in the base class.
        InfoExtractor.__init__(self, downloader)
1751 def report_download_webpage(self, video_id):
1752 """Report webpage download."""
1753 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1755 def report_extraction(self, video_id):
1756 """Report information extraction."""
1757 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page.

        NOTE(review): if-None guards, try: openers and returns are elided in
        this chunk; tokens below match the source as-is.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> pattern
        # (groups 1 and 2 respectively).
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        # Plain pass-through constructor; all state lives in the base class.
        InfoExtractor.__init__(self, downloader)
1830 def report_download_webpage(self, video_id):
1831 """Report webpage download."""
1832 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1834 def report_extraction(self, video_id):
1835 """Report information extraction."""
1836 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata from a Yahoo! Video page.

        Non-/watch/ URLs are rewritten to the canonical English /watch/ form and
        re-extracted (new_video=False on the recursive call). The media URL is
        obtained from an XML playlist fetched from cosmos.bcst.yahoo.com.

        NOTE(review): if-None guards, try: openers and returns are elided in
        this chunk; tokens below match the source as-is.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical watch URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the (people|profile) alternation; the
        # display name looks like group(2) — verify against full source.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # Decode any HTML entities left in the URL.
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # Process video information
        # NOTE(review): 'thumbnail' appears twice below; the later entry wins.
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    # (plain, player., and group video pages; group 1 is the numeric id).
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
    def __init__(self, downloader=None):
        # Plain pass-through constructor; all state lives in the base class.
        InfoExtractor.__init__(self, downloader)
1980 def report_download_webpage(self, video_id):
1981 """Report webpage download."""
1982 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1984 def report_extraction(self, video_id):
1985 """Report information extraction."""
1986 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata from Vimeo's moogaloop clip XML.

        Builds the play URL from the request signature, its expiry and the
        HD/SD quality flag found in the clip document.

        NOTE(review): if-None guards, try: openers and returns are elided in
        this chunk; tokens below match the source as-is.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        mobj = re.search(r'<caption>(.*?)</caption>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # # Extract video description
        # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
        # self._downloader.trouble(u'ERROR: unable to extract video description')
        # video_description = mobj.group(1).decode('utf-8')
        # if not video_description: video_description = 'No description available.'
        # NOTE(review): description extraction is commented out upstream;
        # a placeholder string is used instead.
        video_description = 'Foo.'

        # Vimeo specific: extract request signature
        mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract request signature')
        sig = mobj.group(1).decode('utf-8')

        # Vimeo specific: extract video quality information
        mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video quality information')
        quality = mobj.group(1).decode('utf-8')

        # (elided branch bodies: choose hd/sd based on the flag)
        if int(quality) == 1:

        # Vimeo specific: Extract request signature expiration
        mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
        sig_exp = mobj.group(1).decode('utf-8')

        video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)

        # Process video information
        # NOTE(review): 'thumbnail' and 'description' each appear twice below;
        # the later entries win.
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
2092 class GenericIE(InfoExtractor):
2093 """Generic last-resort information extractor."""
2096 IE_NAME = u'generic'
2098 def __init__(self, downloader=None):
2099 InfoExtractor.__init__(self, downloader)
2101 def report_download_webpage(self, video_id):
2102 """Report webpage download."""
2103 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2104 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2106 def report_extraction(self, video_id):
2107 """Report information extraction."""
2108 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2110 def _real_extract(self, url):
2111 # At this point we have a new video
2112 self._downloader.increment_downloads()
2114 video_id = url.split('/')[-1]
2115 request = urllib2.Request(url)
2117 self.report_download_webpage(video_id)
2118 webpage = urllib2.urlopen(request).read()
2119 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2120 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2122 except ValueError, err:
2123 # since this is the last-resort InfoExtractor, if
2124 # this error is thrown, it'll be thrown here
2125 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2128 self.report_extraction(video_id)
2129 # Start with something easy: JW Player in SWFObject
2130 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2132 # Broaden the search a little bit
2133 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2135 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2138 # It's possible that one of the regexes
2139 # matched, but returned an empty group:
2140 if mobj.group(1) is None:
2141 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2144 video_url = urllib.unquote(mobj.group(1))
2145 video_id = os.path.basename(video_url)
2147 # here's a fun little line of code for you:
2148 video_extension = os.path.splitext(video_id)[1][1:]
2149 video_id = os.path.splitext(video_id)[0]
2151 # it's tempting to parse this further, but you would
2152 # have to take into account all the variations like
2153 # Video Title - Site Name
2154 # Site Name | Video Title
2155 # Video Title - Tagline | Site Name
2156 # and so on and so forth; it's just not practical
2157 mobj = re.search(r'<title>(.*)</title>', webpage)
2159 self._downloader.trouble(u'ERROR: unable to extract title')
2161 video_title = mobj.group(1).decode('utf-8')
2162 video_title = sanitize_title(video_title)
2163 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2165 # video uploader is domain name
2166 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2168 self._downloader.trouble(u'ERROR: unable to extract title')
2170 video_uploader = mobj.group(1).decode('utf-8')
2173 # Process video information
2174 self._downloader.process_info({
2175 'id': video_id.decode('utf-8'),
2176 'url': video_url.decode('utf-8'),
2177 'uploader': video_uploader,
2178 'upload_date': u'NA',
2179 'title': video_title,
2180 'stitle': simple_title,
2181 'ext': video_extension.decode('utf-8'),
2185 except UnavailableVideoError, err:
2186 self._downloader.trouble(u'\nERROR: unable to download video')
2189 class YoutubeSearchIE(InfoExtractor):
2190 """Information Extractor for YouTube search queries."""
# Handles "ytsearchN:query" / "ytsearchall:query" pseudo-URLs and delegates
# each found video to the wrapped YoutubeIE instance.
2191 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2192 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2193 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2194 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2196 _max_youtube_results = 1000
2197 IE_NAME = u'youtube:search'
2199 def __init__(self, youtube_ie, downloader=None):
2200 InfoExtractor.__init__(self, downloader)
# The actual per-video extraction is delegated to this YoutubeIE.
2201 self._youtube_ie = youtube_ie
2203 def report_download_page(self, query, pagenum):
2204 """Report attempt to download playlist page with given number."""
2205 query = query.decode(preferredencoding())
2206 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2208 def _real_initialize(self):
2209 self._youtube_ie.initialize()
2211 def _real_extract(self, query):
# Parse the "ytsearch<N>:terms" prefix; N may be a number, "all", or absent
# (absent/unparsable -> 1 result; "all" -> _max_youtube_results).
2212 mobj = re.match(self._VALID_URL, query)
2214 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2217 prefix, query = query.split(':')
2219 query = query.encode('utf-8')
2221 self._download_n_results(query, 1)
2223 elif prefix == 'all':
2224 self._download_n_results(query, self._max_youtube_results)
2230 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2232 elif n > self._max_youtube_results:
2233 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2234 n = self._max_youtube_results
2235 self._download_n_results(query, n)
2237 except ValueError: # parsing prefix as integer fails
2238 self._download_n_results(query, 1)
2241 def _download_n_results(self, query, n):
2242 """Downloads a specified number of results for a query"""
# Walks result pages, collecting unique video ids until n are found or no
# "Next" link remains, then hands every id to the wrapped YoutubeIE.
2245 already_seen = set()
2249 self.report_download_page(query, pagenum)
2250 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2251 request = urllib2.Request(result_url)
2253 page = urllib2.urlopen(request).read()
2254 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2255 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2258 # Extract video identifiers
2259 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# The raw match looks like href="/watch?v=ID": take the text after the
# second '=' and strip the closing quote to get the id.
2260 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2261 if video_id not in already_seen:
2262 video_ids.append(video_id)
2263 already_seen.add(video_id)
2264 if len(video_ids) == n:
2265 # Specified n videos reached
2266 for id in video_ids:
2267 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No further result pages: extract whatever was collected.
2270 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2271 for id in video_ids:
2272 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2275 pagenum = pagenum + 1
2278 class GoogleSearchIE(InfoExtractor):
2279 """Information Extractor for Google Video search queries."""
# Handles "gvsearchN:query" / "gvsearchall:query"; mirrors YoutubeSearchIE
# but delegates to a GoogleIE instance.
2280 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2281 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2282 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2283 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2285 _max_google_results = 1000
2286 IE_NAME = u'video.google:search'
2288 def __init__(self, google_ie, downloader=None):
2289 InfoExtractor.__init__(self, downloader)
2290 self._google_ie = google_ie
2292 def report_download_page(self, query, pagenum):
2293 """Report attempt to download playlist page with given number."""
2294 query = query.decode(preferredencoding())
2295 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2297 def _real_initialize(self):
2298 self._google_ie.initialize()
2300 def _real_extract(self, query):
# Same prefix handling as YoutubeSearchIE: number, "all", or default of 1.
2301 mobj = re.match(self._VALID_URL, query)
2303 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2306 prefix, query = query.split(':')
2308 query = query.encode('utf-8')
2310 self._download_n_results(query, 1)
2312 elif prefix == 'all':
2313 self._download_n_results(query, self._max_google_results)
2319 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2321 elif n > self._max_google_results:
2322 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2323 n = self._max_google_results
2324 self._download_n_results(query, n)
2326 except ValueError: # parsing prefix as integer fails
2327 self._download_n_results(query, 1)
2330 def _download_n_results(self, query, n):
2331 """Downloads a specified number of results for a query"""
# Collect unique docids page by page; stop at n hits or when the page has
# no "Next" marker, then hand ids to the wrapped GoogleIE.
2334 already_seen = set()
2338 self.report_download_page(query, pagenum)
2339 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2340 request = urllib2.Request(result_url)
2342 page = urllib2.urlopen(request).read()
2343 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2344 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2347 # Extract video identifiers
2348 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Unlike YoutubeSearchIE, the indicator regex captures the id directly.
2349 video_id = mobj.group(1)
2350 if video_id not in already_seen:
2351 video_ids.append(video_id)
2352 already_seen.add(video_id)
2353 if len(video_ids) == n:
2354 # Specified n videos reached
2355 for id in video_ids:
2356 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2359 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2360 for id in video_ids:
2361 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2364 pagenum = pagenum + 1
2367 class YahooSearchIE(InfoExtractor):
2368 """Information Extractor for Yahoo! Video search queries."""
# Handles "yvsearchN:query" / "yvsearchall:query"; same structure as the
# YouTube/Google search extractors, delegating to a YahooIE instance.
2369 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2370 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2371 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2372 _MORE_PAGES_INDICATOR = r'\s*Next'
2374 _max_yahoo_results = 1000
2375 IE_NAME = u'video.yahoo:search'
2377 def __init__(self, yahoo_ie, downloader=None):
2378 InfoExtractor.__init__(self, downloader)
2379 self._yahoo_ie = yahoo_ie
2381 def report_download_page(self, query, pagenum):
2382 """Report attempt to download playlist page with given number."""
2383 query = query.decode(preferredencoding())
2384 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2386 def _real_initialize(self):
2387 self._yahoo_ie.initialize()
2389 def _real_extract(self, query):
# Prefix handling: number, "all", or default 1 result on absent/bad prefix.
2390 mobj = re.match(self._VALID_URL, query)
2392 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2395 prefix, query = query.split(':')
2397 query = query.encode('utf-8')
2399 self._download_n_results(query, 1)
2401 elif prefix == 'all':
2402 self._download_n_results(query, self._max_yahoo_results)
2408 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2410 elif n > self._max_yahoo_results:
2411 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2412 n = self._max_yahoo_results
2413 self._download_n_results(query, n)
2415 except ValueError: # parsing prefix as integer fails
2416 self._download_n_results(query, 1)
2419 def _download_n_results(self, query, n):
2420 """Downloads a specified number of results for a query"""
# Accumulate unique "id1/id2" watch paths until n results or no Next link,
# then delegate each to the wrapped YahooIE.
2423 already_seen = set()
2427 self.report_download_page(query, pagenum)
2428 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2429 request = urllib2.Request(result_url)
2431 page = urllib2.urlopen(request).read()
2432 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2433 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2436 # Extract video identifiers
2437 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2438 video_id = mobj.group(1)
2439 if video_id not in already_seen:
2440 video_ids.append(video_id)
2441 already_seen.add(video_id)
2442 if len(video_ids) == n:
2443 # Specified n videos reached
2444 for id in video_ids:
2445 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2448 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2449 for id in video_ids:
2450 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2453 pagenum = pagenum + 1
2456 class YoutubePlaylistIE(InfoExtractor):
2457 """Information Extractor for YouTube playlists."""
2459 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2460 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2461 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2462 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2464 IE_NAME = u'youtube:playlist'
2466 def __init__(self, youtube_ie, downloader=None):
2467 InfoExtractor.__init__(self, downloader)
2468 self._youtube_ie = youtube_ie
2470 def report_download_page(self, playlist_id, pagenum):
2471 """Report attempt to download playlist page with given number."""
2472 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2474 def _real_initialize(self):
2475 self._youtube_ie.initialize()
2477 def _real_extract(self, url):
2478 # Extract playlist id
2479 mobj = re.match(self._VALID_URL, url)
2481 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2485 if mobj.group(3) is not None:
2486 self._youtube_ie.extract(mobj.group(3))
2489 # Download playlist pages
2490 # prefix is 'p' as default for playlists but there are other types that need extra care
2491 playlist_prefix = mobj.group(1)
2492 if playlist_prefix == 'a':
2493 playlist_access = 'artist'
2495 playlist_prefix = 'p'
2496 playlist_access = 'view_play_list'
2497 playlist_id = mobj.group(2)
2502 self.report_download_page(playlist_id, pagenum)
2503 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2504 request = urllib2.Request(url)
2506 page = urllib2.urlopen(request).read()
2507 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2508 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2511 # Extract video identifiers
2513 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2514 if mobj.group(1) not in ids_in_page:
2515 ids_in_page.append(mobj.group(1))
2516 video_ids.extend(ids_in_page)
2518 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2520 pagenum = pagenum + 1
2522 playliststart = self._downloader.params.get('playliststart', 1) - 1
2523 playlistend = self._downloader.params.get('playlistend', -1)
2524 video_ids = video_ids[playliststart:playlistend]
2526 for id in video_ids:
2527 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2531 class YoutubeUserIE(InfoExtractor):
2532 """Information Extractor for YouTube users."""
2534 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2535 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2536 _GDATA_PAGE_SIZE = 50
2537 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2538 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2540 IE_NAME = u'youtube:user'
2542 def __init__(self, youtube_ie, downloader=None):
2543 InfoExtractor.__init__(self, downloader)
2544 self._youtube_ie = youtube_ie
2546 def report_download_page(self, username, start_index):
2547 """Report attempt to download user page."""
2548 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2549 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2551 def _real_initialize(self):
2552 self._youtube_ie.initialize()
2554 def _real_extract(self, url):
2556 mobj = re.match(self._VALID_URL, url)
2558 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2561 username = mobj.group(1)
2563 # Download video ids using YouTube Data API. Result size per
2564 # query is limited (currently to 50 videos) so we need to query
2565 # page by page until there are no video ids - it means we got
2572 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2573 self.report_download_page(username, start_index)
2575 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2578 page = urllib2.urlopen(request).read()
2579 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2580 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2583 # Extract video identifiers
2586 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2587 if mobj.group(1) not in ids_in_page:
2588 ids_in_page.append(mobj.group(1))
2590 video_ids.extend(ids_in_page)
2592 # A little optimization - if current page is not
2593 # "full", ie. does not contain PAGE_SIZE video ids then
2594 # we can assume that this page is the last one - there
2595 # are no more ids on further pages - no need to query
2598 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2603 all_ids_count = len(video_ids)
2604 playliststart = self._downloader.params.get('playliststart', 1) - 1
2605 playlistend = self._downloader.params.get('playlistend', -1)
2607 if playlistend == -1:
2608 video_ids = video_ids[playliststart:]
2610 video_ids = video_ids[playliststart:playlistend]
2612 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2613 (username, all_ids_count, len(video_ids)))
2615 for video_id in video_ids:
2616 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2619 class DepositFilesIE(InfoExtractor):
2620 """Information extractor for depositfiles.com"""
2622 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2623 IE_NAME = u'DepositFiles'
2625 def __init__(self, downloader=None):
2626 InfoExtractor.__init__(self, downloader)
2628 def report_download_webpage(self, file_id):
2629 """Report webpage download."""
2630 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2632 def report_extraction(self, file_id):
2633 """Report information extraction."""
2634 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2636 def _real_extract(self, url):
2637 # At this point we have a new file
2638 self._downloader.increment_downloads()
2640 file_id = url.split('/')[-1]
2641 # Rebuild url in english locale
2642 url = 'http://depositfiles.com/en/files/' + file_id
2644 # Retrieve file webpage with 'Free download' button pressed
2645 free_download_indication = { 'gateway_result' : '1' }
2646 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2648 self.report_download_webpage(file_id)
2649 webpage = urllib2.urlopen(request).read()
2650 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2651 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2654 # Search for the real file URL
2655 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2656 if (mobj is None) or (mobj.group(1) is None):
2657 # Try to figure out reason of the error.
2658 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2659 if (mobj is not None) and (mobj.group(1) is not None):
2660 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2661 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2663 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2666 file_url = mobj.group(1)
2667 file_extension = os.path.splitext(file_url)[1][1:]
2669 # Search for file title
2670 mobj = re.search(r'<b title="(.*?)">', webpage)
2672 self._downloader.trouble(u'ERROR: unable to extract title')
2674 file_title = mobj.group(1).decode('utf-8')
2677 # Process file information
2678 self._downloader.process_info({
2679 'id': file_id.decode('utf-8'),
2680 'url': file_url.decode('utf-8'),
2682 'upload_date': u'NA',
2683 'title': file_title,
2684 'stitle': file_title,
2685 'ext': file_extension.decode('utf-8'),
2689 except UnavailableVideoError, err:
2690 self._downloader.trouble(u'ERROR: unable to download file')
2693 class FacebookIE(InfoExtractor):
2694 """Information Extractor for Facebook"""
2696 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2697 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2698 _NETRC_MACHINE = 'facebook'
2699 _available_formats = ['video', 'highqual', 'lowqual']
2700 _video_extensions = {
2705 IE_NAME = u'facebook'
2707 def __init__(self, downloader=None):
2708 InfoExtractor.__init__(self, downloader)
2710 def _reporter(self, message):
2711 """Add header and report message."""
2712 self._downloader.to_screen(u'[facebook] %s' % message)
2714 def report_login(self):
2715 """Report attempt to log in."""
2716 self._reporter(u'Logging in')
2718 def report_video_webpage_download(self, video_id):
2719 """Report attempt to download video webpage."""
2720 self._reporter(u'%s: Downloading video webpage' % video_id)
2722 def report_information_extraction(self, video_id):
2723 """Report attempt to extract video information."""
2724 self._reporter(u'%s: Extracting video information' % video_id)
2726 def _parse_page(self, video_webpage):
2727 """Extract video information from page"""
2729 data = {'title': r'\("video_title", "(.*?)"\)',
2730 'description': r'<div class="datawrap">(.*?)</div>',
2731 'owner': r'\("video_owner_name", "(.*?)"\)',
2732 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2735 for piece in data.keys():
2736 mobj = re.search(data[piece], video_webpage)
2737 if mobj is not None:
2738 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2742 for fmt in self._available_formats:
2743 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2744 if mobj is not None:
2745 # URL is in a Javascript segment inside an escaped Unicode format within
2746 # the generally utf-8 page
2747 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2748 video_info['video_urls'] = video_urls
2752 def _real_initialize(self):
2753 if self._downloader is None:
2758 downloader_params = self._downloader.params
2760 # Attempt to use provided username and password or .netrc data
2761 if downloader_params.get('username', None) is not None:
2762 useremail = downloader_params['username']
2763 password = downloader_params['password']
2764 elif downloader_params.get('usenetrc', False):
2766 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2767 if info is not None:
2771 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2772 except (IOError, netrc.NetrcParseError), err:
2773 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2776 if useremail is None:
2785 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2788 login_results = urllib2.urlopen(request).read()
2789 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2790 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2792 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2793 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2796 def _real_extract(self, url):
2797 mobj = re.match(self._VALID_URL, url)
2799 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2801 video_id = mobj.group('ID')
2804 self.report_video_webpage_download(video_id)
2805 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2807 page = urllib2.urlopen(request)
2808 video_webpage = page.read()
2809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2810 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2813 # Start extracting information
2814 self.report_information_extraction(video_id)
2816 # Extract information
2817 video_info = self._parse_page(video_webpage)
2820 if 'owner' not in video_info:
2821 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2823 video_uploader = video_info['owner']
2826 if 'title' not in video_info:
2827 self._downloader.trouble(u'ERROR: unable to extract video title')
2829 video_title = video_info['title']
2830 video_title = video_title.decode('utf-8')
2831 video_title = sanitize_title(video_title)
2834 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2835 simple_title = simple_title.strip(ur'_')
2838 if 'thumbnail' not in video_info:
2839 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2840 video_thumbnail = ''
2842 video_thumbnail = video_info['thumbnail']
2846 if 'upload_date' in video_info:
2847 upload_time = video_info['upload_date']
2848 timetuple = email.utils.parsedate_tz(upload_time)
2849 if timetuple is not None:
2851 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2856 video_description = video_info.get('description', 'No description available.')
2858 url_map = video_info['video_urls']
2859 if len(url_map.keys()) > 0:
2860 # Decide which formats to download
2861 req_format = self._downloader.params.get('format', None)
2862 format_limit = self._downloader.params.get('format_limit', None)
2864 if format_limit is not None and format_limit in self._available_formats:
2865 format_list = self._available_formats[self._available_formats.index(format_limit):]
2867 format_list = self._available_formats
2868 existing_formats = [x for x in format_list if x in url_map]
2869 if len(existing_formats) == 0:
2870 self._downloader.trouble(u'ERROR: no known formats available for video')
2872 if req_format is None:
2873 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2874 elif req_format == 'worst':
2875 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2876 elif req_format == '-1':
2877 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2880 if req_format not in url_map:
2881 self._downloader.trouble(u'ERROR: requested format not available')
2883 video_url_list = [(req_format, url_map[req_format])] # Specific format
2885 for format_param, video_real_url in video_url_list:
2887 # At this point we have a new video
2888 self._downloader.increment_downloads()
2891 video_extension = self._video_extensions.get(format_param, 'mp4')
2894 # Process video information
2895 self._downloader.process_info({
2896 'id': video_id.decode('utf-8'),
2897 'url': video_real_url.decode('utf-8'),
2898 'uploader': video_uploader.decode('utf-8'),
2899 'upload_date': upload_date,
2900 'title': video_title,
2901 'stitle': simple_title,
2902 'ext': video_extension.decode('utf-8'),
2903 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2904 'thumbnail': video_thumbnail.decode('utf-8'),
2905 'description': video_description.decode('utf-8'),
2908 except UnavailableVideoError, err:
2909 self._downloader.trouble(u'\nERROR: unable to download video')
2911 class BlipTVIE(InfoExtractor):
2912 """Information extractor for blip.tv"""
# Asks blip.tv for a JSON description of the page; if the server instead
# answers with a video/* Content-Type the URL is already a direct download.
2914 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2915 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2916 IE_NAME = u'blip.tv'
2918 def report_extraction(self, file_id):
2919 """Report information extraction."""
2920 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2922 def report_direct_download(self, title):
2923 """Report information extraction."""
2924 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2926 def _simplify_title(self, title):
# Collapse every run of non-alphanumeric characters to '_' and trim.
2927 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2928 res = res.strip(ur'_')
2931 def _real_extract(self, url):
2932 mobj = re.match(self._VALID_URL, url)
2934 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Request the JSON skin of the page (cchar is '?' or '&' depending on the URL).
2941 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2942 request = urllib2.Request(json_url)
2943 self.report_extraction(mobj.group(1))
2946 urlh = urllib2.urlopen(request)
2947 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2948 basename = url.split('/')[-1]
2949 title,ext = os.path.splitext(basename)
2950 ext = ext.replace('.', '')
2951 self.report_direct_download(title)
2956 'stitle': self._simplify_title(title),
2960 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2961 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2963 if info is None: # Regular URL
2965 json_code = urlh.read()
2966 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2967 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2971 json_data = json.loads(json_code)
# The payload may be wrapped in a 'Post' object or be the record itself.
2972 if 'Post' in json_data:
2973 data = json_data['Post']
# blip.tv datestamps look like '%m-%d-%y %H:%M%p'; reformat to YYYYMMDD.
2977 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2978 video_url = data['media']['url']
2979 umobj = re.match(self._URL_EXT, video_url)
2981 raise ValueError('Can not determine filename extension')
2982 ext = umobj.group(1)
2985 'id': data['item_id'],
2987 'uploader': data['display_name'],
2988 'upload_date': upload_date,
2989 'title': data['title'],
2990 'stitle': self._simplify_title(data['title']),
2992 'format': data['media']['mimeType'],
2993 'thumbnail': data['thumbnailUrl'],
2994 'description': data['description'],
2995 'player_url': data['embedUrl']
2997 except (ValueError,KeyError), err:
2998 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3001 self._downloader.increment_downloads()
3004 self._downloader.process_info(info)
3005 except UnavailableVideoError, err:
3006 self._downloader.trouble(u'\nERROR: unable to download video')
3009 class MyVideoIE(InfoExtractor):
3010 """Information Extractor for myvideo.de."""
3012 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3013 IE_NAME = u'myvideo'
3015 def __init__(self, downloader=None):
3016 InfoExtractor.__init__(self, downloader)
3018 def report_download_webpage(self, video_id):
3019 """Report webpage download."""
3020 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3022 def report_extraction(self, video_id):
3023 """Report information extraction."""
3024 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3026 def _real_extract(self,url):
3027 mobj = re.match(self._VALID_URL, url)
3029 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3032 video_id = mobj.group(1)
3033 simple_title = mobj.group(2).decode('utf-8')
3034 # should actually not be necessary
3035 simple_title = sanitize_title(simple_title)
3036 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3039 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3041 self.report_download_webpage(video_id)
3042 webpage = urllib2.urlopen(request).read()
3043 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3044 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3047 self.report_extraction(video_id)
3048 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3051 self._downloader.trouble(u'ERROR: unable to extract media URL')
3053 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3055 mobj = re.search('<title>([^<]+)</title>', webpage)
3057 self._downloader.trouble(u'ERROR: unable to extract title')
3060 video_title = mobj.group(1)
3061 video_title = sanitize_title(video_title)
3064 self._downloader.process_info({
3068 'upload_date': u'NA',
3069 'title': video_title,
3070 'stitle': simple_title,
3075 except UnavailableVideoError:
3076 self._downloader.trouble(u'\nERROR: Unable to download video')
# Extracts full episodes of The Daily Show / The Colbert Report from
# comedycentral.com; also accepts ":tds"/":cr"-style shortcut pseudo-URLs.
3078 class ComedyCentralIE(InfoExtractor):
3079 """Information extractor for The Daily Show and Colbert Report """
# The "shortname" alternative matches the colon shortcuts; the full form
# captures "showname" and the "episode" path component.
3081 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3082 IE_NAME = u'comedycentral'
3084 def report_extraction(self, episode_id):
3085 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3087 def report_config_download(self, episode_id):
3088 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3090 def report_index_download(self, episode_id):
3091 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3093 def report_player_url(self, episode_id):
3094 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Collapse every run of characters outside simple_title_chars into "_",
# then trim leading/trailing underscores.
3096 def _simplify_title(self, title):
3097 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3098 res = res.strip(ur'_')
3101 def _real_extract(self, url):
3102 mobj = re.match(self._VALID_URL, url)
3104 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# A shortcut pseudo-URL is rewritten to the show's full-episodes page and
# re-matched so the named groups below are populated.
3107 if mobj.group('shortname'):
3108 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3109 url = 'http://www.thedailyshow.com/full-episodes/'
3111 url = 'http://www.colbertnation.com/full-episodes/'
3112 mobj = re.match(self._VALID_URL, url)
3113 assert mobj is not None
# No explicit episode means "download the newest one".
3115 dlNewest = not mobj.group('episode')
3117 epTitle = mobj.group('showname')
3119 epTitle = mobj.group('episode')
3121 req = urllib2.Request(url)
3122 self.report_extraction(epTitle)
3124 htmlHandle = urllib2.urlopen(req)
3125 html = htmlHandle.read()
3126 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3127 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Redirects may land on a concrete episode URL; re-validate the final URL.
3130 url = htmlHandle.geturl()
3131 mobj = re.match(self._VALID_URL, url)
3133 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3135 if mobj.group('episode') == '':
3136 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3138 epTitle = mobj.group('episode')
# The Flash <param name="movie"> embeds the mtvnservices media URI used
# both as the player URL (group 0) and the MRSS query URI (group 1).
3140 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3141 if len(mMovieParams) == 0:
3142 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3145 playerUrl_raw = mMovieParams[0][0]
3146 self.report_player_url(epTitle)
3148 urlHandle = urllib2.urlopen(playerUrl_raw)
3149 playerUrl = urlHandle.geturl()
3150 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3151 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# The MRSS index feed lists one <item> per part of the episode.
3154 uri = mMovieParams[0][1]
3155 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3156 self.report_index_download(epTitle)
3158 indexXml = urllib2.urlopen(indexUrl).read()
3159 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3160 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3163 idoc = xml.etree.ElementTree.fromstring(indexXml)
3164 itemEls = idoc.findall('.//item')
3165 for itemEl in itemEls:
# guid looks like "...:<show>.com:<mediaId>" — last segment is the media
# id, second-to-last (minus ".com") the show id.
3166 mediaId = itemEl.findall('./guid')[0].text
3167 shortMediaId = mediaId.split(':')[-1]
3168 showId = mediaId.split(':')[-2].replace('.com', '')
3169 officialTitle = itemEl.findall('./title')[0].text
3170 officialDate = itemEl.findall('./pubDate')[0].text
# Per-item config XML enumerates the available renditions (bitrate, src).
3172 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3173 urllib.urlencode({'uri': mediaId}))
3174 configReq = urllib2.Request(configUrl)
3175 self.report_config_download(epTitle)
3177 configXml = urllib2.urlopen(configReq).read()
3178 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3179 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3182 cdoc = xml.etree.ElementTree.fromstring(configXml)
3184 for rendition in cdoc.findall('.//rendition'):
3185 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3189 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3192 # For now, just pick the highest bitrate
3193 format,video_url = turls[-1]
3195 self._downloader.increment_downloads()
3197 effTitle = showId + '-' + epTitle
3202 'upload_date': officialDate,
3204 'stitle': self._simplify_title(effTitle),
3208 'description': officialTitle,
3209 'player_url': playerUrl
3213 self._downloader.process_info(info)
3214 except UnavailableVideoError, err:
3215 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Extracts videos from escapistmagazine.com via the og:* meta tags and the
# player's JSON-ish configuration file.
3219 class EscapistIE(InfoExtractor):
3220 """Information extractor for The Escapist """
3222 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3223 IE_NAME = u'escapist'
3225 def report_extraction(self, showName):
3226 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3228 def report_config_download(self, showName):
3229 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Collapse non-simple characters to "_" and trim the underscores.
3231 def _simplify_title(self, title):
3232 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3233 res = res.strip(ur'_')
3236 def _real_extract(self, url):
3237 htmlParser = HTMLParser.HTMLParser()
3239 mobj = re.match(self._VALID_URL, url)
3241 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3243 showName = mobj.group('showname')
3244 videoId = mobj.group('episode')
3246 self.report_extraction(showName)
3248 webPage = urllib2.urlopen(url).read()
3249 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3250 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# NOTE(review): the four re.search results below are used without a
# visible None-check; a page missing any of these meta tags would raise
# AttributeError on .group(1) — confirm intended behavior.
3253 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3254 description = htmlParser.unescape(descMatch.group(1))
3255 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3256 imgUrl = htmlParser.unescape(imgMatch.group(1))
3257 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3258 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The config URL is passed url-encoded in the player URL's query string.
3259 configUrlMatch = re.search('config=(.*)$', playerUrl)
3260 configUrl = urllib2.unquote(configUrlMatch.group(1))
3262 self.report_config_download(showName)
3264 configJSON = urllib2.urlopen(configUrl).read()
3265 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3266 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3269 # Technically, it's JavaScript, not JSON
3270 configJSON = configJSON.replace("'", '"')
3273 config = json.loads(configJSON)
3274 except (ValueError,), err:
3275 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The media URL is taken from the second playlist entry.
3278 playlist = config['playlist']
3279 videoUrl = playlist[1]['url']
3281 self._downloader.increment_downloads()
3285 'uploader': showName,
3286 'upload_date': None,
3288 'stitle': self._simplify_title(showName),
3291 'thumbnail': imgUrl,
3292 'description': description,
3293 'player_url': playerUrl,
3297 self._downloader.process_info(info)
3298 except UnavailableVideoError, err:
3299 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# Extracts collegehumor.com videos: the page yields an internal video id,
# which is then resolved through the site's "moogaloop" metadata XML.
3302 class CollegeHumorIE(InfoExtractor):
3303 """Information extractor for collegehumor.com"""
3305 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3306 IE_NAME = u'collegehumor'
3308 def report_webpage(self, video_id):
3309 """Report information extraction."""
3310 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3312 def report_extraction(self, video_id):
3313 """Report information extraction."""
3314 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Collapse non-simple characters to "_" and trim the underscores.
3316 def _simplify_title(self, title):
3317 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3318 res = res.strip(ur'_')
3321 def _real_extract(self, url):
3322 htmlParser = HTMLParser.HTMLParser()
3324 mobj = re.match(self._VALID_URL, url)
3326 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3328 video_id = mobj.group('videoid')
3330 self.report_webpage(video_id)
3331 request = urllib2.Request(url)
3333 webpage = urllib2.urlopen(request).read()
3334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3335 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
# The page embeds a second, internal id ('id="video:NNN"') which keys the
# metadata service below.
3338 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3340 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3342 internal_video_id = m.group('internalvideoid')
3346 'internal_id': internal_video_id,
3349 self.report_extraction(video_id)
3350 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3352 metaXml = urllib2.urlopen(xmlUrl).read()
3353 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3354 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
# Fill the info dict from the <video> node of the metadata XML; a missing
# element makes the indexing ([0]) raise, reported as invalid XML below.
3357 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3359 videoNode = mdoc.findall('./video')[0]
3360 info['description'] = videoNode.findall('./description')[0].text
3361 info['title'] = videoNode.findall('./caption')[0].text
3362 info['stitle'] = self._simplify_title(info['title'])
3363 info['url'] = videoNode.findall('./file')[0].text
3364 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
# Extension is whatever follows the last '.' of the media URL.
3365 info['ext'] = info['url'].rpartition('.')[2]
3366 info['format'] = info['ext']
3368 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3371 self._downloader.increment_downloads()
3374 self._downloader.process_info(info)
3375 except UnavailableVideoError, err:
3376 self._downloader.trouble(u'\nERROR: unable to download video')
# Extracts xvideos.com videos: media URL, title and thumbnail are all
# scraped directly from the watch page.
3379 class XVideosIE(InfoExtractor):
3380 """Information extractor for xvideos.com"""
3382 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3383 IE_NAME = u'xvideos'
3385 def report_webpage(self, video_id):
3386 """Report information extraction."""
3387 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3389 def report_extraction(self, video_id):
3390 """Report information extraction."""
3391 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Collapse non-simple characters to "_" and trim the underscores.
3393 def _simplify_title(self, title):
3394 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3395 res = res.strip(ur'_')
3398 def _real_extract(self, url):
3399 htmlParser = HTMLParser.HTMLParser()
3401 mobj = re.match(self._VALID_URL, url)
3403 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3405 video_id = mobj.group(1).decode('utf-8')
3407 self.report_webpage(video_id)
# Canonicalize the request URL from the extracted numeric id.
3409 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3411 webpage = urllib2.urlopen(request).read()
3412 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3413 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3416 self.report_extraction(video_id)
# The flv URL is url-encoded inside the page's "flv_url=" parameter.
3420 mobj = re.search(r'flv_url=(.+?)&', webpage)
3422 self._downloader.trouble(u'ERROR: unable to extract video url')
3424 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text up to the " - XVID" suffix.
3428 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3430 self._downloader.trouble(u'ERROR: unable to extract video title')
3432 video_title = mobj.group(1).decode('utf-8')
3435 # Extract video thumbnail
3436 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3438 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3440 video_thumbnail = mobj.group(1).decode('utf-8')
3444 self._downloader.increment_downloads()
3449 'upload_date': None,
3450 'title': video_title,
3451 'stitle': self._simplify_title(video_title),
3454 'thumbnail': video_thumbnail,
3455 'description': None,
3460 self._downloader.process_info(info)
3461 except UnavailableVideoError, err:
3462 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
# Extracts soundcloud.com tracks: scrapes the page for the track uid and a
# stream token, then builds the media.soundcloud.com stream URL from them.
3465 class SoundcloudIE(InfoExtractor):
3466 """Information extractor for soundcloud.com
3467 To access the media, the uid of the song and a stream token
3468 must be extracted from the page source and the script must make
3469 a request to media.soundcloud.com/crossdomain.xml. Then
3470 the media can be grabbed by requesting from an url composed
3471 of the stream token and uid
3474 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3475 IE_NAME = u'soundcloud'
3477 def __init__(self, downloader=None):
3478 InfoExtractor.__init__(self, downloader)
3480 def report_webpage(self, video_id):
3481 """Report information extraction."""
3482 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3484 def report_extraction(self, video_id):
3485 """Report information extraction."""
3486 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3488 def _real_extract(self, url):
3489 htmlParser = HTMLParser.HTMLParser()
3491 mobj = re.match(self._VALID_URL, url)
3493 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3496 # extract uploader (which is in the url)
3497 uploader = mobj.group(1).decode('utf-8')
3498 # extract simple title (uploader + slug of song title)
3499 slug_title = mobj.group(2).decode('utf-8')
3500 simple_title = uploader + '-' + slug_title
3502 self.report_webpage('%s/%s' % (uploader, slug_title))
3504 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3506 webpage = urllib2.urlopen(request).read()
3507 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3508 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3511 self.report_extraction('%s/%s' % (uploader, slug_title))
3513 # extract uid and stream token that soundcloud hands out for access
3514 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3516 video_id = mobj.group(1)
3517 stream_token = mobj.group(2)
3519 # extract unsimplified title
3520 mobj = re.search('"title":"(.*?)",', webpage)
3522 title = mobj.group(1)
3524 # construct media url (with uid/token)
3525 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3526 mediaURL = mediaURL % (video_id, stream_token)
3529 description = u'No description available'
3530 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3532 description = mobj.group(1)
# Parse the human-readable upload date into YYYYMMDD.
3536 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3539 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
# NOTE(review): 'except ... as e' requires Python >= 2.6, but the rest of
# the file uses the old 'except E, err' form (and the header mentions
# Python 2.4 fallbacks) — confirm the minimum supported interpreter.
3540 except Exception as e:
3543 # for soundcloud, a request to a cross domain is required for cookies
3544 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3547 self._downloader.process_info({
3548 'id': video_id.decode('utf-8'),
3550 'uploader': uploader.decode('utf-8'),
3551 'upload_date': upload_date,
3552 'title': simple_title.decode('utf-8'),
3553 'stitle': simple_title.decode('utf-8'),
3557 'description': description.decode('utf-8')
3559 except UnavailableVideoError:
3560 self._downloader.trouble(u'\nERROR: unable to download video')
# Extracts infoq.com presentations: the rtmpe media path is stored
# base64-encoded in the page's "jsclassref" attribute.
3563 class InfoQIE(InfoExtractor):
3564 """Information extractor for infoq.com"""
3566 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3569 def report_webpage(self, video_id):
3570 """Report information extraction."""
3571 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3573 def report_extraction(self, video_id):
3574 """Report information extraction."""
3575 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Collapse non-simple characters to "_" and trim the underscores.
3577 def _simplify_title(self, title):
3578 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3579 res = res.strip(ur'_')
3582 def _real_extract(self, url):
3583 htmlParser = HTMLParser.HTMLParser()
3585 mobj = re.match(self._VALID_URL, url)
3587 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3590 self.report_webpage(url)
3592 request = urllib2.Request(url)
3594 webpage = urllib2.urlopen(request).read()
3595 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3596 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3599 self.report_extraction(url)
# jsclassref holds the base64-encoded tail of the rtmpe URL.
3603 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3605 self._downloader.trouble(u'ERROR: unable to extract video url')
3607 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3611 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3613 self._downloader.trouble(u'ERROR: unable to extract video title')
3615 video_title = mobj.group(1).decode('utf-8')
3617 # Extract description
3618 video_description = u'No description available.'
3619 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3620 if mobj is not None:
3621 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the final path component of the media URL.
3623 video_filename = video_url.split('/')[-1]
3624 video_id, extension = video_filename.split('.')
3626 self._downloader.increment_downloads()
3631 'upload_date': None,
3632 'title': video_title,
3633 'stitle': self._simplify_title(video_title),
3635 'format': extension, # Extension is always(?) mp4, but seems to be flv
3637 'description': video_description,
3642 self._downloader.process_info(info)
3643 except UnavailableVideoError, err:
3644 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
class PostProcessor(object):
	"""Base class for post-processing steps.

	A PostProcessor is attached to a downloader via the downloader's
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, feeding run() the
	initial information dictionary and then each previous run()'s return
	value in turn. The chain stops as soon as a run() returns None (or
	the chain is exhausted).

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	def __init__(self, downloader=None):
		# Keep a reference to the owning downloader (may be None until
		# set_downloader() is called during registration).
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary with one extra
		key, "filepath", naming the downloaded file on disk. Returning
		None stops the post-processing chain; returning a (possibly
		modified) dictionary passes it on to the next PostProcessor.
		This method may raise PostProcessingError, which the downloader
		handles.
		"""
		return information # the base class is a no-op
# Post-processor that converts a downloaded video into an audio-only file
# using ffprobe (codec detection) and ffmpeg (extraction/transcoding).
3695 class FFmpegExtractAudioPP(PostProcessor):
3697 def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
3698 PostProcessor.__init__(self, downloader)
# 'best' means: keep the source audio codec when it is lossless to do so.
3699 if preferredcodec is None:
3700 preferredcodec = 'best'
3701 self._preferredcodec = preferredcodec
3702 self._preferredquality = preferredquality
3703 self._keepvideo = keepvideo
# Run ffprobe and scan its stream dump for the audio stream's codec name.
# (No 'self' parameter — presumably decorated as a @staticmethod on the
# elided preceding line; confirm.)
3706 def get_audio_codec(path):
3708 cmd = ['ffprobe', '-show_streams', '--', path]
3709 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3710 output = handle.communicate()[0]
3711 if handle.wait() != 0:
3713 except (IOError, OSError):
# codec_name lines precede the codec_type line of the same stream, so the
# last codec_name seen when 'codec_type=audio' appears is the audio codec.
3716 for line in output.split('\n'):
3717 if line.startswith('codec_name='):
3718 audio_codec = line.split('=')[1].strip()
3719 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Invoke ffmpeg to write the audio track to out_path with the given codec
# and extra options (likewise without 'self'; presumably a @staticmethod).
3724 def run_ffmpeg(path, out_path, codec, more_opts):
3726 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3727 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3729 except (IOError, OSError):
3732 def run(self, information):
3733 path = information['filepath']
3735 filecodec = self.get_audio_codec(path)
3736 if filecodec is None:
3737 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Choose target codec/extension: copy losslessly when the source already
# matches the preference (or preference is 'best'), otherwise transcode.
3741 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3742 if filecodec in ['aac', 'mp3', 'vorbis']:
3743 # Lossless if possible
3745 extension = filecodec
3746 if filecodec == 'aac':
3747 more_opts = ['-f', 'adts']
3748 if filecodec == 'vorbis':
3752 acodec = 'libmp3lame'
3755 if self._preferredquality is not None:
3756 more_opts += ['-ab', self._preferredquality]
3758 # We convert the audio (lossy)
3759 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3760 extension = self._preferredcodec
3762 if self._preferredquality is not None:
3763 more_opts += ['-ab', self._preferredquality]
3764 if self._preferredcodec == 'aac':
3765 more_opts += ['-f', 'adts']
3766 if self._preferredcodec == 'vorbis':
# Output file keeps the source basename with the new audio extension.
3769 (prefix, ext) = os.path.splitext(path)
3770 new_path = prefix + '.' + extension
3771 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3772 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3775 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3778 # Try to update the date time for extracted audio file.
3779 if information.get('filetime') is not None:
3781 os.utime(new_path, (time.time(), information['filetime']))
3783 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
# Optionally delete the original video once the audio is extracted.
3785 if not self._keepvideo:
3788 except (IOError, OSError):
3789 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Pass the new audio path down the post-processing chain.
3792 information['filepath'] = new_path
3796 def updateSelf(downloader, filename):
3797 ''' Update the program file with the latest version from the repository '''
3798 # Note: downloader only used for options
# Refuse early if the installed script cannot be overwritten.
3799 if not os.access(filename, os.W_OK):
3800 sys.exit('ERROR: no write permissions on %s' % filename)
3802 downloader.to_screen('Updating to latest version...')
# Fetch the current master copy and compare its embedded __version__
# against ours; identical versions mean nothing to do.
3806 urlh = urllib.urlopen(UPDATE_URL)
3807 newcontent = urlh.read()
3809 vmatch = re.search("__version__ = '([^']+)'", newcontent)
3810 if vmatch is not None and vmatch.group(1) == __version__:
3811 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
3815 except (IOError, OSError), err:
3816 sys.exit('ERROR: unable to download latest version')
# Overwrite this script in place with the downloaded content.
3819 outf = open(filename, 'wb')
3821 outf.write(newcontent)
3824 except (IOError, OSError), err:
3825 sys.exit('ERROR: unable to overwrite current version')
3827 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3834 def _format_option_string(option):
3835 ''' ('-o', '--option') -> -o, --format METAVAR'''
3839 if option._short_opts: opts.append(option._short_opts[0])
3840 if option._long_opts: opts.append(option._long_opts[0])
3841 if len(opts) > 1: opts.insert(1, ', ')
3843 if option.takes_value(): opts.append(' %s' % option.metavar)
3845 return "".join(opts)
# Determine the terminal width: honor $COLUMNS when set, otherwise ask the
# tty via 'stty size' (which prints "rows cols").
3847 def _find_term_columns():
3848 columns = os.environ.get('COLUMNS', None)
3853 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3854 out,err = sp.communicate()
# stty reports "rows cols"; the second field is the width.
3855 return int(out.split()[1])
# Configure the help formatter: widen the layout to the real terminal so
# long option help lines are not needlessly wrapped.
3861 max_help_position = 80
3863 # No need to wrap help messages if we're on a wide console
3864 columns = _find_term_columns()
3865 if columns: max_width = columns
3867 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3868 fmt.format_option_strings = _format_option_string
3871 'version' : __version__,
3873 'usage' : '%prog [options] url [url...]',
3874 'conflict_handler' : 'resolve',
3877 parser = optparse.OptionParser(**kw)
# One OptionGroup per functional area; groups are attached to the parser
# at the bottom, which also fixes the order they appear in --help.
3880 general = optparse.OptionGroup(parser, 'General Options')
3881 selection = optparse.OptionGroup(parser, 'Video Selection')
3882 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3883 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3884 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3885 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3886 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3888 general.add_option('-h', '--help',
3889 action='help', help='print this help text and exit')
3890 general.add_option('-v', '--version',
3891 action='version', help='print program version and exit')
3892 general.add_option('-U', '--update',
3893 action='store_true', dest='update_self', help='update this program to latest version')
3894 general.add_option('-i', '--ignore-errors',
3895 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3896 general.add_option('-r', '--rate-limit',
3897 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3898 general.add_option('-R', '--retries',
3899 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3900 general.add_option('--dump-user-agent',
3901 action='store_true', dest='dump_user_agent',
3902 help='display the current browser identification', default=False)
3903 general.add_option('--list-extractors',
3904 action='store_true', dest='list_extractors',
3905 help='List all supported extractors and the URLs they would handle', default=False)
3907 selection.add_option('--playlist-start',
3908 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3909 selection.add_option('--playlist-end',
3910 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3911 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3912 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3914 authentication.add_option('-u', '--username',
3915 dest='username', metavar='USERNAME', help='account username')
3916 authentication.add_option('-p', '--password',
3917 dest='password', metavar='PASSWORD', help='account password')
3918 authentication.add_option('-n', '--netrc',
3919 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3922 video_format.add_option('-f', '--format',
3923 action='store', dest='format', metavar='FORMAT', help='video format code')
3924 video_format.add_option('--all-formats',
3925 action='store_const', dest='format', help='download all available video formats', const='all')
3926 video_format.add_option('--max-quality',
3927 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3928 video_format.add_option('-F', '--list-formats',
3929 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
3932 verbosity.add_option('-q', '--quiet',
3933 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3934 verbosity.add_option('-s', '--simulate',
3935 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3936 verbosity.add_option('--skip-download',
3937 action='store_true', dest='skip_download', help='do not download the video', default=False)
3938 verbosity.add_option('-g', '--get-url',
3939 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3940 verbosity.add_option('-e', '--get-title',
3941 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3942 verbosity.add_option('--get-thumbnail',
3943 action='store_true', dest='getthumbnail',
3944 help='simulate, quiet but print thumbnail URL', default=False)
3945 verbosity.add_option('--get-description',
3946 action='store_true', dest='getdescription',
3947 help='simulate, quiet but print video description', default=False)
3948 verbosity.add_option('--get-filename',
3949 action='store_true', dest='getfilename',
3950 help='simulate, quiet but print output filename', default=False)
3951 verbosity.add_option('--get-format',
3952 action='store_true', dest='getformat',
3953 help='simulate, quiet but print output format', default=False)
3954 verbosity.add_option('--no-progress',
3955 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3956 verbosity.add_option('--console-title',
3957 action='store_true', dest='consoletitle',
3958 help='display progress in console titlebar', default=False)
3961 filesystem.add_option('-t', '--title',
3962 action='store_true', dest='usetitle', help='use title in file name', default=False)
3963 filesystem.add_option('-l', '--literal',
3964 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3965 filesystem.add_option('-A', '--auto-number',
3966 action='store_true', dest='autonumber',
3967 help='number downloaded files starting from 00000', default=False)
3968 filesystem.add_option('-o', '--output',
3969 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
3970 filesystem.add_option('-a', '--batch-file',
3971 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3972 filesystem.add_option('-w', '--no-overwrites',
3973 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3974 filesystem.add_option('-c', '--continue',
3975 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3976 filesystem.add_option('--no-continue',
3977 action='store_false', dest='continue_dl',
3978 help='do not resume partially downloaded files (restart from beginning)')
3979 filesystem.add_option('--cookies',
3980 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
3981 filesystem.add_option('--no-part',
3982 action='store_true', dest='nopart', help='do not use .part files', default=False)
3983 filesystem.add_option('--no-mtime',
3984 action='store_false', dest='updatetime',
3985 help='do not use the Last-modified header to set the file modification time', default=True)
3986 filesystem.add_option('--write-description',
3987 action='store_true', dest='writedescription',
3988 help='write video description to a .description file', default=False)
3989 filesystem.add_option('--write-info-json',
3990 action='store_true', dest='writeinfojson',
3991 help='write video metadata to a .info.json file', default=False)
3994 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3995 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3996 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3997 help='"best", "aac", "vorbis" or "mp3"; best by default')
3998 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3999 help='ffmpeg audio bitrate specification, 128k by default')
4000 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4001 help='keeps the video file on disk after the post-processing; the video is erased by default')
# Attach the groups to the parser (this order is the --help order).
4004 parser.add_option_group(general)
4005 parser.add_option_group(selection)
4006 parser.add_option_group(filesystem)
4007 parser.add_option_group(verbosity)
4008 parser.add_option_group(video_format)
4009 parser.add_option_group(authentication)
4010 parser.add_option_group(postproc)
4012 opts, args = parser.parse_args()
4014 return parser, opts, args
4016 def gen_extractors():
4017 """ Return a list of an instance of every supported extractor.
4018 The order does matter; the first extractor matched is the one handling the URL.
4020 youtube_ie = YoutubeIE()
4021 google_ie = GoogleIE()
4022 yahoo_ie = YahooIE()
4024 YoutubePlaylistIE(youtube_ie),
4025 YoutubeUserIE(youtube_ie),
4026 YoutubeSearchIE(youtube_ie),
4028 MetacafeIE(youtube_ie),
4031 GoogleSearchIE(google_ie),
4034 YahooSearchIE(yahoo_ie),
4051 parser, opts, args = parseOpts()
4053 # Open appropriate CookieJar
4054 if opts.cookiefile is None:
4055 jar = cookielib.CookieJar()
4058 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4059 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4061 except (IOError, OSError), err:
4062 sys.exit(u'ERROR: unable to open cookie file')
4065 if opts.dump_user_agent:
4066 print std_headers['User-Agent']
4069 # Batch file verification
4071 if opts.batchfile is not None:
4073 if opts.batchfile == '-':
4076 batchfd = open(opts.batchfile, 'r')
4077 batchurls = batchfd.readlines()
4078 batchurls = [x.strip() for x in batchurls]
4079 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4081 sys.exit(u'ERROR: batch file could not be read')
4082 all_urls = batchurls + args
4084 # General configuration
4085 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4086 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4087 urllib2.install_opener(opener)
4088 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4090 extractors = gen_extractors()
4092 if opts.list_extractors:
4093 for ie in extractors:
4095 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4096 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4097 for mu in matchedUrls:
4101 # Conflicting, missing and erroneous options
4102 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4103 parser.error(u'using .netrc conflicts with giving username/password')
4104 if opts.password is not None and opts.username is None:
4105 parser.error(u'account username missing')
4106 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4107 parser.error(u'using output template conflicts with using title, literal title or auto number')
4108 if opts.usetitle and opts.useliteral:
4109 parser.error(u'using title conflicts with using literal title')
4110 if opts.username is not None and opts.password is None:
4111 opts.password = getpass.getpass(u'Type account password and press return:')
4112 if opts.ratelimit is not None:
4113 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4114 if numeric_limit is None:
4115 parser.error(u'invalid rate limit specified')
4116 opts.ratelimit = numeric_limit
4117 if opts.retries is not None:
4119 opts.retries = long(opts.retries)
4120 except (TypeError, ValueError), err:
4121 parser.error(u'invalid retry count specified')
4123 opts.playliststart = int(opts.playliststart)
4124 if opts.playliststart <= 0:
4125 raise ValueError(u'Playlist start must be positive')
4126 except (TypeError, ValueError), err:
4127 parser.error(u'invalid playlist start number specified')
4129 opts.playlistend = int(opts.playlistend)
4130 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4131 raise ValueError(u'Playlist end must be greater than playlist start')
4132 except (TypeError, ValueError), err:
4133 parser.error(u'invalid playlist end number specified')
4134 if opts.extractaudio:
4135 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
4136 parser.error(u'invalid audio format specified')
4139 fd = FileDownloader({
4140 'usenetrc': opts.usenetrc,
4141 'username': opts.username,
4142 'password': opts.password,
4143 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4144 'forceurl': opts.geturl,
4145 'forcetitle': opts.gettitle,
4146 'forcethumbnail': opts.getthumbnail,
4147 'forcedescription': opts.getdescription,
4148 'forcefilename': opts.getfilename,
4149 'forceformat': opts.getformat,
4150 'simulate': opts.simulate,
4151 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4152 'format': opts.format,
4153 'format_limit': opts.format_limit,
4154 'listformats': opts.listformats,
4155 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4156 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4157 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4158 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4159 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4160 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4161 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4162 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4163 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4164 or u'%(id)s.%(ext)s'),
4165 'ignoreerrors': opts.ignoreerrors,
4166 'ratelimit': opts.ratelimit,
4167 'nooverwrites': opts.nooverwrites,
4168 'retries': opts.retries,
4169 'continuedl': opts.continue_dl,
4170 'noprogress': opts.noprogress,
4171 'playliststart': opts.playliststart,
4172 'playlistend': opts.playlistend,
4173 'logtostderr': opts.outtmpl == '-',
4174 'consoletitle': opts.consoletitle,
4175 'nopart': opts.nopart,
4176 'updatetime': opts.updatetime,
4177 'writedescription': opts.writedescription,
4178 'writeinfojson': opts.writeinfojson,
4179 'matchtitle': opts.matchtitle,
4180 'rejecttitle': opts.rejecttitle,
4182 for extractor in extractors:
4183 fd.add_info_extractor(extractor)
4186 if opts.extractaudio:
4187 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4190 if opts.update_self:
4191 updateSelf(fd, sys.argv[0])
4194 if len(all_urls) < 1:
4195 if not opts.update_self:
4196 parser.error(u'you must provide at least one URL')
4199 retcode = fd.download(all_urls)
4201 # Dump cookie jar if requested
4202 if opts.cookiefile is not None:
4205 except (IOError, OSError), err:
4206 sys.exit(u'ERROR: unable to save cookie jar')
4213 except DownloadError:
4215 except SameFileError:
4216 sys.exit(u'ERROR: fixed output name but more than one file to download')
4217 except KeyboardInterrupt:
4218 sys.exit(u'\nERROR: Interrupted by user')
4220 if __name__ == '__main__':
4223 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: