Adds status health page to the Spawning controller. This status page

listens on a different port and provides information about the state
of the child processes in HTML and JSON formats.
This commit is contained in:
Ryan Williams 2010-10-01 13:27:49 -07:00 committed by R. Tyler Croy
parent d31de6f961
commit 4d04e6578e
4 changed files with 489 additions and 16 deletions

View File

@ -16,7 +16,7 @@ If your wsgi applications perform a certain subset of blocking calls which have
Graceful Code Reloading
=======================
By default, Spawning watches all Python files that are imported into sys.modules for changes and performs a graceful reload on change. Old processes are told to stop accepting requests and finish any outstanding requests they are servicing, and shutdown. Meanwhile, new processes are started and begin accepting requests and servicing them with the new code. At no point will users of your site see "connection refused" errors because the server is continuously listening during reload.
Spawning can watch all Python files that are imported into sys.modules for changes and perform a graceful reload on change. To enable this behavior, specify --reload=dev on the command line. Old processes are told to stop accepting requests, finish any outstanding requests they are servicing, and shut down. Meanwhile, new processes are started and begin accepting requests and servicing them with the new code. At no point will users of your site see "connection refused" errors because the server is continuously listening during reload.
Running spawning
================
@ -86,3 +86,23 @@ Additional Useful Arguments
this time limit has expired a SIGHUP will be sent to
spawning_controller, causing it to restart all of the
child processes.
--status-port=PORT, --status-host=HOST
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If given, starts up a small web service to give
health status reports on the Spawning server. The
service listens on two urls,
* http://status_host:status_port/status
* http://status_host:status_port/status.json
The first is an HTML page that displays the status
of the server in a human-pleasing manner. The .json
url is a JSON formatting of the same data.
The status web service is only started if the
--status-port option is supplied and differs from
the service port. --status-host is useful if
monitoring happens on a different IP address than
web application requests.

View File

@ -27,6 +27,7 @@
import eventlet
import eventlet.event
import eventlet.greenio
import eventlet.greenthread
import eventlet.hubs
import eventlet.wsgi
@ -131,10 +132,49 @@ class SystemInfo(URLInterceptor):
class ExitChild(Exception):
pass
class ChildStatus(object):
    """Reports this child process's status back to the controller.

    The report is a JSON document sent to the controller's status service
    on the loopback interface.  ``server`` starts out as None and is
    expected to be assigned the running server object once it exists;
    until then the child reports itself as still starting.
    """

    def __init__(self, controller_port):
        self.server = None
        self.controller_url = "http://127.0.0.1:%s/" % controller_port

    def send_status_to_controller(self):
        """Send the current status to the controller, best-effort.

        Ordinary failures are swallowed on purpose; only interpreter and
        greenlet shutdown exceptions are allowed to propagate.
        """
        try:
            report = {'pid': os.getpid()}
            if not self.server:
                report['error'] = 'Starting...'
            else:
                report['concurrent_requests'] = \
                    self.server.outstanding_requests
            payload = json.dumps(report)
            import urllib2
            # passing a body makes urlopen issue a POST
            urllib2.urlopen(self.controller_url, payload)
        except (KeyboardInterrupt, SystemExit,
                eventlet.greenthread.greenlet.GreenletExit):
            raise
        except Exception:
            # we really don't want exceptions here to stop read_pipe_and_die
            pass
# Process-wide status reporter; stays None unless a status port is configured.
_g_status = None


def init_statusobj(status_port):
    """Create the process-wide ChildStatus when a status port is given.

    A falsy ``status_port`` leaves the reporter untouched.
    """
    global _g_status
    if not status_port:
        return
    _g_status = ChildStatus(status_port)


def get_statusobj():
    """Return the process-wide ChildStatus, or None if there is none."""
    return _g_status
def read_pipe_and_die(the_pipe, server_coro):
try:
eventlet.hubs.trampoline(the_pipe, read=True)
os.read(the_pipe, 1)
while True:
eventlet.hubs.trampoline(the_pipe, read=True)
c = os.read(the_pipe, 1)
# this is how the controller tells the child to send a status update
if c == 's' and get_statusobj():
get_statusobj().send_status_to_controller()
continue
else:
break
except socket.error:
pass
try:
@ -192,6 +232,13 @@ def serve_from_child(sock, config, controller_pid):
max_age = int(config.get('max_age'))
server_event = eventlet.event.Event()
# the status object wants to have a reference to the server object
if config.get('status_port'):
def send_server_to_status(server_event):
server = server_event.wait()
get_statusobj().server = server
eventlet.spawn(send_server_to_status, server_event)
http_version = config.get('no_keepalive') and 'HTTP/1.0' or 'HTTP/1.1'
try:
wsgi_args = (sock, wsgi_application)
@ -217,7 +264,7 @@ def serve_from_child(sock, config, controller_pid):
## Once we get here, we just need to handle outstanding sockets, not
## accept any new sockets, so we should close the server socket.
sock.close()
server = server_event.wait()
last_outstanding = None
@ -238,9 +285,11 @@ def serve_from_child(sock, config, controller_pid):
print "(%s) *** Child exiting: all requests completed at %s" % (
os.getpid(), time.asctime())
def child_sighup(*args, **kwargs):
    """Signal handler: terminate this process with exit status 0.

    Accepts and ignores whatever arguments the signal machinery passes.

    :raises SystemExit: always, with code 0.
    """
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the ``site`` module for interactive use, is missing when
    # Python runs with -S, and closes stdin as a side effect.
    raise SystemExit(0)
def main():
parser = optparse.OptionParser()
parser.add_option("-r", "--reload",
@ -260,6 +309,9 @@ def main():
config = spawning.util.named(factory_qual)(json.loads(factory_args))
setproctitle("spawn: child (%s)" % ", ".join(config.get("args")))
## Set up status reporter, if requested
init_statusobj(config.get('status_port'))
## Set up the reloader
if config.get('reload'):

View File

@ -23,9 +23,9 @@
from __future__ import with_statement
import commands
import datetime
import errno
import logging
import os
import optparse
import pprint
import signal
@ -42,6 +42,7 @@ except ImportError:
import eventlet
import eventlet.backdoor
from eventlet.green import os
import spawning
import spawning.util
@ -81,13 +82,20 @@ def environ():
env['PYTHONPATH'] = ':'.join(new_path)
return env
class Child(object):
    """Bookkeeping record the controller keeps for one child process."""

    def __init__(self, pid, kill_pipe):
        # the creation time doubles as the child's start time for uptime
        self.forked_at = datetime.datetime.now()
        # children start out active; the flag is cleared elsewhere
        self.active = True
        self.kill_pipe = kill_pipe
        self.pid = pid
class Controller(object):
sock = None
factory = None
args = None
config = None
child_pipes = None
children = None
keep_going = True
panic = False
log = None
@ -99,13 +107,14 @@ class Controller(object):
self.factory = factory
self.config = spawning.util.named(factory)(args)
self.args = args
self.child_pipes = {}
self.children = {}
self.log = logging.getLogger('Spawning')
if not kwargs.get('log_handler'):
self.log.addHandler(logging.StreamHandler())
self.log.setLevel(logging.DEBUG)
self.controller_pid = os.getpid()
self.num_processes = int(self.config.get('num_processes', 0))
self.started_at = datetime.datetime.now()
def spawn_children(self, number=1):
parent_pid = os.getpid()
@ -139,19 +148,22 @@ class Controller(object):
# controller process
os.close(child_side)
self.child_pipes[child_pid] = parent_side
self.children[child_pid] = Child(child_pid, parent_side)
def children_count(self):
    """Return the number of entries in ``self.children``."""
    tracked = self.children
    return len(tracked)
def runloop(self):
while self.keep_going:
eventlet.sleep(0.1)
## Only start the number of children we need
number = self.num_processes - len(self.child_pipes)
number = self.num_processes - self.children_count()
if number > 0:
self.log.debug('Should start %d new children', number)
self.spawn_children(number=number)
continue
if not self.child_pipes:
if not self.children:
## If we don't yet have children, let's loop
continue
@ -162,9 +174,10 @@ class Controller(object):
if e.errno != errno.EINTR:
raise
if pid and self.child_pipes.get(pid):
if pid and self.children.get(pid):
try:
os.close(self.child_pipes.pop(pid))
child = self.children.pop(pid)
os.close(child.kill_pipe)
except (IOError, OSError):
pass
@ -181,10 +194,11 @@ class Controller(object):
self.runloop()
def kill_children(self):
for pid, pipe in self.child_pipes.items():
for pid, child in self.children.items():
try:
os.write(pipe, ' ')
# all maintenance of child_pipes happens in runloop()
os.write(child.kill_pipe, 'k')
child.active = False
# all maintenance of children's membership happens in runloop()
# as children die and os.wait() gets results
except OSError, e:
if e.errno != errno.EPIPE:
@ -210,6 +224,12 @@ class Controller(object):
signal.signal(signal.SIGHUP, self.handle_sighup)
signal.signal(signal.SIGUSR1, self.handle_deadlychild)
if self.config.get('status_port'):
from spawning.util import status
eventlet.spawn(status.Server, self,
self.config['status_host'], self.config['status_port'])
try:
self.runloop()
except KeyboardInterrupt:
@ -331,6 +351,10 @@ def main():
help='Disable HTTP/1.1 KeepAlive')
parser.add_option('-z', '--z-restart-args', dest='restart_args',
help='For internal use only')
parser.add_option('--status-port', dest='status_port', type='int', default=0,
help='If given, hosts a server status page at that port. Two pages are served: a human-readable HTML version at http://host:status_port/status, and a machine-readable version at http://host:status_port/status.json')
parser.add_option('--status-host', dest='status_host', type='string', default='',
help='If given, binds the server status page to the specified local ip address. Defaults to the same value as --host. If --status-port is not supplied, the status page will not be activated.')
options, positional_args = parser.parse_args()
@ -446,6 +470,11 @@ def main():
# If you tell me to watch something, I'm going to reload then
if options.watch:
options.reload = True
if options.status_port == options.port:
options.status_port = None
sys.stderr.write('**> Status port cannot be the same as the service port, disabling status.\n')
factory_args = {
'verbose': options.verbose,
@ -464,6 +493,8 @@ def main():
'max_age' : options.max_age,
'argv_str': " ".join(sys.argv[1:]),
'args': positional_args,
'status_port': options.status_port,
'status_host': options.status_host or options.host
}
start_controller(sock, factory, factory_args)

370
src/spawning/util/status.py Normal file
View File

@ -0,0 +1,370 @@
import datetime
try:
import json
except ImportError:
import simplejson as json
import eventlet
from eventlet import event
from eventlet import wsgi
from eventlet.green import os
class Server(object):
    """Status web service for the Spawning controller.

    Serves the health of the controller and its children as HTML at
    ``/status`` and as JSON at ``/status.json``.  Child data is gathered on
    demand: each child is sent an ``'s'`` byte on its pipe and is expected
    to POST a JSON status document back to this same service.

    Constructing the object binds the listening socket and runs the WSGI
    server, so the constructor does not return while the service is being
    served; run it in its own greenthread.
    """

    def __init__(self, controller, host, port):
        self.controller = controller
        self.host = host
        self.port = port
        # greenthread of the collection currently in flight, if any
        self.status_waiter = None
        # pid -> Event that the child's POSTed status is delivered through
        self.child_events = {}
        listener = eventlet.listen((host, port))
        wsgi.server(listener, self.application)

    def get_status_data(self):
        """Return the status dict, sharing one collection between callers."""
        # using a waiter because we only want one child collection ping
        # happening at a time; if there are multiple concurrent status
        # requests, they all simply share the same set of data results
        if self.status_waiter is None:
            self.status_waiter = eventlet.spawn(self._collect_status_data)
        return self.status_waiter.wait()

    def _collect_status_data(self):
        """Build the status dict for the controller and all its children."""
        try:
            now = datetime.datetime.now()
            children = list(self.controller.children.values())
            status_data = {
                'active_children_count': len(
                    [c for c in children if c.active]),
                'killed_children_count': len(
                    [c for c in children if not c.active]),
                'configured_children_count': self.controller.num_processes,
                'now': now.ctime(),
                'pid': os.getpid(),
                'uptime': format_timedelta(now - self.controller.started_at),
                'started_at': self.controller.started_at.ctime(),
                'config': self.controller.config}

            # fire up a few greenthreads to wait on children's responses
            pile = eventlet.GreenPile()
            for child in children:
                pile.spawn(self.collect_child_status, child)
            # each greenthread yields a (pid, results) pair
            status_data['children'] = dict(pile)

            # total concurrent connections
            status_data['concurrent_requests'] = sum([
                child.get('concurrent_requests', 0)
                for child in status_data['children'].values()])
        finally:
            # wipe out the waiter so that subsequent requests create new ones
            self.status_waiter = None
        return status_data

    def collect_child_status(self, child):
        """Ask one child for its status and wait up to a second for it.

        :returns: ``(pid, results)``; ``results`` holds an ``'error'`` entry
            when the pipe write fails or the child does not answer in time.
        """
        waiter = event.Event()
        self.child_events[child.pid] = waiter
        timeout = None
        # note: this may block the controller a little bit, we don't care
        # so much because it's not doing any real work
        try:
            # tell the child to POST its status to us, we handle it in the
            # wsgi application below
            os.write(child.kill_pipe, 's')
            timeout = eventlet.Timeout(1)
            results = waiter.wait()
        except (OSError, IOError) as e:
            results = {'error': "%s %s" % (type(e), e)}
        except eventlet.Timeout:
            results = {'error': 'Timed out'}
        finally:
            # never leave a pending timer or a stale event behind
            if timeout is not None:
                timeout.cancel()
            self.child_events.pop(child.pid, None)
        results.update({
            'pid': child.pid,
            'active': child.active,
            'uptime': format_timedelta(
                datetime.datetime.now() - child.forked_at),
            'forked_at': child.forked_at.ctime()})
        return child.pid, results

    def application(self, environ, start_response):
        """WSGI entry point: status pages on GET, child reports on POST."""
        method = environ['REQUEST_METHOD']
        if method == 'GET':
            path = environ['PATH_INFO']
            # Collect status only for the URLs we actually serve; any other
            # GET (e.g. a browser's /favicon.ico) must not ping every child
            # just to be answered with a 404.
            if path == '/status':
                status_data = self.get_status_data()
                start_response('200 OK', [('content-type', 'text/html')])
                return [fill_template(status_data)]
            elif path == '/status.json':
                status_data = self.get_status_data()
                start_response('200 OK',
                               [('content-type', 'application/json')])
                return [json.dumps(status_data, indent=2)]
        elif method == 'POST':
            # it's a client posting its stats to us
            body = environ['wsgi.input'].read()
            child_status = json.loads(body)
            pid = child_status['pid']
            if pid in self.child_events:
                self.child_events[pid].send(child_status)
                start_response('200 OK',
                               [('content-type', 'application/json')])
            else:
                start_response('500 Internal Server Error',
                               [('content-type', 'text/plain')])
                print("Don't know about child pid %s" % pid)
            return [""]
        # fallthrough case
        start_response('404 Not Found', [('content-type', 'text/plain')])
        return [""]
def format_timedelta(t):
    """Format a timedelta compactly, based on how HAProxy's status page
    shows dates.  Only the most significant units are shown:

        10d 14h
        3h 20m
        1h 0m
        12m
        15s

    :param t: a ``datetime.timedelta``.
    :returns: the formatted string.
    """
    seconds = t.seconds
    if t.days > 0:
        return "%sd %sh" % (t.days, seconds // 3600)
    # Use >= so exact boundaries roll over to the larger unit: one hour is
    # "1h 0m" (as in the examples above) rather than "60m", and one minute
    # is "1m" rather than "60s".
    if seconds >= 3600:
        hours, remainder = divmod(seconds, 3600)
        return "%sh %sm" % (hours, remainder // 60)
    if seconds >= 60:
        return "%sm" % (seconds // 60)
    return "%ss" % seconds
class Tag(object):
    """Yeah, there's a templating DSL in this status module. Deal with it.

    A Tag is one HTML element: a name, keyword attributes and child nodes.
    ``str(tag)`` renders it.  Children that are Tags are rendered as nested
    markup; every other child, and every attribute value, is converted with
    ``str`` and HTML-escaped, so data such as ``<type 'OSError'>`` shows up
    as text instead of being swallowed (or interpreted) as markup.

    The ``cls`` attribute is emitted as ``class``; a list or tuple value is
    joined with spaces.
    """

    def __init__(self, name, *children, **attrs):
        self.name = name
        self.attrs = attrs
        self.children = list(children)

    @staticmethod
    def _escape(value):
        """Return ``str(value)`` with HTML-significant characters escaped."""
        text = str(value)
        # '&' has to go first so the entities added afterwards survive
        for char, entity in (('&', '&amp;'), ('<', '&lt;'),
                             ('>', '&gt;'), ('"', '&quot;')):
            text = text.replace(char, entity)
        return text

    def __str__(self):
        attr_parts = []
        for name, val in self.attrs.items():
            if name == 'cls':
                name = "class"
            if isinstance(val, (list, tuple)):
                val = " ".join(val)
            attr_parts.append('%s="%s"' % (name, self._escape(val)))
        if attr_parts:
            attrstr = " " + " ".join(attr_parts) + " "
        else:
            attrstr = ""

        child_parts = []
        for child in self.children:
            if isinstance(child, Tag):
                # nested elements are already markup; do not escape them
                child_parts.append(str(child))
            else:
                child_parts.append(self._escape(child))
        if child_parts:
            childstr = "\n" + "\n".join(child_parts) + "\n"
        else:
            childstr = ""
        return "<%s%s>%s</%s>" % (self.name, attrstr, childstr, self.name)
def make_tag(name):
    """Return a factory that builds ``Tag`` elements called ``name``."""
    def build(*children, **attrs):
        return Tag(name, *children, **attrs)
    return build


# element factories used when assembling the status page
p, div, table, tr, th, td, h2, span = map(
    make_tag, ('p', 'div', 'table', 'tr', 'th', 'td', 'h2', 'span'))
def fill_template(status_data):
    """Render the status dict as the complete HTML status page.

    :param status_data: dict of status values; this function reads 'pid',
        'uptime', 'config', 'active_children_count',
        'configured_children_count', 'killed_children_count',
        'concurrent_requests' and 'children', and every key is also made
        available to HTML_SHELL as a ``%(key)s`` placeholder.
    :returns: the page, as a string.
    """
    # controller status
    controller_rows = [
        tr(th("PID:", title="Controller Process ID"),
           td(status_data['pid'])),
        tr(th("Uptime:", title="Time since launch"),
           td(status_data['uptime'])),
    ]
    config = status_data['config']
    listen_addr = "%s:%s" % (config['host'] or "all", config['port'])
    controller_rows.append(
        tr(th("Host:", title="Host and port server is listening on, all means all interfaces."),
           td(listen_addr)))
    controller_rows.append(
        tr(th("Threads:", title="Threads per child"),
           td(config['threadpool_workers'])))
    cont_div = div(table(*controller_rows, id='controller'))

    # children headers and summaries
    active_count = status_data['active_children_count']
    configured_count = status_data['configured_children_count']
    count_td = td(active_count, "/", configured_count)
    if active_count < configured_count:
        # fewer children accepting requests than configured: highlight it
        count_td.attrs['cls'] = "alert"
        count_td.children.append(
            span("(", status_data['killed_children_count'], ")"))
    children_table = table(
        tr(
            th('PID', title="Process ID"),
            th('Active', title="Accepting New Requests"),
            th('Uptime', title="Uptime"),
            th('Concurrent', title="Concurrent Requests")),
        tr(
            td("Total"),
            count_td,
            td(),  # no way to "total" uptime
            td(status_data['concurrent_requests'])),
        id="children")
    child_div = div(h2("Child Processes"), children_table)

    # children themselves, one row per pid in ascending order
    active_flags = {True: 'Y', False: 'N'}
    children = status_data['children']
    for index, pid in enumerate(sorted(children.keys())):
        child = children[pid]
        classes = ['child']
        if index % 2 == 0:
            # zebra striping starts with the first row
            classes.append('odd')
        cells = [td(pid), td(active_flags[child['active']])]
        if not child['active']:
            classes.append('dying')
        if child.get('error'):
            # the error text takes the place of the two data columns
            classes.append('error')
            cells.append(td(child['error'], colspan=2))
        else:
            cells.append(td(child['uptime']))
            cells.append(td(child['concurrent_requests']))
        children_table.children.append(tr(*cells, cls=classes))

    # config dump
    config_rows = [tr(th(key), td(config[key]))
                   for key in sorted(config.keys())]
    config_div = div(h2("Configuration"), table(*config_rows), id='config')

    page_values = dict(cont_div=cont_div, child_div=child_div,
                       config_div=config_div)
    page_values.update(status_data)
    return HTML_SHELL % page_values
# Page skeleton for the HTML status page.  fill_template() fills it in with
# ``HTML_SHELL % mapping``, so:
#   * %(now)s, %(cont_div)s, %(child_div)s and %(config_div)s are the
#     placeholders substituted from that mapping;
#   * literal percent signs (the CSS font sizes) are written as %%.
# The inline script implements the optional auto-refresh control and loads
# jQuery from Google's CDN.
HTML_SHELL = """
<!DOCTYPE html>
<html><head>
<title>Spawning Status</title>
<style type="text/css">
html, p, div, table, h1, h2, input, form {
margin: 0;
padding: 0;
border: 0;
outline: 0;
font-size: 12px;
font-family: Helvetica, Arial, sans-serif;
vertical-align: baseline;
}
body {
line-height: 1.2;
color: black;
background: white;
margin: 3em;
}
table {
border-collapse: separate;
border-spacing: 0;
}
th, td {
text-align: center;
padding: .1em;
padding-right: .4em;
}
#controller td, #controller th {
text-align: left;
}
#config td, #config th {
text-align: left;
}
#children {
clear: both;
}
#options {
float: right;
border: 1px solid #dfdfdf;
padding:.5em;
}
h1,h2 {
margin: .5em;
margin-left: 0em;
font-size: 130%%;
}
h2 {
font-size: 115%%;
}
tr.odd {
background: #dfdfdf;
}
input {
border: 1px solid grey;
}
#refresh form {
display: inline;
}
tr.child.dying {
background: #ffeecc;
color: #884400;
}
tr.child.error {
background: #ff6666;
}
.alert {
background: #ff4444;
}
/* Cut out the fat for mobile devices */
@media screen and (max-width: 400px) {
body {
margin-left: .2em;
margin-right: .2em;
}
#options {
float: none;
}
}
</style>
</head><body>
<h1>Spawning Status</h1>
<div id="options">
<p>%(now)s</p>
<div id="refresh">
<a href="">Refresh</a> (<form>
<input type="checkbox" /> every
<input type="text" value="5" size=2 />s
</form>)
</div>
<a href="status.json">JSON</a>
</div>
%(cont_div)s
%(child_div)s
%(config_div)s
<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>
<script type="text/javascript">
$(document).ready(function() {
var timer;
var arrangeTimeout = function () {
clearTimeout(timer);
if($('#refresh input[type=checkbox]').attr('checked')) {
timer = setTimeout(
function() {window.location.reload();},
$('#refresh input[type=text]').val() * 1000);
}
if($(this).is('form')) {
return false;
}
};
$('#refresh input[type=checkbox]').click(arrangeTimeout);
$('#refresh form').submit(arrangeTimeout).submit();
});
</script>
</body></html>
"""