# Spawning/spawning/util/log_parser.py

import time
from datetime import datetime, timedelta
import sys
import optparse
import re
# Names exported by "from log_parser import *".
__all__ = ['parse_line', 'parse_lines', 'parse_casual_time',
           'group_parsed_lines', 'select_timerange']
# English month abbreviations as they appear in log timestamps, mapped to
# month numbers.  Kept as an explicit table so the fast slice-based date
# parsing below stays locale-independent.
month_names = dict(zip(
    ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
    range(1, 13)))


def parse_line(line):
    """Parse one Spawning access-log line into a dict of fields.

    Returns None for any line that does not look like an access-log
    record.  A successful parse yields these keys:

    * client_ip : The remote IP address.
    * date : datetime object representing when the request completed
    * method : HTTP method
    * path : url path
    * version : HTTP version
    * status_code : HTTP status code
    * size : length of the body
    * duration : time in seconds to complete the request
    """
    # A whitespace split is measurably faster here than a regexp.
    fields = line.split()
    if len(fields) != 11:
        return None
    (ip, dash_a, dash_b, datepart, timepart,
     method, path, version, status, size, duration) = fields
    try:
        if dash_a != '-' or dash_b != '-':
            return None
        if datepart[0] != '[' or timepart[-1] != ']':
            return None
        # Poking at fixed string slices is much faster than strptime,
        # at the cost of only understanding English month names.
        when = datetime(
            int(datepart[8:12]),          # year
            month_names[datepart[4:7]],   # month
            int(datepart[1:3]),           # day
            int(timepart[0:2]),           # hour
            int(timepart[3:5]),           # minute
            int(timepart[6:8]))           # second
        if method[0] != '"' or version[-1] != '"':
            return None
        return {
            'client_ip': ip,
            'date': when,
            'method': method[1:],
            'path': path,
            'version': version[:-1],
            'status_code': int(status),
            'size': int(size),
            'duration': float(duration),
        }
    except (IndexError, ValueError):
        return None
def parse_lines(fd):
    """Generator: yield the parsed form of every parseable log line in
    the iterable file-like object *fd*, silently skipping the rest.
    """
    for raw in fd:
        fields = parse_line(raw)
        if fields is None:
            continue
        yield fields
# Multipliers that convert a unit name into seconds.
time_intervals = {"sec": 1, "min": 60, "hr": 3600, "day": 86400,
                  "second": 1, "minute": 60, "hour": 3600,
                  "s": 1, "m": 60, "h": 3600, "d": 86400}
# Accept plural unit names too.  Iterate over a snapshot of the items:
# mutating a dict while iterating its live .items() view raises
# RuntimeError on Python 3.
for k, v in list(time_intervals.items()):
    time_intervals[k + "s"] = v


def parse_casual_time(timestr, relative_to):
    """Lenient relative time parser. Returns a datetime object if it can.

    Accepts absolute times in ctime() style ("Tue Jun 16 20:18:03 2009"),
    the literal "now", and human-friendly relative times such as
    "-1 hour", "-30s", "15min", "2d".  Any relative time is interpreted
    as a delta applied to the relative_to argument, which should be a
    datetime.  Returns None when the string cannot be interpreted.
    """
    timestr = timestr.lower()
    # Absolute time first: strptime with no format uses the ctime()
    # layout "%a %b %d %H:%M:%S %Y" (its matching is case-insensitive).
    try:
        return datetime(*(time.strptime(timestr)[0:6]))
    except ValueError:
        pass
    if timestr == "now":
        return datetime.now()
    # Relative: a signed number followed by an optional unit word,
    # e.g. "-1 hour", "-30s".
    m = re.match(r'([-0-9.]+)\s*(\w+)?', timestr)
    if m is None:
        return None
    # Unknown or missing units fall back to seconds (multiplier 1).
    intervalsz = time_intervals.get(m.group(2), 1)
    try:
        relseconds = float(m.group(1)) * intervalsz
    except ValueError:
        # Strings such as "-" or "." match the character class above
        # but are not valid numbers; treat them as unparseable.
        return None
    return relative_to + timedelta(seconds=relseconds)
def group_parsed_lines(lines, field):
    """Aggregate parsed log lines by a field.

    Counts the log lines in each group and their average duration.
    Returns a dict keyed by the unique field values; each value is a
    dict with the key itself, 'count', and 'avg_duration'.
    """
    buckets = {}
    for entry in lines:
        value = entry[field]
        if value not in buckets:
            buckets[value] = {field: value, 'count': 0, 'total_duration': 0.0}
        bucket = buckets[value]
        bucket['count'] += 1
        bucket['total_duration'] += entry['duration']
    # Convert the accumulated totals into averages.
    for bucket in buckets.values():
        bucket['avg_duration'] = bucket.pop('total_duration') / bucket['count']
    return buckets
def select_timerange(lines, earliest=None, latest=None):
    """Generator: yield the parsed log lines whose dates fall between
    earliest and latest (inclusive).  A bound that is None is ignored.
    """
    for entry in lines:
        when = entry['date']
        too_early = earliest and when < earliest
        too_late = latest and when > latest
        if not (too_early or too_late):
            yield entry
if __name__ == "__main__":
    parser = optparse.OptionParser()
    parser.add_option('--earliest', dest='earliest', default=None,
        help='Earliest date to count, either as a full date or a relative '
             'time such as "-1 hour". Relative to --latest, so you '
             'generally want to specify a negative relative.')
    parser.add_option('--latest', dest='latest', default=None,
        help='Latest date to count, either as a full date or a relative '
             'time such as "-30s". Relative to now.')
    parser.add_option('--group-by', dest='group_by', default='path',
        help='Compute counts and aggregates for log lines grouped by this '
             'attribute. Good values include "status_code", "method", and '
             '"path" (the default).')
    opts, args = parser.parse_args()

    # Resolve --latest first because --earliest is interpreted
    # relative to it.
    if opts.latest:
        opts.latest = parse_casual_time(opts.latest, datetime.now())
    if opts.earliest:
        opts.earliest = parse_casual_time(opts.earliest,
                                          opts.latest or datetime.now())
    if opts.earliest or opts.latest:
        print("Including dates between %s and %s" % (
            opts.earliest or "the beginning of time",
            opts.latest or "now"))

    parsed_lines = parse_lines(sys.stdin)
    grouped = group_parsed_lines(
        select_timerange(parsed_lines, opts.earliest, opts.latest),
        opts.group_by)
    # sorted() accepts both a Python 2 list and a Python 3 dict view;
    # the old grouped.values().sort() breaks on Python 3 where values()
    # returns a view with no .sort() method.
    flat = sorted(grouped.values(), key=lambda x: x['count'], reverse=True)
    print("Count\tAvg Dur\t%s" % opts.group_by)
    for summary in flat:
        print("%d\t%.4f\t%s" % (summary['count'],
                                summary['avg_duration'],
                                summary[opts.group_by]))