#!/bin/bash
#
# cs_show_error_patterns
#
# (c) 2011-2014 SUSE Linux GmbH, Germany. Author: L.Pinne.
# GNU Public License. No warranty.
#
# Version: 0.2 2014-03-27
#

# Path this script was started as; used for the --version output.
EXE="$0"

# Optional site configuration. It is sourced before any default is applied,
# so it may pre-set SYSTEM_LOG, CLUSTER_LOG, ZIPPED_LOG and the pattern lists.
CFG="/etc/ClusterTools2/cs_show_error_patterns"
test -s "$CFG" && source "$CFG"

# Scratch file that receives a dmesg snapshot so the kernel ring buffer can be
# searched like any other log. mktemp creates it atomically with an
# unpredictable name; a guessable name in /tmp would let another local user
# pre-plant a symlink that the later "dmesg >file" redirect follows.
DMSG=$(mktemp /tmp/dmesg.XXXXXX) || {
	echo "$(basename -- "$EXE"): cannot create temporary file" >&2
	exit 1
}
# Remove the scratch file on every exit path (--help, --version, errors).
trap 'rm -f -- "$DMSG"' EXIT

# Plain-text system logs and status files, blank separated.
test -z "${SYSTEM_LOG}" &&
	SYSTEM_LOG="/var/log/messages /var/log/boot.msg /var/log/boot.omsg /var/spool/mail/root /proc/mdstat /proc/net/bonding/bond0"
# The dmesg snapshot is always searched, also when SYSTEM_LOG was configured.
SYSTEM_LOG="${SYSTEM_LOG} ${DMSG}"

# Cluster log: unless configured, take the "logfile:" value from an
# un-commented line of corosync.conf, if that file is readable.
if test -z "${CLUSTER_LOG}" && test -r /etc/corosync/corosync.conf; then
	CLUSTER_LOG=$(grep '^[[:space:]]*logfile:.*/' /etc/corosync/corosync.conf |
		tr -d ' \t' | cut -d":" -f2)
fi

# Compressed (rotated) logs; the entries are glob patterns that are expanded
# when the --zip option is processed.
test -z "${ZIPPED_LOG}" &&
	ZIPPED_LOG="/var/log/messages*bz2"
# Add the rotated cluster logs. Nothing is added when no cluster log is known;
# otherwise a bare "*bz2" would match unrelated files in the current directory.
for c in ${CLUSTER_LOG}; do
	ZIPPED_LOG="${ZIPPED_LOG} ${c}*bz2"
done

# Default system error patterns, used only when ERROR_PATTERN is not already
# set (e.g. by the configuration file).
# One grep basic regular expression per line. The list is split on whitespace
# by run_grep, so "." is used wherever the log message contains a blank.
# The list is single-quoted on purpose: left unquoted, the shell strips the
# backslash from "U_\|_U", and grep then searches for a literal "U_|_U"
# instead of the alternation "U_" or "_U".
if test -z "${ERROR_PATTERN}"; then
	ERROR_PATTERN='
taint.flag
no.device.*found
be2net.*Error
bfa.*err
lpfc.*err
qla.*err
qla.*failed
qlge.*error
NMI.received
NMI.*error
soft.lockup.detected
kernel.*task.*abort
target.failure
reservation.conflict
SCSI.*reset
LUN.larger
I/O.error
Dropped.frame
disabling.*barrier
doesn.*support.*DPO.*FUA
Unable.*SMART.enabled
BBU.disabled
multipathd.*timed.out
multipathd.*mpath.*unable
multipathd.*reinstated
multi.*path.*down
[Ff]ail.*/dev/md
U_\|_U
found.0.matching.devices
/dev/md.*failed
duplicate.VG
duplicate.PV
not.found
dlm.*link.*down
ocfs2.*has.evicted
ocfs2.*ERR
ocfs2.*not.unmounted.cleanly
OCFS2.*kernel.interface.loaded
o2dlm_eviction_cb
fsck.*recommended
EXT.-fs.error
Starting.XFS.recovery
sdrServ:.*err
server.*not.responding
iSCSI.connection.*error
Sense.*error
bond.*link.*down
bond.*duplicate.address
bond.*not.detect.*failures
bond.*permanent.HWaddr.*in.use
link.*not.ready
link.failure
Link.Failure.Count
RPC.*multiple.fragments
martian.source.*dev
unexpectedly.shrunk.window
ntpd.*failed
ntpd.*time.reset
time.sync.status.changed
failed.to.bind.to.LDAP
conversation.failed
refused.mount.request
nfs..server.*not.responding
logrotate.*error
Failed.to.read
exits.with.status.*[1-9]
Failed.services.*runlevel
shutdown.*shutting.down
syslog-ng.shutting.down
kernel.*segfault.at
Out.of.memory
invoked.oom-killer
Call.Trace
kernel.*deprecated
annot.open.*file
nable.to.open.*file
nable.to.load
Potential.spurious.wakeup
annot.create..ocket
end.*failed
ead.*failed
GAB.*failure
Symantec.Technical.Support
error.*VRT
VCS.CRITICAL
vxvm.*volume.*not
Hangcheck.*clock
pagecache.limit.set
'
fi
# TODO: separate variable for application patterns ("annot.open..rofile" ... "Hangcheck.*clock")
# TODO: raid.set.*active.with.2.out.of.2.mirrors ?
# TODO: check for un-matched user-friendly names (mpath[a-z])
# TODO: pattern for failed module (could not load)
# TODO: async IO from grep kio /proc/slabinfo ?
# TODO: direct IO from grep ...,DIR, /proc/slabinfo ?
# TODO: verify cluster log msgs. maybe more generic possible ?

# Default cluster stack error patterns, used only when CLUERR_PATTERN is not
# already set (e.g. by the configuration file).
# One grep basic regular expression per line. The list is split on whitespace
# by run_grep, so "." is used wherever the log message contains a blank.
# The stonith entry used to read "not.fond", a typo that could never match.
if test -z "${CLUERR_PATTERN}"; then
	CLUERR_PATTERN='
TOTEM.*FAILED.*RECEIVE
TOTEM.*processor.failed
TOTEM.*Incrementing.problem
quorum.lost
fenced.because.*.un-expectedly.down
stonith.*Device.*not.found
Node.*unclean
pcmk.*Child.process.*exited
pcmk.*message.*local.cib.failed
pengine.*fenced.*resource.failure
pengine.*Forcing.*away.*failures
crm_resource.*Error
crmd.*ERROR:
crmd.*Updating.failcount.*stop
crmd.*Updating.failcount.*start
sbd.*mbox.read.failed
sbd.*Latency
SoftDog.*not
'
fi

# Default application error patterns; kept as-is when APPERR_PATTERN already
# holds a value. One grep regular expression per line, "." in place of blanks.
if [ -z "${APPERR_PATTERN}" ]; then
	APPERR_PATTERN='
vasd.will.not.synchronize.time
vascache.*error
'
fi


#
# help - print the usage summary to stdout.
#
# The program name shown is the basename of $0. $0 is quoted so that a script
# path containing blanks does not make basename fail.
#
function help() {
	local prog
	prog=$(basename -- "$0")

	printf 'usage:\t%s\n' "$prog"
	printf '\t%s [OPTION]\n' "$prog"
	printf '\n'
	printf ' --help\t\tshow help.\n'
	printf ' --version\tshow version.\n'
	printf ' --zip\t\tsearch compressed logs, too.\n'
}


#
# _cat_log FILE - write the text of one log file to stdout.
#
# Files named *.bz2 or *.gz are decompressed, everything else is copied as-is.
# Feeding raw bzip2 data into the search pipe would never match anything.
#
function _cat_log() {
	case "$1" in
		*.bz2)	bzip2 -dc -- "$1" ;;
		*.gz)	gzip -dc -- "$1" ;;
		*)	cat -- "$1" ;;
	esac
}

#
# run_grep - count the lines matching each error pattern over all logs.
#
# Globals:  LOG (read)   blank separated list of log files; may contain globs
#           DMSG (read)  scratch file; receives a dmesg snapshot, removed at end
#           ERROR_PATTERN, CLUERR_PATTERN, APPERR_PATTERN (read)
#                        whitespace separated grep regular expressions
# Outputs:  one line "PATTERN = COUNT" per pattern on stdout (matching is
#           case-insensitive); the list of logs on stderr.
# Returns:  exit status of the final rm.
#
function run_grep() {
	local -a patterns
	local e f

	echo "logs: $LOG" >&2

	dmesg >"$DMSG"

	# Split the pattern lists on whitespace only. A plain unquoted expansion
	# would also glob-expand patterns such as "bfa.*err" against the files
	# in the current directory.
	IFS=$' \t\n' read -r -d '' -a patterns \
		<<<"${ERROR_PATTERN} ${CLUERR_PATTERN} ${APPERR_PATTERN}"

	# TODO: more efficient loop
	for e in "${patterns[@]}"; do
		printf '%s = ' "$e"
		# LOG is deliberately unquoted: it is a blank separated list.
		# Unreadable or missing files are skipped silently.
		for f in ${LOG}; do
			test -r "$f" && _cat_log "$f"
		done | grep -ic -e "$e"
	done

	rm -f -- "$DMSG"
}


# main()
#
# Dispatch on the first command line argument:
#   -v|--version  print the program name and the "# Version:" header line.
#   -h|--help     print the usage summary.
#   -z|--zip      search system, cluster and compressed logs.
#   anything else search system and cluster logs only.
# Every branch exits with the status of its last command.

case "$1" in
	-v|--version)
		printf '%s ' "$(basename -- "$EXE")"
		# The version line sits in the comment header at the top of
		# this script.
		head -n 11 -- "$EXE" | grep "^# Version: "
		exit
	;;
	-h|--help)
		help
		exit
	;;
	-z|--zip)
		LOG="${SYSTEM_LOG}"
		# Add each cluster log that exists and is not empty. Looping
		# copes with an empty CLUSTER_LOG (an unquoted "test -s" with no
		# operand is always true) and with more than one path.
		for c in ${CLUSTER_LOG}; do
			test -s "$c" && LOG="${LOG} ${c}"
		done
		# ZIPPED_LOG holds glob patterns; they are expanded here.
		for z in ${ZIPPED_LOG}; do
			test -s "$z" && LOG="${LOG} ${z}"
		done
		run_grep
		exit
	;;
	*)
		LOG="${SYSTEM_LOG}"
		for c in ${CLUSTER_LOG}; do
			test -s "$c" && LOG="${LOG} ${c}"
		done
		run_grep
		exit
	;;
esac
#
