#!/bin/bash
#
# Copyright (c) 2010-2012 Novell Inc. All Rights Reserved.
#
# Author: Tim Serong <tserong@suse.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

# Helper routines used throughout this script (invoke, status, confirm,
# error, ...) are not defined in this file; they are expected to come
# from this library.
source /usr/lib/sleha-functions

# Script-wide settings.  They stay unset until a command line option
# (-p, -s, -o, -t respectively) or one of the init stages fills them in.
declare SHARED_DEVICE SBD_DEVICE OCFS2_DEVICE TEMPLATE

# Print the command line help on stdout, then terminate the script with
# exit status 0.
usage()
{
	# Only the first line needs the script name expanded; everything
	# after it is emitted verbatim from a quoted here-document.
	printf '%s\n' "Usage: $0 [options] [stage]"
	cat <<'HELP'

Options:
    -h          Display this usage information
    -q          Be quiet (don't describe what's happening, just do it)
    -y          Answer "yes" to all prompts (use with caution, this is
                destructive, especially during the "storage" stage)

    -i <if>     Default to IP address on interface 'if' instead of eth0
    -s <dev>    Block device to use for SBD fencing

    -t <name>   Optionally configure cluster with template 'name'
                (currently only "ocfs2" is valid here)

Options for ocfs2 template:
    -p <dev>    Partition this shared storage device (only used in
                "storage" stage)
    -o <dev>    Block device to use for OCFS2 (only used in "vgfs" stage)

Stage can be one of:
    ssh         Create SSH keys for passwordless SSH between cluster nodes
    csync2      Configure csync2
    corosync    Configure corosync
    storage     Partition shared storage (ocfs2 template only)
    sbd         Configure SBD (requires -s <dev>)
    cluster     Bring the cluster online
    vgfs        Create volume group and filesystem (ocfs2 template only,
                requires -o <dev>)

Note:
  - If stage is not specified, the script will run through each stage
    in sequence, with prompts for required information.
  - If using the ocfs2 template, the storage stage will partition a block
    device into two pieces, one for SBD, the remainder for OCFS2.  This is
    good for testing and demonstration, but not ideal for production.
    To use storage you have already configured, pass -s and -o to specify
    the block devices for SBD and OCFS2, and the automatic partitioning
    will be skipped.
HELP
	exit 0
}

# Stage "ssh": make sure sshd is started, (re)create root's RSA key pair
# with an empty passphrase and hand the public key to append for
# root's authorized_keys.  Returns early, leaving everything as it was,
# if a key already exists and the user declines to overwrite it.
init_ssh()
{
	local ssh_dir=/root/.ssh
	local key_file=$ssh_dir/id_rsa

	start_service sshd

	invoke mkdir -m 700 -p "$ssh_dir"

	if [ -f "$key_file" ]; then
		confirm "$key_file already exists - overwrite?" || return
		invoke rm -f "$key_file"
	fi

	status "Generating ssh key"
	if ! invoke ssh-keygen -q -f "$key_file" -C 'Cluster Internal' -N ''; then
		error "Failed to generate SSH key"
	fi
	append "$key_file.pub" "$ssh_dir/authorized_keys"
}

# Stage "csync2": generate a new csync2 shared key, write a fresh csync2
# configuration that lists only this host, enable the service and run an
# initial csync2 check.
# Globals:  CSYNC2_KEY, CSYNC2_CFG (read; not defined in this file)
# Returns early if csync2 already looks configured and the user declines
# to overwrite it.
init_csync2()
{
	status "Configuring csync2"

	# Not automatically updating /etc/hosts - risky in the general case.
	#etc_hosts_add_me
	#[ -n "$(etc_hosts_get_me)" ] || error "No valid entry for $(hostname) in /etc/hosts - csync2 can't work"

	# "Already configured" means: a key file exists, or the config file
	# contains at least one line that is neither blank nor a comment.
	if [ -f "$CSYNC2_KEY" ] || \
		grep -v -q -s '^[[:space:]]*\(#.*\)*$' "$CSYNC2_CFG"
	then
		confirm 'csync2 is already configured - overwrite?' || return
	fi

	invoke rm -f "$CSYNC2_KEY"
	status_long "Generating csync2 shared key (this may take a while)"
	# On a quiet VM, "a while" can be "way too long" (not enough entropy)
	invoke csync2 -k "$CSYNC2_KEY" || error "Can't create csync2 key $CSYNC2_KEY"
	status_done

	# NOTE(review): the key path below is hard-coded, which presumably
	# matches $CSYNC2_KEY - confirm against sleha-functions.
	local tmp_conf=${CSYNC2_CFG}.$$
	cat > "$tmp_conf" <<END
group ha_group
{
	key /etc/csync2/key_hagroup;
	host $(hostname);
	include /etc/booth/booth.conf;
	include /etc/corosync/corosync.conf;
	include /etc/corosync/authkey;
	include /etc/csync2/csync2.cfg;
	include /etc/csync2/key_hagroup;
	include /etc/ctdb/nodes;
	include /etc/drbd.d;
	include /etc/ha.d/ldirectord.cf;
	include /etc/lvm/lvm.conf;
	include /etc/multipath.conf;
	include /etc/samba/smb.conf;
	include /etc/sysconfig/openais;
	include /etc/sysconfig/pacemaker;
	include /etc/sysconfig/sbd;
}
END
	install_tmp "$tmp_conf" "$CSYNC2_CFG"

	# Can't use start_service for csync2, because it runs inside xinetd
	status "Enabling csync2 service"
	invoke chkconfig csync2 on
	start_service xinetd

	status "csync2 checking files"
	invoke csync2 -cr /
}

# It would be nice if we could just have csync2.cfg include a directory,
# which in turn included one file per node which would be referenced via
# something like "group ha_group { ... config: /etc/csync2/hosts/*; }"
# That way, adding a new node would just mean adding a single new file
# to that directory.  Unfortunately, the 'config' statement only allows
# inclusion of specific individual files, not multiple files via wildcard.
# So we have this function which is called by sleha-join to add the new
# remote node to csync2 config on some existing node.  It is intentionally
# not documented in sleha-init's user-visible usage information.
init_csync2_remote()
{
	# Add host $1 to the csync2 configuration unless a "host" line
	# naming it is already present.
	# Arguments: $1 - hostname of the node to add (required)
	# Globals:   CSYNC2_CFG (read), BE_QUIET (set to true)
	local newhost=$1
	#local newline="$2"
	local host_re
	[ -n "$newhost" ] || error "Hostname not specified"

	BE_QUIET=true

	# Not automatically updating /etc/hosts - risky in the general case.
	#if [ -n "$newline" ]; then
	#	etc_hosts_add_one $newhost "$newline"
	#else
	#	log ": Not updating /etc/hosts - remote host line not specified"
	#fi

	# Dots in a hostname must match literally, otherwise e.g.
	# "node.example" would be considered present because of an
	# existing "nodeaexample".
	host_re=${newhost//./\\.}

	# If host doesn't already exist in csync2 config, add it
	if ! grep -E -q -s \
		"^[[:space:]]*host.*[[:space:]]+${host_re}[[:space:];]" \
		"$CSYNC2_CFG"
	then
		local tmp_conf=${CSYNC2_CFG}.$$
		# Insert the new host line right before the first existing
		# "host" line; all original lines are passed through.  The
		# hostname is handed to awk as a variable rather than being
		# spliced into the program text.
		awk -v newhost="$newhost" '
			/^[[:space:]]*host[[:space:]]/ {
				if (added == 0) print "\thost " newhost ";"
				added = 1
			}
			{ print $0 }
		' "$CSYNC2_CFG" > "$tmp_conf"
		install_tmp "$tmp_conf" "$CSYNC2_CFG"
		invoke csync2 -c "$CSYNC2_CFG"
	else
		log ": Not updating $CSYNC2_CFG - remote host $newhost already exists"
	fi
}

# Stage "corosync": prompt for bind network address, multicast address
# and multicast port, then write the corosync configuration, either
# derived from the example file shipped next to it or from a built-in
# fallback, and push it out with csync2.
# Globals:  YES_TO_ALL, NET_IF, IP_NETWORK, COROSYNC_CONF (read)
# Returns early if the configuration exists and the user declines to
# overwrite it.
init_corosync()
{
	if $YES_TO_ALL; then
		status 'Configuring corosync'
	else
		status "
Configure Corosync:
  This will configure the cluster messaging layer.  You will need
  to specify a network address over which to communicate (default
  is ${NET_IF}'s network, but you can use the network address of any
  active interface), a multicast address and multicast port.
"
	fi

	if [ -f "$COROSYNC_CONF" ]; then
		confirm \
			"$COROSYNC_CONF already exists - overwrite?" || return
	fi

	local bindnetaddr=$(prompt_for_string \
		'Network address to bind to (e.g.: 192.168.1.0)' \
		'([0-9]+\.){3}[0-9]+' "$IP_NETWORK")
	[ -z "$bindnetaddr" ] && error 'No value for bindnetaddr'

	local mcastaddr=$(prompt_for_string \
		'Multicast address (e.g.: 239.x.x.x)' \
		'([0-9]+\.){3}[0-9]+' \
		239.$(random_256).$(random_256).$(random_256))
	[ -z "$mcastaddr" ] && error 'No value for mcastaddr'

	local mcastport=$(prompt_for_string \
		'Multicast port' \
		'[0-9]+' 5405);
	[ -z "$mcastport" ] && error 'No value for mcastport'

	local tmp_conf=${COROSYNC_CONF}.$$
	# The file that is checked for sanity is also the file that gets
	# used as the template.
	local example_conf=${COROSYNC_CONF}.example
	if grep -q -s '^[[:space:]]*name:[[:space:]]*pacemaker' "$example_conf"; then
		# "name: pacemaker" in corosync example conf is our assumption
		# that the example is sane, so use it:
		sed \
			-e 's/^\([[:space:]]*bindnetaddr:[[:space:]]*\).*/\1'"$bindnetaddr"'/' \
			-e 's/^\([[:space:]]*mcastaddr:[[:space:]]*\).*/\1'"$mcastaddr"'/' \
			-e 's/^\([[:space:]]*mcastport:[[:space:]]*\).*/\1'"$mcastport"'/' \
			"$example_conf" > "$tmp_conf"
	else
		# Fallback to known sane config (defaults from SLE HA 11 SP1
		# corosync.conf.example.patch)
		# NOTE(review): "timestamp" appears twice in the logging
		# section below (on, then off); left exactly as it was.
		cat > "$tmp_conf" <<END
# Please read the corosync.conf.5 manual page
compatibility: whitetank
aisexec {
	group:		root
	user:		root
}
service {
	ver:		0
	name:		pacemaker
	use_mgmtd:	yes
	use_logd:	yes
}
totem {
	version:	2
	token:		5000
	token_retransmits_before_loss_const: 10
	join:		60
	consensus:	6000
	vsftype:	none
	max_messages:	20
	clear_node_high_bit: yes
	secauth:	off
	threads:	0
	interface {
		ringnumber:	0
		bindnetaddr:	$bindnetaddr
		mcastaddr:	$mcastaddr
		mcastport:	$mcastport
	}
}
logging {
	fileline:	off
	to_stderr:	no
	to_logfile:	no
	timestamp:	on
	to_syslog:	yes
	syslog_facility: daemon
	debug:		off
	timestamp:	off
}
amf {
	mode:	disable
}
END
	fi
	install_tmp "$tmp_conf" "$COROSYNC_CONF"
	invoke csync2 -xv "$COROSYNC_CONF"
}

# Non-generic, i.e. not for use with every type of cluster you can imagine:
# requires shared storage, and specifically configures one block device as
# sbd partition + remainder for OCFS2 filesystem (created later).
init_storage()
{
	# Pick a shared block device (from -p or by prompting), wipe its
	# partition table and create two partitions on it.
	# Globals: SHARED_DEVICE, YES_TO_ALL (read)
	#          SBD_DEVICE, OCFS2_DEVICE (set to the first and second
	#          partition reported by fdisk)
	local dev="$SHARED_DEVICE"
	local partitions
	local part
	local -a devices
	local -i dev_looks_sane=0

	if $YES_TO_ALL || [ -n "$SHARED_DEVICE" ]; then
		status 'Configuring shared storage'
	else
		status "
Configure Shared Storage:
  You will need to provide the path to a shared storage device,
  for example a SAN volume or iSCSI target.  The device path must
  be persistent and consistent across all nodes in the cluster,
  so /dev/disk/by-id/* devices are a good choice.  This device
  will be automatically partitioned into two pieces, 1MB for SBD
  fencing, and the remainder for an OCFS2 filesystem.
"
	fi

	while [ $dev_looks_sane -eq 0 ]; do
		dev=$(prompt_for_string \
			'Path to storage device (e.g. /dev/disk/by-id/...)' \
			'\/.*' "$dev")
		# This will be empty if -y was specified but -p wasn't
		[ -z "$dev" ] && error "No value for shared storage device"

		if [ ! -b "$dev" ]; then
			# With -y there is nobody to re-prompt, so give up
			# (error is expected to terminate the script).
			if $YES_TO_ALL; then
				error "$dev is not a block device"
			else
				echo "    That doesn't look like a block device" >&2
			fi
		else
			#
			# Got something that looks like a block device, there
			# are four possibilities now:
			#
			#  1) It's completely broken/inaccessible
			#  2) No recognizable partition table
			#  3) Empty partition table
			#  4) Non-empty partition table
			#
			partitions=$(parted -s "$dev" print | awk '/^[[:space:]]*[0-9]+/ { print $1; }')
			if [ -n "$partitions" ]; then
				# Partitions exist
				status "WARNING: Partitions exist on $dev!"
				if confirm 'Are you ABSOLUTELY SURE you want to overwrite?'; then
					dev_looks_sane=1
				else
					dev=
				fi
			else
				# It's either broken, no partition table, or empty partition table
				status "$dev appears to be empty"
				if confirm 'Are you sure you wish to use this device'; then
					dev_looks_sane=1
				else
					dev=
				fi
			fi
		fi
	done

	if [ -n "$partitions" ]; then
		confirm 'Really?' || exit
		status_long "Erasing existing partitions..."
		# $partitions is a whitespace separated list of partition
		# numbers, so it is deliberately left unquoted here.
		for part in $partitions; do
			invoke parted -s "$dev" rm "$part" \
				|| error "Failed to remove partition $part"
		done
		status_done
	fi

	status_long "Creating partitions..."
	invoke parted -s "$dev" mklabel msdos || error "Failed to create partition table"

	# This is a bit rough, and probably won't result in great performance,
	# but it's fine for test/demo purposes to carve off 1MB for SBD.  Note
	# we have to specify the size of the first partition in this in bytes
	# rather than MB, or parted's rounding gives us a ~30Kb partition
	# (see rhbz#623268).
	invoke parted -s "$dev" mkpart primary 0 1048576B || error "Failed to create first partition"
	invoke parted -s "$dev" mkpart primary 1M 100% || error "Failed to create second partition"

	status_done

	# TODO: May not be strictly necessary, but...
	probe_partitions

	# TODO: THIS IS *WRONG* FOR MULTIPATH! (but possibly nothing we can do about it)
	# Word splitting of the command output into array elements is intended.
	devices=( $(fdisk -l "$dev" | awk '/^\/dev/ { print $1; }') )

	SBD_DEVICE=${devices[0]}
	[ -z "$SBD_DEVICE" ] && error "Unable to determine device path for SBD partition"

	OCFS2_DEVICE=${devices[1]}
	[ -z "$OCFS2_DEVICE" ] && error "Unable to determine device path for OCFS2 partition"

	status
	status "Created $SBD_DEVICE for SBD partition"
	status "Created $OCFS2_DEVICE for OCFS2 partition"
}

# Stage "sbd": initialize the SBD device and write the SBD sysconfig
# file, then push it out with csync2.
# Globals:  SBD_DEVICE (read; set here if the user is prompted for it),
#           YES_TO_ALL, SYSCONFIG_SBD (read)
# If SBD_DEVICE is empty: with -y nothing is configured; otherwise the
# user is asked whether to use SBD at all and, if so, for the device.
init_sbd()
{
	local dev
	local -i dev_looks_sane=0

	if [ -z "$SBD_DEVICE" ]; then
		# SBD device not set up by init_storage (ocfs2 template) and
		# also not passed in as command line argument - prompt user
		if $YES_TO_ALL; then
			warn "Not configuring SBD ($SYSCONFIG_SBD left untouched)."
			return
		fi
		status "
Configure SBD:
  If you have shared storage, for example a SAN or iSCSI target,
  you can use it to avoid split-brain scenarios by configuring SBD.
  This requires a 1 MB partition, accessible to all nodes in the
  cluster.  The device path must be persistent and consistent
  across all nodes in the cluster, so /dev/disk/by-id/* devices
  are a good choice.  Note that all data on the partition you
  specify here will be destroyed.
"

		if [ -f "$SYSCONFIG_SBD" ] && grep -q '^SBD_DEVICE=' "$SYSCONFIG_SBD"; then
			confirm \
				"SBD is already configured - overwrite?" || return
		fi

		if ! confirm "Do you wish to use SBD?"; then
			warn "Not configuring SBD - STONITH will be disabled."
			# Comment out SBD devices if present (every line that
			# is not already a comment gets a leading '#')
			if [ -f "$SYSCONFIG_SBD" ]; then
				local tmp_conf=$SYSCONFIG_SBD.$$
				sed -e 's/^\([^#].*\)$/#\1/g' "$SYSCONFIG_SBD" > "$tmp_conf"
				install_tmp "$tmp_conf" "$SYSCONFIG_SBD"
				invoke csync2 -xv "$SYSCONFIG_SBD"
			fi
			return
		fi
		# Keep asking until we get a block device the user confirms.
		while [ $dev_looks_sane -eq 0 ]; do
			dev=$(prompt_for_string \
				'Path to storage device (e.g. /dev/disk/by-id/...)' \
				'\/.*' "$dev")
			if [ ! -b "$dev" ]; then
				echo "    That doesn't look like a block device" >&2
			else
				status "All data on $dev will be destroyed"
				if confirm 'Are you sure you wish to use this device'; then
					dev_looks_sane=1
				else
					dev=
				fi
			fi
		done
		SBD_DEVICE="$dev"
	fi

	[ -b "$SBD_DEVICE" ] || error "SBD device $SBD_DEVICE does not exist"

	# TODO: need to ensure watchdog is available
	# (actually, should work if watchdog unavailable, it'll just whine in the logs...)
	# TODO: what about timeouts for multipath devices?
	status_long 'Initializing SBD...'
	invoke sbd -d "$SBD_DEVICE" create || error "Failed to initialize SBD device"
	status_done

	local tmp_conf=$SYSCONFIG_SBD.$$
	cat > "$tmp_conf" <<END
SBD_DEVICE="$SBD_DEVICE"
SBD_OPTS="-W"
END
	install_tmp "$tmp_conf" "$SYSCONFIG_SBD"
	invoke csync2 -xv "$SYSCONFIG_SBD"
}

# Stage "cluster": bring the local cluster stack up via
# init_cluster_local, load the initial CRM configuration and, if an SBD
# device is configured, add a stonith-sbd primitive and enable STONITH.
# Refuses to continue unless crmadmin reports exactly one node.
# Globals:  SBD_DEVICE (read as a fallback, see below)
init_cluster()
{
	init_cluster_local

	local -i node_count=$(crmadmin -N | grep -c 'node')
	[ "$node_count" -lt 1 ] && error "No nodes found in cluster"
	[ "$node_count" -gt 1 ] && error "Joined existing cluster - will not reconfigure."

	status "Loading initial configuration"

	# base config
	# Use mktemp rather than a predictable /tmp/crm.<pid> name: this
	# runs as root, and a guessable path in /tmp can be pre-created or
	# symlinked by another user.
	local tmp_conf
	tmp_conf=$(mktemp /tmp/crm.XXXXXX) || error "Can't create temporary file"
	cat > "$tmp_conf" <<END
property \$id="cib-bootstrap-options" \\
	stonith-enabled="false" \\
	no-quorum-policy="ignore" \\
	placement-strategy="balanced"
op_defaults \$id="op-options" \\
	timeout="600" \\
	record-pending="true"
rsc_defaults \$id="rsc-options" \\
	resource-stickiness="1" \\
	migration-threshold="3"
END
	crm_configure_load replace "$tmp_conf"
	rm -f "$tmp_conf"

	# sbd fencing if applicable (getting local sbd_device here direct
	# from SBD config, in case init_cluster is invoked on its own).
	# The file is sourced inside the command substitution, so nothing
	# it sets leaks into this shell; if the file is missing, whatever
	# SBD_DEVICE already holds is used.
	local sbd_device=$([ -f /etc/sysconfig/sbd ] && \
		. /etc/sysconfig/sbd ; echo $SBD_DEVICE)
	if [ -n "$sbd_device" ]; then
		# Can't do crm configure load update here with only
		# stonith-enabled, or it wipes out then entire
		# cib-bootstrap-options section
		# TODO: find out if this is a bug in the crm shell
		invoke crm configure primitive stonith-sbd stonith:external/sbd params sbd_device="$sbd_device" \
			|| error "Can't create stonith-sbd primitive"
		invoke crm configure property stonith-enabled="true" \
			|| error "Can't enable STONITH"
	fi
}

# Non-generic, i.e. not for use with every type of cluster you can imagine:
# creates specific cluster config (TODO: would be nice to template this in
# future somehow)
init_vgfs()
{
	# Load the dlm/clvm/cmirror/o2cb/vg/filesys resources into the
	# cluster, then create a clustered VG + LV on $OCFS2_DEVICE, put
	# an OCFS2 filesystem on it and let the cluster mount it on
	# /clusterfs.  If an existing PV is found and the user declines
	# to destroy it, the resources are removed again.
	# Globals: OCFS2_DEVICE (read)
	[ -b "$OCFS2_DEVICE" ] || error "OCFS2 device $OCFS2_DEVICE does not exist"

	# TODO: configurable mountpoint and vg name
	# Use mktemp rather than a predictable /tmp/crm.<pid> name: this
	# runs as root, and a guessable path in /tmp can be pre-created or
	# symlinked by another user.
	local tmp_conf
	tmp_conf=$(mktemp /tmp/crm.XXXXXX) || error "Can't create temporary file"
	cat > "$tmp_conf" <<END
primitive dlm ocf:pacemaker:controld \\
	op start timeout="90" op stop timeout="100"
primitive clvm ocf:lvm2:clvmd \\
	op start timeout="90" op stop timeout="100"
primitive cmirror ocf:lvm2:cmirrord \\
	op start timeout="90" op stop timeout="100"
primitive o2cb ocf:ocfs2:o2cb \\
	op start timeout="90" op stop timeout="100"
primitive vg ocf:heartbeat:LVM params volgrpname="clustervg" \\
	op start timeout="30" op stop timeout="30" \\
	meta target-role="Stopped"
primitive filesys ocf:heartbeat:Filesystem \\
	params directory="/clusterfs" fstype="ocfs2" device="/dev/clustervg/clusterlv" \\
	op monitor interval="20" timeout="40" \\
	op start timeout="60" op stop timeout="60"
group clusterfs dlm clvm o2cb cmirror vg filesys
clone c-clusterfs clusterfs meta interleave="true"
END
	crm_configure_load update "$tmp_conf"
	rm -f "$tmp_conf"

	wait_for_resource "Waiting for O2CB" o2cb:0
	wait_for_stop "Making sure VG is not active" vg:0

	if pvdisplay "$OCFS2_DEVICE" >/dev/null 2>&1; then
		if ! confirm "Existing Physical Volume found on $OCFS2_DEVICE - destroy?"; then
			# Back out: stop and delete everything loaded above.
			invoke crm resource stop c-clusterfs
			wait_for_stop "Waiting for resource to stop" c-clusterfs
			invoke crm configure delete \
				dlm clvm cmirror o2cb vg filesys \
				clusterfs c-clusterfs
			return
		fi
	fi

	status_long "Creating LVM Volume"
	# -ff forces create if something already exists
	# (dangerous, but you asked for it...)
	invoke pvcreate -ff -y "$OCFS2_DEVICE" || error "Failed to create physical volume on $OCFS2_DEVICE"
	local pvdev=$(pvdisplay "$OCFS2_DEVICE" | awk '/PV Name/ { print $3; }')
	[ -z "$pvdev" ] && error "Can't determine PV Name on $OCFS2_DEVICE"
	invoke vgcreate --clustered y clustervg "$pvdev" || error "Failed to create VG on $pvdev"
	invoke lvcreate --name clusterlv --extents 100%FREE clustervg || error "Failed to create LV on clustervg"
	# NOTE(review): this relies on lvdisplay's "LV Name" field holding
	# a device path usable by mkfs - confirm for the LVM version in use.
	local lvdev=$(lvdisplay clustervg | awk '/LV Name/ { print $3; }')
	[ -z "$lvdev" ] && error "Unable to determine LV Name for clustervg"
	status_done

	status_long "Creating OCFS2 filesystem"
	# TODO: want "-T vmstore", but this'll only fly on >2GB partition
	# Note: using undocumented '-x' switch to avoid prompting if overwriting
	# existing partition.  For the commit that introduced this, see:
	# http://oss.oracle.com/git/?p=ocfs2-tools.git;a=commit;h=8345a068479196172190f4fa287052800fa2b66f
	invoke mkfs.ocfs2 -x "$lvdev" || error "Failed to create OCFS2 filesystem on $lvdev"
	status_done

	# TODO: refactor, maybe
	invoke mkdir -p /clusterfs || error "Can't create mountpoint /clusterfs"
	# Dropping target-role="Stopped" from vg lets the group start.
	invoke crm resource meta vg delete target-role \
		|| error "Can't start cluster filesystem clone"
	wait_for_resource "Waiting for /clusterfs to be mounted" filesys:0
}

#------------------------------------------------------------------------------

# Parse command line options into the script-wide settings.
while getopts 'hi:o:p:qs:t:y' o; do
	case $o in
	h) usage;;
	i) NET_IF=$OPTARG;;
	o) OCFS2_DEVICE=$OPTARG;;
	p) SHARED_DEVICE=$OPTARG;;
	q) BE_QUIET=true;;
	s) SBD_DEVICE=$OPTARG;;
	t) TEMPLATE=$OPTARG;;
	y) YES_TO_ALL=true;;
	esac
done

shift $((OPTIND - 1))

# First remaining argument is the stage (may be empty = run everything);
# whatever follows it is handed on to the stage function.
stage=$1 ; shift

# vgfs stage requires running cluster, everything else requires inactive cluster,
# except ssh and csync2 (which don't care) and csync2_remote (which mustn't care,
# just in case this breaks sleha-join on another node).
rcopenais status >/dev/null
rc=$?
case $stage in
vgfs)
	[ $rc -ne 0 ] && error "Cluster is inactive  - can't run vgfs stage"
	;;
ssh|csync2|csync2_remote)
	;;
*)
	[ $rc -eq 0 ] && error "Cluster is currently active - can't run"
	# Any status other than 7 is treated as unexpected here.
	[ $rc -ne 7 ] && error "rcopenais status returned unexpected status ($rc)"
	;;
esac

# Need hostname resolution to work, want NTP (but don't block csync2_remote)
if [ "$stage" != "csync2_remote" ]; then
	check_prereqs
fi

case $TEMPLATE in
ocfs2)	;;
"")	;;
*)	error "Invalid template ($TEMPLATE)"
	;;
esac

case $stage in
ssh|csync2|csync2_remote|corosync|storage|sbd|cluster|vgfs)
	init
	# Remaining arguments are passed through (csync2_remote takes the
	# new hostname; a further IP argument is a leftover hack that is not
	# strictly necessary currently, as we're not auto-updating
	# /etc/hosts)
	init_$stage "$@"
	;;
"")
	init
	init_ssh
	init_csync2
	init_corosync
	if [ "$TEMPLATE" == "ocfs2" ]; then
		# Only partition if the caller didn't supply both devices
		if [ -z "$SBD_DEVICE" ] || [ -z "$OCFS2_DEVICE" ]; then
			init_storage
		fi
	fi
	init_sbd
	init_cluster
	[ "$TEMPLATE" == "ocfs2" ] && init_vgfs
	;;
*)	# $1 has already been shifted away, so report $stage.
	echo -e "Invalid stage ($stage)\n" >&2
	# usage exits 0 by itself; run it in a subshell so an invalid
	# stage is still reported to the caller as a failure.
	( usage ) >&2
	exit 1
	;;
esac

status "Done (log saved to $LOG_FILE)"
exit 0
