#!/usr/bin/bash
# Pull every CVMFS repository from its original source to a staging area,
#  then push each repository to a backup machine's staging area, and
#  finally make them available to be served to cvmfs clients, both on
#  this machine and on the backup one.  The pushing starts happening
#  for each repository in a separate process while pulling the next
#  repository.
# The data directory is shared between the staging and served areas, but
#  it doesn't hurt for extra data files to be available.
# If /etc/cvmfs/runstandalone does not exist and any errors happen when
#  updating the backup machine, updates are not made available to clients
#  from either machine.  In that way they stay in sync.
# If /etc/cvmfs/runstandalone exists, this still attempts to push most
#  of the files to the backup machine, but the updates will only show up
#  on the local machine.  Care must be taken to not allow the backup
#  machine to become the master before runstandalone is removed and the
#  backup is caught up to date.
# Repositories are assumed to be created by add-repository.  
# Ssh must have permission to run between the master and backup machines
#  as root without a password.  If using kerberos then put
#     host/other.host@DOMAIN.NAME
#  in ~root/.k5login on each machine where other.host is the full name
#  of the other host and DOMAIN.NAME is your kerberos domain name.
# Output goes to *-pull.log and *-push.log files in /var/log/cvmfs.
#
# Originally written by Dave Dykstra 29 March 2013
# Adapted for cvmfs-server 2.1 15 July 2013
# Adapted to use cvmfs_server snapshot instead of rsync for the main
#   replication 4 September 2014.  Assumes apache is listening on port
#   ${HTTPPORT:-8081}.
# Adapted for cvmfs-server 2.2 and parallel updates 24 March 2016.

# Number of numbered lock slots the lock-acquisition loop will try, i.e. the
#  upper bound on simultaneous runs of this script.
MAXPARALLELPULL=4

# Site configuration.  It is expected to define HOST1, HOST2 and SRV, which
#  this script uses but never sets itself.  It is read after the default
#  above, so it may override MAXPARALLELPULL.
source /etc/cvmfs/hastratum1.conf

# Print the command-line synopsis on stderr and terminate with status 2.
# Reads the global SRV for the message text.
usage()
{
    printf '%s\n' \
        "Usage: $0 [reponame|ABORT]" \
        " If ABORT then abort any update in process" \
        " If no reponame given, update all repositories in $SRV" >&2
    exit 2
}

# Command line: no argument means all repositories, a single argument is
#  either the keyword ABORT or one repository name, anything else is a
#  usage error.
ABORT=false
REPOS=""
case $# in
    0)
        ;;
    1)
        if [ "$1" = "ABORT" ]; then
            ABORT=true
        else
            REPOS="$1"
        fi
        ;;
    *)
        usage
        ;;
esac

# Work out which of the two configured hosts this is; the other one is the
#  peer that gets pushed to.  Refuse to run anywhere else.
THISHOST="$(uname -n)"
case $THISHOST in
    $HOST1)
        OTHERHOST=$HOST2
        ;;
    $HOST2)
        OTHERHOST=$HOST1
        ;;
    *)
        echo "Not running on $HOST1 or $HOST2" >&2
        exit 1
        ;;
esac

# Port used in the URL the peer pulls from; 8081 unless already set.
: "${HTTPPORT:=8081}"

# Nothing can be updated without the repository area.
[ -d "$SRV" ] || {
    echo "$SRV does not exist, nothing to do" >&2
    exit 1
}

# Everything below needs root.
[ "$(id -u)" = 0 ] || {
    echo "Not running as root" >&2
    exit 1
}

# Decide which repositories to process.
#  - A repository named on the command line must exist under $SRV.
#  - Otherwise REPOS becomes the list of every repository directory in $SRV.
if [ "$REPOS" != "" ]; then
    if [ ! -d "$SRV/$REPOS" ]; then
        echo "$SRV/$REPOS does not exist" >&2
        exit 1
    fi
else
    # Sort the repositories by last snapshot time, oldest first,
    #  to avoid starvation.  Repositories that have never had a
    #  snapshot get time 0 and so sort to the front.
    # The '.' in the wildcard avoids the info directory.
    REPOS="$(
        cd "$SRV" || exit
        for REPO in *.*; do
            # Skip the literal '*.*' that is left when nothing matches
            #  (it would otherwise be globbed again later, in whatever
            #  directory the script runs from) and anything that is not
            #  a directory, as in the single-repository case above.
            [ -d "$REPO" ] || continue
            SNAPFILE="$REPO/.cvmfs_last_snapshot"
            SNAPTIME=0
            if [ -f "$SNAPFILE" ]; then
                SNAPTIME="$(stat --format='%Y' "$SNAPFILE")"
            fi
            echo "$SNAPTIME:$REPO"
        done | sort -n | cut -d: -f2)"
fi

# Make sure at least 16384 file descriptors are allowed.  MAXFDS is also
#  handed to the peer so the same limit is applied to its snapshot command.
MAXFDS="$(ulimit -n)"
if [ "$MAXFDS" -lt 16384 ]; then
    MAXFDS=16384
    ulimit -n "$MAXFDS"
fi

# When a host keytab is present, obtain a ticket for this host's principal
#  into a dedicated credential cache.  The extra PATH entries apply to the
#  kinit invocation only.
if [ -f /etc/krb5.keytab ]; then
    KRB5CCNAME=FILE:/tmp/krb5cc_root_hastratum1
    export KRB5CCNAME
    PATH=$PATH:/usr/krb5/bin:/usr/kerberos/bin kinit -k "host/$THISHOST"
fi

# All lock and log file names are derived from the last path component
#  of $SRV.
LOGBASE="${SRV##*/}"
RUNDIR=/var/run/cvmfs
LOCKBASE="$RUNDIR/$LOGBASE-pullpush.lock"
TMPLOCK="$LOCKBASE.tmp.$$"
PULLLOG="/var/log/cvmfs/$LOGBASE-pull.log"
PUSHLOG="/var/log/cvmfs/$LOGBASE-push.log"

mkdir -p $RUNDIR
# The private lock file holds the parent's process id (not our own) so
#  that an abort can signal the whole process group.
echo "$PPID" >"$TMPLOCK"

[ -d /var/log/cvmfs ] || mkdir /var/log/cvmfs

# State used while looking for a free lock slot.
LOCKN=0          # number of the lock slot currently being tried
GOTONE=false     # set to true once a slot is acquired (or a job is aborted)
OTHERPIDS=""     # pids found holding slots that could not be taken
LOCKFILE=""      # lock file of the slot currently being tried
TMPPULLLOG=""    # per-slot temporary pull log
TMPPUSHLOG=""    # per-slot temporary push log
# Try the lock slots 1..MAXPARALLELPULL in turn.  A slot is taken by
#  hard-linking the private $TMPLOCK file to the slot's lock file; the
#  link fails if the lock file already exists.
# Normal mode: stop at the first slot acquired (GOTONE=true); LOCKN,
#  LOCKFILE, TMPPULLLOG and TMPPUSHLOG are then left describing that slot.
# ABORT mode: visit every slot, killing any running job found, and never
#  keep a lock; GOTONE=true then means at least one job was aborted.
# All output of the loop is appended to $PULLLOG.
while [ "$LOCKN" -lt "$MAXPARALLELPULL" ]; do
    let LOCKN+=1
    LOCKFILE=$LOCKBASE.$LOCKN
    TMPPULLLOG=$RUNDIR/${LOGBASE}-pull.log.$LOCKN
    TMPPUSHLOG=$RUNDIR/${LOGBASE}-push.log.$LOCKN
    if ln $TMPLOCK $LOCKFILE 2>/dev/null; then
        # succeeded in getting this lock, job was not running
        if $ABORT; then
            # nothing to abort in this slot: release it again and
            #  try the other jobs too
            rm -f $LOCKFILE
            continue
        fi
        GOTONE=true
        break
    fi
    # did not get the lock; the lock file holds the pid recorded by
    #  whoever created it
    OTHERPID="`cat $LOCKFILE 2>/dev/null`"
    if [ -z "$OTHERPID" ] || ! kill -0 "$OTHERPID" 2>/dev/null; then
        # Stale lock: no pid recorded, or that process is gone.
        rm -f $LOCKFILE
        if $ABORT; then
            echo;echo "Removing $LOCKFILE at `date` because the process $OTHERPID was not running"
            continue
        fi
        # Another run may grab the slot between the rm and this ln, in
        #  which case move on to the next slot.
        if ! ln $TMPLOCK $LOCKFILE 2>/dev/null; then
            echo;echo "Attempted to break lock $LOCKFILE from $OTHERPID at `date` but failed"
            continue
        fi
        echo;echo "Broke lock $LOCKFILE from $OTHERPID at `date` because process was not running"
        GOTONE=true
        break
    fi
    if $ABORT; then
        # Note: this does not abort a push in progress, only a pull.
        # This should not cause a problem because aborts are done
        #  before switching mastership, and if the other machine
        #  becomes a master while the snapshot is still in progress
        #  and tries to pull on that repository, it will have its own
        #  lock and so won't clash.
        echo;echo "Aborting update pid $OTHERPID in progress at `date`"
        # The negative pid addresses the process group of $OTHERPID.
        kill -15 -"$OTHERPID"  # -15 works best when it was run from cron
        sleep 3
        kill -9 -"$OTHERPID" 2>/dev/null # -9 works best when it was run from shell
        rm -f $LOCKFILE
        sleep 3 
        # save its aborted logs if it didn't do it itself
        if [ -f $TMPPULLLOG ]; then
            # stdout is going to $PULLLOG
            cat $TMPPULLLOG 
            rm -f $TMPPULLLOG
        fi
        if [ -f $TMPPUSHLOG ]; then
            cat $TMPPUSHLOG  >> $PUSHLOG
            rm -f $TMPPUSHLOG
        fi
        GOTONE=true
        continue
    fi
    # Slot is held by a live process; remember it for the error message.
    OTHERPIDS="$OTHERPIDS $OTHERPID"
done >>$PULLLOG 2>&1

# The private lock file is no longer needed; an acquired slot keeps its
#  own link to the same contents.
rm -f $TMPLOCK

# Nothing acquired (or, in ABORT mode, nothing aborted): say why in the
#  pull log and stop.
if ! $GOTONE; then
    echo
    if $ABORT; then
        echo "Abort not needed at $(date) because update not in progress"
        exit
    else
        echo "Cannot create $LOCKBASE.* at $(date), already created by running process(es)$OTHERPIDS"
        exit 1
    fi
fi >>$PULLLOG 2>&1

# An abort request never goes on to do an update.
if $ABORT; then
    exit
fi

# Path of a repository's server.conf.  $REPO is deliberately left
#  unexpanded here; it is substituted later, when the command is run.
CVSC='/etc/cvmfs/repositories.d/$REPO/server.conf'

# Shell command text, run later both locally and on the peer, that appends
#  /stage to the CVMFS_STRATUM1 setting in server.conf unless the setting
#  already mentions /stage.
FIXS1STAGE="grep -q CVMFS_STRATUM1 $CVSC"
FIXS1STAGE+=" && ! grep -q 'CVMFS_STRATUM1.*/stage' $CVSC"
FIXS1STAGE+=" && echo Adding /stage to CVMFS_STRATUM1 in $CVSC"
FIXS1STAGE+=" && sed -i -e 's,^CVMFS_STRATUM1=\(.*\),CVMFS_STRATUM1=\1/stage,' $CVSC"


# Record in the main pull log where this run's temporary logs are.
{
    echo
    echo "Logging in $RUNDIR/${LOGBASE}-pu*.log.$LOCKN at $(date)"
} >>$PULLLOG

# Pull side | push side.
# The left subshell snapshots each repository locally and, on success,
#  writes one "REPO STOREDIR" line to the pipe.  The right subshell reads
#  those lines and pushes each repository to $OTHERHOST, so a push runs
#  while the next repository is being pulled.
# Pull-side diagnostics go to $TMPPULLLOG, all push-side output goes to
#  $TMPPUSHLOG.
(
for REPO in $REPOS; do
    #stdout here is used to coordinate with the push side, so
    #  everything else has to be redirected to stderr
    REPODIR=/etc/cvmfs/repositories.d/$REPO
    if [ ! -d $REPODIR ]; then
        echo "$REPODIR does not exist, skipping!" >&2
        continue
    fi
    # Third comma-separated field of CVMFS_UPSTREAM_STORAGE in server.conf
    STOREDIR="`grep '^CVMFS_UPSTREAM_STORAGE=' $REPODIR/server.conf|cut -d, -f3`"
    if [ -z "$STOREDIR" ]; then
        continue
    fi
    # NOTE(review): STORAGE is not set anywhere in this script; presumably
    #  it comes from /etc/cvmfs/hastratum1.conf -- confirm.
    if [ ! -f $STORAGE/$REPO/.cvmfs_last_snapshot ]; then
        echo >&2
        echo "Initial snapshot of $REPO is not finished, skipping" >&2
        continue
    fi
    # The subshell's status is that of the snapshot: zero lets the push
    #  side proceed, non-zero (via 'false') suppresses the push.
    if (
    echo
    echo "Pulling to $STOREDIR started at `date` (lock $LOCKN)"
    # Quoted so the '*' in the grep/sed patterns cannot be globbed
    #  before eval sees the command text.
    eval "$FIXS1STAGE"
    if ! cvmfs_server snapshot -t $REPO; then
        echo "ERROR from cvmfs_server, skipping push!"
        false
    fi
    ) >&2
    then
        # tell push to proceed on this dir
        echo "$REPO $STOREDIR"
    fi
done
echo "Pulling finished at `date`" >&2
) 2>>$TMPPULLLOG | (while read REPO STOREDIR; do
    # Each push runs in its own subshell with stdin from /dev/null, so
    #  nothing in it (ssh in particular) can consume the "REPO STOREDIR"
    #  lines that the enclosing 'read' is waiting for.
    (
    echo
    echo "Pushing $STOREDIR started at `date` (lock $LOCKN)"
    # Only data directory is copied here, to make sure .cvmfs files
    #  get copied last, in case fail-over to the backup happens in the
    #  middle of a push.
    # Need to first make sure other side has initial snapshot, to avoid
    #  waiting long if the other side is getting re-installed from scratch.
    # The remote command is one double-quoted string expanded locally;
    #  the message inside it is single-quoted so that it does not end
    #  the string.  $REPO inside $FIXS1STAGE is expanded by the remote
    #  shell, after the REPO= assignment.
    # If the remote snapshot fails, carry on only when
    #  /etc/cvmfs/runstandalone exists.
    if ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no $OTHERHOST "if [ ! -f $STOREDIR/.cvmfs_last_snapshot ]; then echo 'Initial snapshot not finished, skipping'; exit 1; fi;REPO=$REPO;$FIXS1STAGE;ulimit -n $MAXFDS;CVMFS_SERVER_OVERRIDE=http://$THISHOST:$HTTPPORT/cvmfs/$REPO/stage cvmfs_server snapshot -t $REPO" || \
            [ -f /etc/cvmfs/runstandalone ]; then
        # Served area: parent of the physical location of $STOREDIR
        SERVED="`cd $STOREDIR;cd $(/bin/pwd)/..;pwd`"
        if [ ! -d "$SERVED" ]; then
            echo "$SERVED" does not exist!
            # Leave this repository's subshell.  There is no loop inside
            #  the subshell for 'continue' to act on, so 'exit' is used.
            exit
        fi
        # Publish everything except the data and stage directories and
        #  the .cvmfs_master_replica file.
        RSYNCFLAGS="-aW --exclude .cvmfs_master_replica --exclude data --exclude stage"
        if [ -f /etc/cvmfs/runstandalone ]; then
            echo "Copying to $SERVED on local machine only"
            rsync $RSYNCFLAGS $STOREDIR/ $SERVED/
        else
            echo "Copying to $SERVED"
            # Publish locally only if publishing on the peer worked, so
            #  the two machines stay in sync.
            if rsync -e 'ssh -o ConnectTimeout=10' $RSYNCFLAGS $STOREDIR/ $OTHERHOST:$SERVED/; then
                rsync $RSYNCFLAGS $STOREDIR/ $SERVED/
            fi
        fi
    fi
    echo "Pushing $STOREDIR ended at `date`"
    ) </dev/null
done
echo "Pushing finished at `date`"
) >>$TMPPUSHLOG 2>&1

# Append this run's temporary logs to the main logs.
cat $TMPPULLLOG >>$PULLLOG
cat $TMPPUSHLOG >>$PUSHLOG

# Drop the temporary logs, then release the lock slot.
rm -f $TMPPULLLOG $TMPPUSHLOG
rm -f $LOCKFILE
