#! /bin/bash
# set vim: syntax=sh:

# repeats: Searches for duplicate files in the specified directories

# Copyright (C) 2004-2020 by Brian Lindholm.  This file is part of the
# littleutils utility set.
#
# The repeats utility is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later version.
#
# The repeats utility is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# the littleutils.  If not, see <https://www.gnu.org/licenses/>.

# Ask tempname for the pattern covering every temporary file of this run
# (presumably a shell wildcard, judging by the -w flag and the unquoted use
# below) and arrange for those files to be removed however the script ends.
TMPWILD=$(tempname -w "repeats_$$") || exit 99
# ${TMPWILD} is deliberately left unquoted inside the traps so that rm receives
# the expanded pattern rather than the literal string.
# The EXIT trap only cleans up.  It must not call "exit 0" itself: an exit
# inside the EXIT trap replaces the script's real exit status, which would turn
# every later "exit 1" / "exit 99" into a reported success.
trap 'rm -f ${TMPWILD}' 0
# HUP, INT, QUIT, PIPE, TERM: clean up and report failure.
trap 'rm -f ${TMPWILD} ; exit 1' 1 2 3 13 15

# Print the version banner and the usage summary on stdout.
# Callers redirect the output to stderr when it accompanies an error.
usage() {
  echo 'repeats 1.2.3'
  echo 'usage: repeats [-a hash_algorithm] [-h(elp)] [-l(inks_hard)]'
  echo '         [-m(idsize) bytecount] [-p(aranoid)] [-v(erbose)] [-z(eros)]'
  echo '         [directory ...]'
  echo 'algorithms:  1 = MD5, 2 = SHA1, 3 = SHA224, 4 = SHA256, 5 = SHA384,'
  echo '             6 = SHA512, 7 = BLAKE2B-256, 8 = BLAKE2B-512 (default)'
}

# get command-line options
ALGORITHM='8'     # -a: hash algorithm number, see the table printed by usage
HARDLINKS='n'     # -l: 'y' skips the hardlink-elimination stage
MIDSIZE='65536'   # -m: byte count handed to the partial-hash stage
PARANOID='n'      # -p: 'y' enables the final rep_cmp stage
VERBOSE='n'       # -v: 'y' prints per-stage counts on stderr
ZEROS='n'         # -z: 'y' keeps zero-length files in the candidate list
while getopts a:hlm:pvz opts; do
  case "$opts" in
    a) ALGORITHM=${OPTARG} ;;
    h) usage
       exit 0 ;;
    l) HARDLINKS='y' ;;
    m) MIDSIZE=${OPTARG} ;;
    p) PARANOID='y' ;;
    v) VERBOSE='y' ;;
    z) ZEROS='y' ;;
    # unknown option or missing argument: the usage text is a diagnostic here,
    # so it goes to stderr alongside the message getopts itself prints
    *) usage 1>&2
       exit 1 ;;
  esac
done
# drop the parsed options so that only directory arguments remain
shift $((OPTIND - 1))

# generate the initial list of files (one path per line) in TMPFILE0
# The suffix is _0 so that this request differs from the one made for TMPFILE1;
# NOTE(review): assumes the tempname wildcard in TMPWILD covers every
# repeats_$$ prefix, as it must already for the _2.._5 files.
TMPFILE0=$(tempname "repeats_$$_0") || exit 99
if [[ $# -eq 0 ]]; then
  # no directory given: scan the current directory and strip the leading "./"
  find . -type f -readable -print | sed -e 's/^\.\///' > "${TMPFILE0}"
else
  # one or more directories: each must be a searchable, readable directory;
  # the first bad argument aborts the whole run
  for dir in "$@"; do
    if [[ -d "$dir" && -r "$dir" && -x "$dir" ]]; then
      find "$dir" -type f -readable -print
    else
      # diagnostics belong on stderr, like the verbose stage reports
      echo "repeats error: $dir is not a readable directory" 1>&2
      rm -f ${TMPWILD}
      exit 1
    fi
  done > "${TMPFILE0}"
fi
if [[ "$VERBOSE" == 'y' ]]; then
  # reading via stdin makes wc print the bare count, so no filename has to be
  # stripped from its output afterwards
  echo "repeats stage 0: total number of files = $(wc -l < "${TMPFILE0}")" 1>&2
fi

# grab filesizes and eliminate zero-length files unless -z was given
TMPFILE1=$(tempname "repeats_$$_1") || exit 99
if [[ "$ZEROS" == 'n' ]]; then
  # de-duplicate the path list, attach sizes, drop lines ending in <TAB>0,
  # order numerically by the second tab-separated field and let rep_size
  # filter (per the stage message: files whose size is not unique)
  sort -u "${TMPFILE0}" \
    | filesize -p \
    | grep -v $'\t0$' \
    | sort -t $'\t' -k2n,2n \
    | /usr/lib64/littleutils/rep_size > "${TMPFILE1}"
  if [[ "$VERBOSE" == 'y' ]]; then
    echo "repeats stage 1: num files with non-unique and non-zero filesize = $(wc -l < "${TMPFILE1}")" 1>&2
  fi
else
  # same pipeline, but zero-length files stay in the candidate list
  sort -u "${TMPFILE0}" \
    | filesize -p \
    | sort -t $'\t' -k2n,2n \
    | /usr/lib64/littleutils/rep_size > "${TMPFILE1}"
  if [[ "$VERBOSE" == 'y' ]]; then
    echo "repeats stage 1: num files with non-unique filesize = $(wc -l < "${TMPFILE1}")" 1>&2
  fi
fi

# search for duplicates based on node numbers (eliminate hardlinks)
TMPFILE2=$(tempname "repeats_$$_2") || exit 99
if [[ "$HARDLINKS" == 'n' ]]; then
  # rep_hard -p annotates each entry (presumably with link/device/inode data,
  # given the four tab-separated sort keys); the list is then ordered by
  # field 4, 3, 2 numerically and finally by path before rep_node filters it
  sort "${TMPFILE1}" \
    | /usr/lib64/littleutils/rep_hard -p \
    | sort -t $'\t' -k4n,4n -k3n,3n -k2n,2n -k1,1 \
    | /usr/lib64/littleutils/rep_node > "${TMPFILE2}"
  if [[ "$VERBOSE" == 'y' ]]; then
    echo "repeats stage 2: num files excluding hardlinks = $(wc -l < "${TMPFILE2}")" 1>&2
  fi
else
  # -l given: skip this stage and hand the stage-1 list on unchanged
  mv "${TMPFILE1}" "${TMPFILE2}"
fi

# search for duplicates based on a partial filehash
# filehash is limited to MIDSIZE bytes through -n; its output is ordered by
# field 2 (numeric), field 3 and path before rep_hash reduces it to matches.
# ALGORITHM and MIDSIZE come from the command line, so they are quoted to
# reach filehash as exactly one argument each.
TMPFILE3=$(tempname "repeats_$$_3") || exit 99
sort "${TMPFILE2}" \
  | filehash -v -s "-${ALGORITHM}" -p -n "${MIDSIZE}" \
  | sort -t $'\t' -k2n,2n -k3,3 -k1,1 \
  | /usr/lib64/littleutils/rep_hash > "${TMPFILE3}"
if [[ "$VERBOSE" == 'y' ]]; then
  echo "repeats stage 3: num file pairs with matching digest after ${MIDSIZE} bytes = $(wc -l < "${TMPFILE3}")" 1>&2
fi

# search for duplicates based on a complete filehash
# Each stage-3 line is split at its first tab into two lines, so the pair list
# becomes a plain, de-duplicated list of files; these are hashed again without
# the -n limit and matched by rep_hash.
TMPFILE4=$(tempname "repeats_$$_4") || exit 99
sed -e 's/\t/\n/' "${TMPFILE3}" \
  | sort -u \
  | filehash -v -s "-${ALGORITHM}" -p \
  | sort -t $'\t' -k2n,2n -k3,3 -k1,1 \
  | /usr/lib64/littleutils/rep_hash > "${TMPFILE4}"
if [[ "$VERBOSE" == 'y' ]]; then
  echo "repeats stage 4: num file pairs with matching complete digest = $(wc -l < "${TMPFILE4}")" 1>&2
fi

# do final paranoia check if requested, then print the result on stdout
if [[ "$PARANOID" == 'n' ]]; then
  # make it final: print results
  sort "${TMPFILE4}"
else
  # -p given: pass the sorted pair list through rep_cmp before printing
  # (per the stage message, pairs are then based on cmp results)
  TMPFILE5=$(tempname "repeats_$$_5") || exit 99
  sort "${TMPFILE4}" | /usr/lib64/littleutils/rep_cmp > "${TMPFILE5}"
  if [[ "$VERBOSE" == 'y' ]]; then
    echo "repeats stage 5: num file pairs based on cmp results = $(wc -l < "${TMPFILE5}")" 1>&2
  fi
  # make it final: print results
  cat < "${TMPFILE5}"
fi

# clean up temp files (TMPWILD is intentionally unquoted so the pattern expands)
rm -f ${TMPWILD}
