#!/bin/bash
# Validate productconfig files from a directory one by one, then stitch them
# together, so the docserv script can build from them.
#
# Arguments (in order of appearance):
# --valid-languages="en-us ..."   # Space-separated list of valid language codes
# --valid-site-sections="..."     # Space-separated list of valid sections
# --simplify                      # Simplify user-generated productconfig so the
#                                 # docserv daemon can use it (optional)
# --revalidate-only               # If a file with the same name as the output
#                                 # file already exists and it contains a
#                                 # <hashes/> section matching the current MD5
#                                 # sums of the XML files in the input directory,
#                                 # do not do a complete validation
# --spinner                       # Show a spinner while working, if you or your
#                                 # CI are impatient and eager to kill processes
# INPUT_DIR                       # Directory with config files
# OUTPUT_FILE                     # Output file name (parent directory must exist)
#                                 # (optional; if not specified, this will only
#                                 # validate the configuration)
#
# XML tool deps: xmlstarlet, jing, xmllint, xsltproc

# Report a fatal error and stop.
#
# Arguments: $1 - message; backslash escapes such as \n are expanded
# Outputs:   the message, on stderr
# Returns:   never; terminates the shell with status 1
out() {
  local message="$1"

  echo -e "$message" 1>&2
  exit 1
}

# Absolute, symlink-free path of this script and the directory containing it.
# realpath already resolves every symlink on the way, so no separate readlink
# step is needed; the quotes keep paths that contain whitespace intact.
me=$(realpath -- "$0")
mydir=$(dirname -- "$me")

# docserv-dirs is expected to sit next to this script.
# NOTE(review): presumably it provides $share_dir and readme_message, which
# this script uses but does not define itself -- confirm.
source "$mydir/docserv-dirs"

# Print the comment header of this script as help text, then exit.
#
# Globals:   me (read) - path of the script file whose header is printed
# Outputs:   the lines following the first line that contains "#!", up to and
#            including the first blank line, with the leading "# " removed
#            and a literal "$0" replaced by the base name of the script
# Returns:   never; exits with the status of the sed pipeline
app_help() {
  local script_name
  script_name=$(basename -- "$0")

  # Both paths are quoted so that a script stored below a directory with
  # whitespace in its name can still print its help text.
  sed -rn '/#!/{n; p; :loop n; p; /^[ \t]*$/q; b loop}' "$me" \
    | sed -r -e 's/^# ?//' -e "s/\\\$0/${script_name}/"
  exit
}

# Append one issue to the global issue list.
#
# Only the first line of the message is run through "fold", under the
# assumption that it is the prose description of the issue; all following
# lines (XML errors and such) are assumed not to be foldable and are merely
# indented by two spaces.
#
# Globals:   issuelist (appended to; the separators are stored as literal
#            "\n" sequences, to be expanded when the list is printed)
# Arguments: $1 - issue text; backslash escapes such as \n are expanded
# Returns:   always 0, so the function can safely be used as
#            "add_issue ... && continue" (it used to return 1 for every
#            single-line issue)
add_issue() {
  local message=$1
  local headline details

  headline=$(echo -e "$message" | head -n 1 | fold -s -w 78 | sed '2,${s/^ */  /}')
  issuelist+="\n\n- ${headline}"

  if [[ $(echo -e "$message" | wc -l) -gt 1 ]]; then
    details=$(echo -e "$message" | tail -n +2 | sed 's/^/  /')
    issuelist+="\n${details}"
  fi

  return 0
}

# Split the output of a check into individual issues and record each of them
# with add_issue.
#
# Issues are separated by lines consisting of "---". Lines that are empty or
# contain only blanks are dropped, and leading/trailing blanks of every other
# line are trimmed (by "read"). A separator seen while no issue text is
# pending -- a leading or doubled "---" -- is ignored; previously it was
# appended to the text of the following issue.
#
# Arguments: $1 - prefix for every issue (usually the affected file)
#            $2 - check output; backslash escapes such as \n are expanded
# Returns:   always 0
add_multiple_issues() {
  local affected_file="$1"
  # The trailing separator flushes the last issue.
  local issues="$2\n---"
  local issue=''
  local line

  while read -r line; do
    if [[ "$line" == '---' ]]; then
      if [[ -n "$issue" ]]; then
        add_issue "$affected_file: $issue"
        issue=''
      fi
    elif [[ -n "$line" ]]; then
      # "read" has already stripped surrounding blanks, so a non-empty line
      # is guaranteed to carry real content.
      issue+="$line\n"
    fi
  done < <(echo -e "$issues")

  return 0
}

# Advance the progress spinner by one step, or wipe it when called with the
# argument "clear". This mostly exists because something has to show up on
# screen when running within CI, so we don't get killed for inactivity.
#
# Globals:   enable_spinner (read) - nothing is printed while this is 0
#            spinstate (read/written) - the character shown most recently
# Arguments: $1 - optional; 'clear' overwrites the spinner with a blank and
#            ends the line
spin() {
  if [[ "$enable_spinner" -eq 0 ]]; then
    return
  fi

  case "$1" in
    clear)
      echo -e '\b '
      return
      ;;
  esac

  # Cycle through / - \ | ; any other state (including unset) starts over
  # at '/'.
  case "$spinstate" in
    '/') spinstate='-' ;;
    '-') spinstate='\' ;;
    '\') spinstate='|' ;;
    *)   spinstate='/' ;;
  esac

  echo -ne "\b$spinstate"
}

# Command names of the XML tools. They are kept in variables because they are
# also handed to the additional check scripts through the environment.
xmllint='xmllint'
jing='jing'
starlet='xmlstarlet'

# Java settings for jing. java_flags is passed to jing via the environment;
# stacksize can be overridden from the caller's environment.
stacksize=${stacksize:-"-Xss4096K"}
java_flags="-Dorg.apache.xerces.xni.parser.XMLParserConfiguration=org.apache.xerces.parsers.XIncludeParserConfiguration"

# Resources this script depends on.
# NOTE(review): $share_dir and readme_message are not defined in this file;
# presumably both come from the sourced docserv-dirs -- confirm.
schema_file=$share_dir/validate-product-config/product-config-schema.rnc
checks_dir=$share_dir/validate-product-config/checks
references_stylesheet=$share_dir/validate-product-config/global-check-ref-list.xsl
simplify_stylesheet=$share_dir/simplify-product-config/simplify.xsl

[[ -f "$schema_file" ]] || out "Schema $schema_file does not exist.$(readme_message)"
[[ -d "$checks_dir" ]] || out "Directory $checks_dir does not exist.$(readme_message)"
# Without this stylesheet the global reference check produces no output at
# all, which would be mistaken for "no issues found".
[[ -f "$references_stylesheet" ]] || out "Stylesheet $references_stylesheet does not exist.$(readme_message)"
[[ -f "$simplify_stylesheet" ]] || out "Stylesheet $simplify_stylesheet does not exist.$(readme_message)"

# Defaults for everything that can be set on the command line.
simplify_config=0
enable_spinner=0
revalidate=0
valid_languages=""
valid_site_sections=""
unknown=""
input_dir=""
output_file=""

# Interpret the command-line arguments and store the result in the globals
# above.
#
# The first argument that resolves to an existing directory becomes
# input_dir. The next argument that does not look like an option and whose
# parent directory exists becomes output_file. Everything else is collected
# in "unknown" (one entry per line, stored with literal "\n" separators).
#
# All paths are quoted, so names containing whitespace are handled, and
# unrecognized options are rejected explicitly instead of relying on
# "dirname" failing on them.
#
# Arguments: the script's arguments ("$@")
parse_args() {
  local arg resolved

  for arg in "$@"; do
    case "$arg" in
      -h|--help)
        app_help
        ;;
      --simplify)
        simplify_config=1
        ;;
      --revalidate-only)
        revalidate=1
        ;;
      --spinner)
        enable_spinner=1
        ;;
      --valid-languages=*)
        valid_languages="${arg#*=}"
        ;;
      --valid-site-sections=*)
        valid_site_sections="${arg#*=}"
        ;;
      *)
        # readlink -f prints nothing if the path cannot be resolved, e.g.
        # because the parent directory does not exist.
        resolved=''
        [[ -n "$arg" ]] && resolved=$(readlink -f -- "$arg")

        if [[ -z "$input_dir" && -d "$resolved" ]]; then
          input_dir=$resolved
        elif [[ -z "$output_file" && "$arg" != -* && -n "$resolved" ]] \
            && [[ -d $(dirname -- "$resolved") ]]; then
          output_file=$resolved
        else
          unknown+="  $arg\n"
        fi
        ;;
    esac
  done
}

parse_args "$@"

[[ "$unknown" ]] && \
  out "There are unknown parameters:\n$unknown\n(Note that input and output directories must exist before running this script.)"

# Without an input directory, the "cd" further down would degrade to a bare
# "cd", i.e. switch to $HOME and validate whatever XML files live there.
[[ -z "$input_dir" ]] && \
  out "No INPUT_DIR parameter was specified (the input directory must exist)."

# FIXME: For --valid-languages and --valid-site-sections, we allow duplicated
# entries. This is benign, because we deduplicate them, but since it hints at
# a slightly broken config, we may not want to accept that.

# matches ds.type.lang in product-config-schema.rnc
# (the pattern requires at least one entry, so an empty list is rejected too)
grep -qP '^( +[a-z]{2}(-[a-z]{2,8})?)+$' <<< " $valid_languages" || \
  out "Language codes parameter string does not conform to scheme (must be la-ng scheme, space-separated)."

# matches ds.type.alphanumeric in product-config-schema.rnc
grep -qP '^( +[-_a-zA-Z0-9]+)+$' <<< " $valid_site_sections" || \
  out "Site sections parameter string does not conform to scheme (must be alphanumeric-_, space-separated)."

[[ "$simplify_config" -eq 1 && -z "$output_file" ]] && \
  out "--simplify can only be used together with an OUTPUT_FILE parameter."

[[ "$revalidate" -eq 1 && -z "$output_file" ]] && \
  out "--revalidate-only can only be used together with an OUTPUT_FILE parameter."

# All further file access uses names relative to the input directory.
cd -- "$input_dir" || out "Cannot change to input directory $input_dir."

# If the glob matches nothing, it stays in the array as the literal pattern,
# which does not exist as a file.
xml_files=(*.xml)
[[ -e "${xml_files[0]}" ]] || \
  out "There are no product configuration files."


# Deduplicated, newline-separated variants of the two lists; these are what
# the additional checks receive through their environment.
valid_languages_sorted=''
[[ "$valid_languages" ]] && valid_languages_sorted=$(echo -e "$valid_languages" | tr ' ' '\n' | sort -u)
valid_site_sections_sorted=''
[[ "$valid_site_sections" ]] && valid_site_sections_sorted=$(echo -e "$valid_site_sections" | tr ' ' '\n' | sort -u)

issuelist=''


# Create md5 hashes for all config files. From the md5sum output, we are only
# keeping the hashes, as the file names of the config files are actually
# irrelevant. Taking the first space-separated field (instead of stripping
# the last word) also drops file names that contain spaces completely. The
# result is a "!"-terminated list of hashes in sorted order.
hashsums=$(md5sum -- *.xml | sort | cut -d ' ' -f 1 | tr '\n' '!')

# With --revalidate-only, an existing output file whose <hashes> element
# equals the current hash list means nothing has changed: stop successfully.
if [[ "$revalidate" == '1' ]] && [[ -f "$output_file" ]]; then
  comparisonhashsums=$("$starlet" sel -t -v "(//hashes)[1]" "$output_file")
  [[ "$comparisonhashsums" == "$hashsums" ]] && exit 0
fi

# Start of the stitched document. The "\n" sequences are kept literal here and
# only expanded when the document is finally printed with "echo -e".
outfile='<?xml version="1.0" encoding="UTF-8"?>\n<docservconfig>\n'
outfile+="<hashes>${hashsums}</hashes>\n\n"

# Validate every config file of the input directory on its own and append the
# enabled ones to the stitched document in $outfile.
#
# NOTE(review): $outfile is later printed with "echo -e", so backslash
# sequences inside the copied XML are expanded as well -- confirm this is
# acceptable for the config files in use.
for file in *.xml; do

  # xmllint is faster and gives more readable error messages than jing, so
  # run that first. A file that is not well-formed is not examined further.
  if ! wellformed=$("$xmllint" --noout --noent "$file" 2>&1); then
    add_issue "$input_dir/$file: File is not well-formed:\n$wellformed"
    continue
  fi
  spin

  # Schema validation. A failure is recorded, but the remaining checks still
  # run on the file.
  if ! valid=$(ADDITIONAL_FLAGS="$java_flags" ADDITIONAL_OPTIONS="$java_flags" \
    "$jing" -ci "$schema_file" "$file" 2>&1); then
    add_issue "$input_dir/$file: File is not valid:\n$valid"
  fi
  spin

  # This attribute is optional, and defaults to '1'/'true'. Disabled products
  # are neither checked further nor added to the output.
  enabled=$("$starlet" sel -t -v "/product/@enabled" "$file")
  if [[ "$enabled" == 'false' ]] || [[ "$enabled" == '0' ]]; then
    continue
  fi

  # Additional validation checks may come either as XSLTs or Bash scripts.
  # No script output is assumed to mean there are no issues.
  for check_file in "$checks_dir"/check-*.{sh,xsl}; do
    # If one variety of check files does not exist, its glob pattern does not
    # resolve and enters this loop as the literal name "check-*.xsl" (or
    # .sh). Such entries do not exist as files and are skipped.
    [[ -e "$check_file" ]] || continue

    runner='bash'
    [[ "$check_file" == *.xsl ]] && runner='xsltproc'

    # The tool names and the allowed values are handed to the check through
    # its environment.
    if ! result=$( \
      xmllint="$xmllint" jing="$jing" starlet="$starlet" \
      valid_languages="$valid_languages_sorted" \
      valid_site_sections="$valid_site_sections_sorted" \
      "$runner" "$check_file" "$file"); then
      out "Validation check file $check_file is misbehaving."
    fi
    [[ "$result" ]] && add_multiple_issues "$input_dir/$file" "$result"
    spin
  done

  outfile+=$("$starlet" sel -t -c "/*" "$file")
  outfile+='\n'

done

outfile+='\n</docservconfig>\n'

spin 'clear'

# Stop here if any single file had issues; the global checks below are only
# run on a set of files that passed individually.
if [[ "$issuelist" ]]; then
  out "The following issues occurred when validating:$issuelist\n"
fi


# check for uniqueness of productid values (which we can't do before
# stitching the file together)

productids=$(echo -e "$outfile" | "$starlet" sel -t -v '//@productid' | sort)
# "uniq -d" prints every value that occurs more than once exactly one time.
duplicateproductids=$(printf '%s\n' "$productids" | uniq -d)

if [[ -n "$duplicateproductids" ]]; then
  # The list is passed as part of a single quoted argument; left unquoted, it
  # was split into words and only the first duplicate reached add_issue.
  add_issue \
    "(global check): Some productid values in $input_dir are not unique. Check for the following productid values: ${duplicateproductids//$'\n'/ }."
fi

# The stylesheet prints one text block per problem it finds; no output means
# there is nothing to report.
referencecheck=$(echo -e "$outfile" | xsltproc "$references_stylesheet" -)
[[ "$referencecheck" ]] && add_multiple_issues "(global check)" "$referencecheck"

if [[ "$issuelist" ]]; then
  out "The following issues occurred when validating:$issuelist\n"
fi

cd - >/dev/null


# Write the result: either the simplified or the plain stitched document.
# Without an output file, the run was a validation only.
if [[ $simplify_config == 1 ]]; then
  echo -e "$outfile" | xsltproc "$simplify_stylesheet" - > "$output_file"
elif [[ "$output_file" ]]; then
  echo -e "$outfile" > "$output_file"
fi
