#!/bin/bash
# Defaults, overridable from the command line (see the getopts loop below).
REPOS_PATH='/opt/eprints3/archives'   # directory scanned for archives (-p)
REPO=''                               # archive ID given with -a; empty = none given
VERBOSE='0'                           # set to "1" by -v
# Print the one-line usage summary on stdout, then terminate the script
# with exit status 1.
usage() {
  printf 'USAGE: %s [ -a <ARCHIVE_ID> | -h | -p <ARCHIVES_PATH> | -v ]\n' "$0"
  exit 1
}
# Consume the command-line switches:
#   -a <id>    archive to inspect            -> REPO
#   -p <path>  directory holding the archives -> REPOS_PATH
#   -v         turn on progress output        -> VERBOSE
# The leading ':' in the option string puts getopts in silent mode, so an
# unknown switch or a missing argument reaches the catch-all arm instead of
# producing a getopts diagnostic.
while getopts ":a:hp:v" opt; do
  case "${opt}" in
    a) REPO="$OPTARG" ;;
    p) REPOS_PATH="$OPTARG" ;;
    v) VERBOSE="1" ;;
    # -h, an unrecognised switch, or a switch lacking its argument:
    # all of them print the usage line and exit.
    *) usage ;;
  esac
done
# Stop early (exit status 1) when the archives directory does not exist.
# The path is quoted so that an empty value or a path containing spaces is
# rejected here instead of slipping through a malformed test expression.
if [[ ! -d "$REPOS_PATH" ]]; then
  echo "ERROR: No repository archives at $REPOS_PATH"
  exit 1
fi

#######################################
# Print the value assigned to one setting in a database.pl file.
# Lines starting with '#' are ignored. The value is the second field when a
# matching line is split on runs of '=' and spaces, with double quotes,
# single quotes and semicolons removed, so values containing '=', spaces or
# quote characters are not supported.
# Arguments: $1 - setting name to look for (e.g. dbname, dbuser, dbpass)
#            $2 - path to the database.pl file
# Outputs:   the extracted value on stdout (nothing when there is no match)
#######################################
_db_setting() {
  grep -v '^#' -- "$2" \
    | grep -- "$1" \
    | awk 'BEGIN{FS="[= ]+"}{print $2}' \
    | tr -d "\"';"
}

#######################################
# Pick the archive whose database settings will be used.
# With an archive ID only that archive is considered; otherwise every entry
# directly under the archives directory is tried in glob (sorted) order and
# the first one that has cfg/cfg.d/database.pl wins.
# Globals:   REPOS (written, array of candidate archive IDs)
#            REPO_PATH, DB, USER, PASS (written; all left empty when no
#            candidate has a database.pl)
# Arguments: $1 - archives directory
#            $2 - archive ID, or empty to scan every archive
# Outputs:   a message on stdout when the requested archive is missing
# Returns:   exits the shell with status 2 when the requested archive does
#            not exist
#######################################
select_archive_db() {
  local archives_path=$1
  local archive_id=$2
  local entry db_cfg

  REPOS=()
  REPO_PATH=""
  DB=""
  USER=""
  PASS=""

  if [[ -n "$archive_id" ]]; then
    if [[ ! -e "$archives_path/$archive_id" ]]; then
      echo "No repository archive exists at $archives_path/$archive_id"
      exit 2
    fi
    REPOS=("$archive_id")
  else
    # Glob instead of parsing `ls`, keeping one array element per archive.
    for entry in "$archives_path"/*; do
      [[ -e "$entry" ]] || continue   # unmatched glob stays literal
      REPOS+=("${entry##*/}")
    done
  fi

  # Iterate element by element; treating the list as one quoted string would
  # collapse every archive name into a single bogus path.
  for entry in "${REPOS[@]}"; do
    db_cfg="$archives_path/$entry/cfg/cfg.d/database.pl"
    if [[ -e "$db_cfg" ]]; then
      REPO_PATH="$archives_path/$entry"
      DB=$(_db_setting dbname "$db_cfg")
      USER=$(_db_setting dbuser "$db_cfg")
      PASS=$(_db_setting dbpass "$db_cfg")
      break
    fi
  done
}

select_archive_db "$REPOS_PATH" "$REPO"

# Every one of the four values must be non-empty to continue; otherwise
# report the problem and exit with status 3.
if ! [[ -n "$REPO_PATH" && -n "$DB" && -n "$USER" && -n "$PASS" ]]; then
  echo "No suitable database configuration found under any $REPOS_PATH sub-directories."
  exit 3
fi

# Base mysql invocation; it is expanded unquoted further down so that it
# word-splits into the command and its arguments.
# NOTE(review): the password is passed on the command line via -p.
MYSQL_CMD_BASE="mysql -u $USER -p$PASS $DB"

#######################################
# Build the SQL fragment that excludes the eprint IDs listed in a file.
# Only lines consisting of a single unsigned integer (surrounding blanks,
# tabs and carriage returns allowed) are used; blank lines and anything else
# are skipped, so the file content can never produce an empty "NOT IN ()"
# list, stray commas, or arbitrary SQL.
# Arguments: $1 - path to the ignore file (one eprint ID per line)
# Outputs:   " AND eprint.eprintid NOT IN (id,id,...)" on stdout, or nothing
#            when the file is missing or holds no usable ID
#######################################
build_ignore_clause() {
  local ignore_file=$1
  local ids

  [[ -f "$ignore_file" ]] || return 0

  ids=$(awk '
    /^[ \t\r]*[0-9]+[ \t\r]*$/ {
      gsub(/[ \t\r]/, "")
      printf "%s%s", sep, $0
      sep = ","
    }' "$ignore_file")

  if [[ -n "$ids" ]]; then
    printf ' AND eprint.eprintid NOT IN (%s)' "$ids"
  fi
}

# Result file for the IDs found by the checks, and the optional list of IDs
# that must never be reported.
UNINDEXED_FILE="$REPO_PATH/var/eprint_rindex_unindexed.txt"
IGNORE_UNINDEXED_FILE="$REPO_PATH/var/ignore_eprint_rindex_unindexed.txt"

# Always assigned, so a same-named variable inherited from the environment
# cannot leak into the queries when there is no ignore file.
IGNORE_EPRINTIDS=$(build_ignore_clause "$IGNORE_UNINDEXED_FILE")

# Print a progress message, but only in verbose mode (-v).
verbose_msg() {
  if [[ "$VERBOSE" == "1" ]]; then
    echo "$1"
  fi
}

#######################################
# Run one SQL statement through mysql and print the resulting IDs.
# The first output line (the column header) and empty lines are dropped.
# Globals:   MYSQL_CMD_BASE (read; expanded unquoted on purpose because it
#            is a space-separated command string)
# Arguments: $1 - SQL statement
# Outputs:   one eprint ID per line on stdout
# Returns:   the exit status of mysql itself, not of the filters after it,
#            so "no rows" is success and a failed connection is not
#######################################
run_unindexed_query() {
  local sql=$1
  $MYSQL_CMD_BASE -e "$sql" | tail -n +2 | grep -v '^$'
  return "${PIPESTATUS[0]}"
}

# Abort with status 4: continuing after a failed query would leave a result
# file that looks like "nothing is unindexed".
abort_index_check() {
  echo "ERROR: could not collect unindexed $1 records for $REPO_PATH" >&2
  exit 4
}

verbose_msg "Checking Database index for $REPO_PATH live archive"

# The title check starts the result file afresh; the later checks append.
verbose_msg "  Checking title indexing"
run_unindexed_query "SELECT eprint.eprintid FROM eprint WHERE eprint_status = 'archive' AND metadata_visibility = 'show' AND LENGTH(title) > 2 AND eprintid NOT IN (SELECT DISTINCT eprintid FROM eprint__rindex WHERE field = 'title')$IGNORE_EPRINTIDS;" \
  > "$UNINDEXED_FILE" || abort_index_check "title"

verbose_msg "  Checking abstract indexing"
run_unindexed_query "SELECT eprint.eprintid FROM eprint WHERE eprint_status = 'archive' AND metadata_visibility = 'show' AND LENGTH(abstract) > 3 AND eprintid NOT IN (SELECT DISTINCT eprintid FROM eprint__rindex WHERE field = 'abstract')$IGNORE_EPRINTIDS;" \
  >> "$UNINDEXED_FILE" || abort_index_check "abstract"

verbose_msg "  Checking creators indexing"
run_unindexed_query "SELECT eprint.eprintid FROM eprint INNER JOIN eprint_creators_name ON eprint.eprintid = eprint_creators_name.eprintid WHERE eprint_status = 'archive' AND metadata_visibility = 'show' AND eprint.eprintid NOT IN (SELECT DISTINCT eprintid FROM eprint__rindex WHERE field = 'creators_name')$IGNORE_EPRINTIDS;" \
  >> "$UNINDEXED_FILE" || abort_index_check "creators"

#######################################
# Rewrite a file of numeric IDs so that it is numerically sorted and free of
# duplicates. The sorted copy is written to "<file>.tmp" first and replaces
# the original only when sort succeeded, so a failed sort cannot wipe the
# collected IDs.
# Arguments: $1 - path to the ID file
# Returns:   0 on success, non-zero when the file is missing or sort/mv fails
#######################################
sort_unique_ids() {
  local ids_file=$1
  local tmp_file="${ids_file}.tmp"

  [[ -f "$ids_file" ]] || return 1

  if sort -un -- "$ids_file" > "$tmp_file"; then
    mv -- "$tmp_file" "$ids_file"
  else
    rm -f -- "$tmp_file"
    return 1
  fi
}

sort_unique_ids "$UNINDEXED_FILE" \
  || echo "WARNING: could not sort $UNINDEXED_FILE" >&2

if [[ "$VERBOSE" == "1" ]]; then
  eprintids=$(cat -- "$UNINDEXED_FILE")
  # Same layout as before: blank line, heading, the IDs, three newlines.
  printf '\nThe following eprint records were found to not be fully indexed:\n%s\n\n\n' "$eprintids"
fi

