#!/usr/bin/env bash

##### textEnhance
#     author: David Huss
#     email: david.huss@phonetik.uni-muenchen.de
#     For more information, consult README.md or run
#         `textEnhance --help`
#####

# TODO:
# use human readable output of `file`?
# add logfile cli argument?
# add noclobber option?
# format helpers with black


# Strict mode, enabled in a single call using the long option names:
#   errexit  - abort as soon as a command fails
#   nounset  - expanding an unset variable is an error
#   pipefail - a pipeline fails if any of its stages fails
set -o errexit -o nounset -o pipefail

##### Global variables
program=textEnhance
# `readlink -f` resolves symbolic links, so this gives us the true location
# of the script and its helpers even if we start via symbolic link. Both
# expansions are quoted (and guarded with `--`) so that a path containing
# whitespace, glob characters or a leading dash is passed through intact.
script_dir=$(dirname -- "$(readlink -f -- "$0")")
verbose=null  # will actually be defined by the user, but we have to include
              # it here, otherwise we cannot define the log function below
#####


function log {
    # usage: log info|warning|error <message>
    #
    # `log info` prints debug information to stdout, but only if the
    # global `verbose` is exactly 'True'
    # `log warning` prints a warning to stderr
    # `log error` prints an error to stderr and exits with exit code 1
    #
    # Any other level is silently ignored. Reads the globals `program`
    # (used as message prefix) and `verbose`.
    local level=$1
    local message=$2
    case "$level" in
        info)
            # `verbose` is quoted so that an empty or multi-word value simply
            # compares unequal instead of making the test itself fail. A
            # full `if` is used (rather than `&&`) so that the function
            # returns 0 when nothing is printed, which matters under `set -e`
            if [[ "$verbose" == 'True' ]]; then
                printf '%s\n' "DEBUG: $program: $message"
            fi
            ;;
        warning)
            # `>&2` duplicates the existing stderr descriptor instead of
            # re-opening the /dev/stderr path
            printf '%s\n' "WARNING: $program: $message" >&2
            ;;
        error)
            printf '%s\n' "ERROR: $program: $message" >&2
            exit 1
            ;;
    esac
}


##### Check bash version
# BASH_VERSINFO holds the major version at index 0 and the minor version at
# index 1, so the comparison can be done arithmetically inside the shell,
# without parsing $BASH_VERSION through `echo -e`, `sort -V` and `head`.
if (( BASH_VERSINFO[0] < 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] < 4) )); then
    log error "This script requires bash version >= 4.4 (see README for details)"
fi
#####


##### Parse command line arguments
# We pass the arguments to parse-arguments.py, which returns them in a
# "bash-readable" format, i.e.
# ```
# input=file.text
# verbose=False
# ```
# etc, including a header that says '#sourcethis' (preventing us from
# accidentally parsing the help page or the version info)
parser_output=$(python3 "$script_dir/parse-arguments.py" "$@")
    # `"$@"` expands to every command line argument as a separate word,
    # so the parser receives them exactly as the user typed them. No second
    # bash instance and no re-quoting is needed, and because the helper's
    # path is quoted, a script directory containing spaces or shell
    # metacharacters is handled correctly too.
if [[ "${parser_output:0:11}" == '#sourcethis' ]]; then
    # only `eval` if the first 11 characters are '#sourcethis'
    # NOTE(review): `eval` executes whatever the parser printed, so this
    # relies on parse-arguments.py quoting all user-supplied values safely
    # -- confirm in the helper.
    eval "$parser_output"
    log info "Parsed command line arguments"
    log info "Beginning processing of input file '$input'"
else
    # otherwise, the user must have requested the help page
    # or the version number, so print that and exit
    echo "$parser_output"
    exit
fi
#####


##### Input file checks
# Rejects inputs that cannot be processed (via `log error`, which exits) and
# sets `encoding` for everything that can be.
if [[ ! -e "$input" ]]; then
    log error "The input file '$input' does not exist"
elif [[ -d "$input" ]]; then
    log error "The input file '$input' is a directory"
elif [[ ! -f "$input" ]]; then
    log error "The input file '$input' is not a regular file"
elif [[ ! -r "$input" ]]; then
    log error "There is no read permission for the input file '$input'"
elif [[ ! -s "$input" ]]; then
    log warning "The input file '$input' is empty"
    encoding='utf-8'  # just process this like a normal file
elif diff "$input" <(echo) > /dev/null; then
    # since `echo` without any arguments prints only a newline,
    # the `<(echo)` argument acts like a file containing only that
    log warning "The input file '$input' contains only a trailing newline"
    encoding='utf-8'  # just process this like a normal file
else
    # the `file` command returns `binary` for empty input files,
    # which is why we're only asking it if we haven't already
    # determined that the input is empty (or consists only of a
    # trailing newline). `file` is run once and its answer is reused
    # for both the binary check and the encoding.
    encoding=$(file -b --mime-encoding "$input")
    if [[ "$encoding" == 'binary' ]]; then
        log error "The input file '$input' appears to be a binary file"
    fi
    # we should arrive here when the input is a completely normal file
    log info "Identified encoding as '$encoding'"
fi
#####


##### Output file checks
# Decides where the intermediate result lives. Afterwards `outfile` always
# names a regular file holding a copy of the input, and
# `outfile_is_regular_file` records whether that file is already the final
# destination (True) or a temporary stand-in for `final_outfile` (False).
if [[ -f "$outfile" || ! -e "$outfile" ]]; then
    # if the outfile already exists and is a regular file,
    # or if it just doesn't exist yet, we can operate on it
    # as we please
    cp "$input" "$outfile"
    outfile_is_regular_file=True
    log info "Copied input to output file '$outfile'"
else
    # if it's not a regular file, we will only actually write
    # to it at the very end
    final_outfile="$outfile"
    outfile="${input}_enhanced"
    if [[ -e "$outfile" || -L "$outfile" ]]; then
        # the temporary file is overwritten now and deleted later, so
        # using a path that already exists would destroy the user's data
        log error "The temporary output file '$outfile' already exists, refusing to overwrite it"
    fi
    # ensure the temporary file is cleaned up if this script fails. The
    # trap is installed only after the check above, so it can never delete
    # a file we did not create. Its command is single-quoted, i.e. the
    # path is expanded when the trap runs, which keeps file names
    # containing quote characters intact
    temporary_outfile="$outfile"
    trap 'rm -f -- "$temporary_outfile"' EXIT
    cp "$input" "$outfile"
    outfile_is_regular_file=False
    log info "The output '$final_outfile' is not a regular file,
                    created temporary output file '$outfile'"
fi
#####


##### Normalize encodings and line terminators
# `iconv` takes care of the conversion between encodings for us. The
# converted text is written to a scratch file first and copied back
# afterwards: letting `iconv` read from and write to the same path is not
# reliably supported and can truncate the file. Copying back with `cat`
# (rather than `mv`) keeps the permissions of the output file.
converted_file=$(mktemp)
conversion_status=0
iconv --from-code="$encoding" --to-code="utf-8" "$outfile" \
            --output="$converted_file" \
    && cat "$converted_file" > "$outfile" \
    || conversion_status=$?
rm -f -- "$converted_file"  # remove the scratch file on success and failure
if [[ "$conversion_status" -ne 0 ]]; then
    exit "$conversion_status"
fi
log info "Converted encoding from '$encoding' to 'utf-8'"
# Next up, convert line terminators. This is achieved using `dos2unix`.
# It converts Windows style CRLF as well as old Mac style CR to Unix
# style LF (`mac2unix` exists too, but this is just a symlink to `dos2unix`)
# Its messages are captured and only displayed if it failed, so the file
# is never converted a second time just to show the error.
dos2unix_status=0
dos2unix_messages=$(dos2unix "$outfile" 2>&1) || dos2unix_status=$?
if [[ "$dos2unix_status" -ne 0 ]]; then
    printf '%s\n' "$dos2unix_messages" >&2
    exit "$dos2unix_status"
fi
log info "Ensured LF line terminators"
# If the file contains a BOM (byte order mark), `dos2unix` also removes this
# for us. Normally these should only be present in UTF-16 encoded files and
# consequently removed during conversion by `iconv`, but MacOS frequently inserts
# them in UTF-8 encoded files too.
if ! diff <(tail --bytes=1 "$outfile") <(echo) > /dev/null; then
    # Windows files do not end with a trailing newline. If this is missing,
    # we have to insert it manually with the command below. (`echo` without
    # any arguments prints only a newline, so `<(echo)` acts like a file
    # containing only that)
    echo >> "$outfile"
    log info "Appended trailing newline"
fi
# Lastly, convert comments using our helper script
python3 "$script_dir/convert-comments.py" \
            "$outfile" "$brackets" \
            "$left_bracket" "$whitespace_replacement"
log info "Converted comments to be BAS-compatible"
#####


##### Write to output (if not done so already)
# When the requested output was not a regular file, all processing happened
# on a temporary file. Hand its contents over to the real destination now
# and delete the temporary file afterwards.
case "$outfile_is_regular_file" in
    False)
        cp "$outfile" "$final_outfile"
        rm "$outfile"
        log info "Removed temporary output file '$outfile'
                    (copied contents to '$final_outfile' before removal)"
        ;;
esac
#####


# Reaching this line means every step above succeeded
log info "Successfully processed '$input'"
