#!/usr/bin/env bash

##### textEnhance
#     author: David Huss
#     email: david.huss@phonetik.uni-muenchen.de
#     For more information, consult README.md or run
#         `textEnhance --help`
#####

# TODO:
# use human readable output of `file`?
# add logfile cli argument?
# add noclobber option?
# format helpers with black
# the trap command doesn't work when the program aborts due
    # to dos2unix not being found. why is that?
# format errors and warnings


# Strict mode:
#   errexit  - exit immediately when encountering errors
#   nounset  - treat unset variables as errors
#   pipefail - don't ignore errors in any stage of a pipeline
set -o errexit -o nounset -o pipefail

##### Global variables
program=textEnhance
# `readlink -f` resolves symbolic links, so this gives us the true location
# of the script and its helpers even if we were started via a symbolic link.
# Everything is quoted so that installation paths containing whitespace or
# glob characters survive intact.
script_dir=$(dirname -- "$(readlink -f -- "$0")")
verbose=null  # will actually be defined by the user, but we have to include
              # it here, otherwise we cannot define the log function below
#####


function log {
    # usage: log info|warning|error <message>
    #
    # Globals read:
    #   program - name used as prefix in every message
    #   verbose - debug output is only printed if this is exactly 'True'
    #
    # `log info` prints debug information to
    # stdout, `log warning` prints a warning to
    # stderr, `log error` prints an error to
    # stderr and exits with exit code 1.
    # Any other level is silently ignored.
    local level=$1
    local message=$2
    case "$level" in
        info)
            # quoted on purpose: an empty or multi-word $verbose must simply
            # count as "not verbose" instead of breaking the test
            if [[ "$verbose" == 'True' ]]; then
                printf '%s\n' "DEBUG: $program: $message"
            fi
            ;;
        warning)
            # `>&2` duplicates the existing file descriptor; unlike
            # `>> /dev/stderr` it does not need to reopen the device
            printf '%s\n' "WARNING: $program: $message" >&2
            ;;
        error)
            printf '%s\n' "ERROR: $program: $message" >&2
            exit 1
            ;;
    esac
}


##### Check bash version
# BASH_VERSINFO holds the major and minor version as separate integers, so
# the comparison can be done numerically inside the shell, without relying
# on `echo -e` or the GNU-specific `sort -V`
if (( BASH_VERSINFO[0] < 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] < 4) )); then
    log error "This script requires bash version >= 4.4 (see README for details)"
fi
#####


##### Parse command line arguments
# We pass the arguments to parse-arguments.py, which returns them in a
# "bash-readable" format, i.e.
# ```
# input=file.text
# verbose=False
# ```
# etc, including a header that says '#sourcethis' (preventing us from
# accidentally parsing the help page or the version info)
parser_output=$(python3 "$script_dir/parse-arguments.py" "$@")
    # `"$@"` hands every command line argument to the parser as its own
    # word, exactly as we received it. Quoting "$script_dir" keeps helper
    # paths containing whitespace intact. If the parser exits with a
    # non-zero status, the script stops here because of `set -e`.
if [[ "${parser_output:0:11}" == '#sourcethis' ]]; then
    # only `eval` if the first 11 characters are '#sourcethis'
    # NOTE(review): `eval` executes whatever the helper printed, so this is
    # only safe as long as parse-arguments.py shell-quotes every value it
    # emits - confirm in the helper
    eval "$parser_output"
    log info "Parsed command line arguments"
    log info "Beginning processing of input file '$input'"
else
    # otherwise, the user must have requested the help page
    # or the version number, so print that and exit
    printf '%s\n' "$parser_output"
    exit
fi
#####


##### Input file checks
# Extensions of document formats we refuse to process. The file name is
# lowercased before matching, which makes the check case-insensitive.
document_extension_regex='\.(docx?|xlsx?|od[ts]|pdf)$'

# Every `log error` below terminates the script with exit code 1. The `--`
# in front of "$input" stops option parsing, so that file names starting
# with a dash are not mistaken for options.
if [[ ! -e "$input" ]]; then
    log error "The input file '$input' does not exist"
elif [[ -d "$input" ]]; then
    log error "The input file '$input' is a directory"
elif [[ ! -f "$input" ]]; then
    log error "The input file '$input' is not a regular file"
elif [[ ! -r "$input" ]]; then
    log error "There is no read permission for the input file '$input'"
elif [[ ! -s "$input" ]]; then
    log error "The input file '$input' is empty"
elif diff -- "$input" <(echo) > /dev/null; then
    # since `echo` without any arguments prints only a newline,
    # the `<(echo)` argument acts like a file containing only that
    log error "The input file '$input' contains only a trailing newline"
elif [[ "${input,,}" =~ $document_extension_regex ]]; then
    log error "Document formats (as used by Word, Excel etc.) are not supported"
else
    # `file` is only asked once we know that the input is neither empty
    # nor consists only of a trailing newline, because it reports `binary`
    # for empty input files
    encoding=$(file --brief --mime-encoding -- "$input")
    if [[ "$encoding" == 'binary' ]]; then
        log error "The input file '$input' appears to be a binary file"
    else
        # we should arrive here when the input is a completely normal file
        log info "Identified encoding as '$encoding'"
    fi
fi
#####


##### Create tempfile
# There are several reasons we would want to write to a tempfile
# before writing to the actual output - the output might be a
# non-regular file such as /dev/null which we can't operate very
# flexibly on. Additionally, programs such as `iconv` cannot handle
# modifying very large files (>32kB) in place.
tmpfile=$(mktemp)
# This ensures that the file will (almost) always be cleaned up if this
# script fails. The single quotes are intentional: "$tmpfile" is expanded
# when the trap fires, not when it is defined, so the command stays intact
# even if the path contains quotes or other special characters.
trap 'rm -f -- "$tmpfile"' EXIT
log info "Created temporary output file '$tmpfile'"
#####


##### Normalize encodings and line terminators
function run_quietly {
    # usage: run_quietly <command> [<argument>...]
    #
    # Runs the command exactly once with its stdout and stderr captured.
    # The captured output is printed to stderr only if the command failed.
    # Returns the exit code of the command, so under `set -e` a failure
    # aborts the script.
    local captured_output
    local exit_code=0
    captured_output=$("$@" 2>&1) || exit_code=$?
    if (( exit_code != 0 )); then
        printf '%s\n' "$captured_output" >&2
    fi
    return "$exit_code"
}

# `iconv` takes care of the conversion between encodings for us
if [[ "$encoding" != 'utf-8' && "$encoding" != 'us-ascii' ]]; then
    # conversion is only necessary for encodings other than
    # these two (US-ASCII is a subset of UTF-8)
    #
    # The list of known encodings is captured first instead of piping
    # `iconv --list` straight into `grep -q`: `grep -q` exits on the first
    # match, which can kill `iconv` with SIGPIPE, and with `pipefail` set
    # the whole pipeline would then count as failed despite the match.
    iconv_encodings=$(iconv --list)
    if grep -iwq -- "$encoding" <<< "$iconv_encodings"; then
        log warning "Converting encoding from '$encoding' to 'utf-8'"
        iconv --from-code="$encoding" --to-code="utf-8" "$input" --output="$tmpfile"
    else
        log error "The encoding '$encoding' (as determined by \`file\`)"\
" is not known to \`iconv\`"
    fi
else
    cp -- "$input" "$tmpfile"
fi

# Next up, convert line terminators. This is achieved using `dos2unix`.
# In its default mode it converts Windows style CRLF to Unix style LF only;
# old Mac style CR is converted in Mac mode (`-c mac`, which is also what
# the `mac2unix` symlink selects).
line_terminators=$(file --brief -- "$tmpfile")
line_terminator_regex='with CR(LF)? line terminators'
if [[ "$line_terminators" =~ $line_terminator_regex ]]; then
    log warning "Converting line terminators to LF (Unix style)"
fi
# If we already have LF line terminators, running dos2unix won't change anything.
# However, in case `file` determines them incorrectly, it's better to be safe
# than sorry. The output of `dos2unix` is only displayed if it failed.
run_quietly dos2unix "$tmpfile"
if [[ "$line_terminators" == *'with CR line terminators'* ]]; then
    # CR-only files are left untouched by the default mode above
    run_quietly dos2unix -c mac "$tmpfile"
fi
# If the file contains a BOM (byte order mark), `dos2unix` also removes this
# for us. Normally these should only be present in UTF-16 encoded files and
# consequently removed during conversion by `iconv`, but MacOS frequently inserts
# them in UTF-8 encoded files too.
if ! diff <(tail -c 1 "$tmpfile") <(echo) > /dev/null; then
    # The last byte is not a newline (typical for files created on Windows),
    # so we have to append one manually. `<(echo)` acts like a file that
    # contains nothing but a single newline.
    log warning "Appending trailing newline"
    echo >> "$tmpfile"
fi
# Lastly, convert comments using our helper script
python3 "$script_dir/convert-comments.py" \
            "$tmpfile" "$brackets" \
            "$left_bracket" "$whitespace_replacement"
log info "Ensured comments being BAS-compatible"
#####


##### Write to output
# `--` stops option parsing, so an output name starting with a dash is
# treated as a file name and not as an option
cp -- "$tmpfile" "$outfile"
rm -f -- "$tmpfile"
# The message is assembled from two adjacent strings so that it is printed
# on a single line, without an embedded newline and indentation
log info "Removed temporary output file '$tmpfile'"\
" (copied contents to '$outfile' before removal)"
#####


log info "Successfully processed '$input'"
