#!/usr/bin/env python3

###### mausbpf2exb
#   author: David Huss
#   email: david.huss@phonetik.uni-muenchen.de
#   requires: python version >= 3.4
#   For more information, consult README.md or run
#       `python3 mausbpf2exb.py --help`
######


# TODO:
# add debug information for parsing as well, not only for conversion
# overhaul echo function
# use textwrap to format error messages properly


import argparse
from os import path
import sys
import re
from collections import OrderedDict
from xml.etree.ElementTree import ElementTree, Element, fromstring, tostring
from xml.dom import minidom
import shutil
import subprocess
from textwrap import indent


class ParFile:
    """
    A class which represents/encapsulates files in the BAS Partitur Format.
    
    Constructor arguments:
        inputstr (str, optional): A string with the contents of a BPF file.
        If this argument is provided, the method parse() will automatically be called.
        An alternative way to instantiate the class is to use the class method fromFilepath() -
        this method expects a filename and will automatically retrieve the contents of the file and parse them.
    
    Important attributes:
        header: An ordered dictionary of the header elements.
        tiers: A list of ParTier instances belonging to this ParFile.
        link_times: A list of dictionaries, each with a 'start' and 'end' entry.
                    The index of these entries in the list is the symbolic link
                    whose times they contain. Only exists after
                    infer_link_times() has been called.
    """
    def __init__(self, inputstr = None):
        self.header = OrderedDict()
        self.raw_tiers = OrderedDict()
        self.tiers = []
        self.obligatory_header_elements = {"LHD", "REP", "SNB", "SAM",
                                           "SBF", "SSB", "NCH", "SPN"}
        self.optional_header_elements = {"FIL", "TYP", "DBN", "VOL",
                                         "DIR", "SRC", "BEG", "END",
                                         "RED", "RET", "RCC", "CMT",
                                         "SPI", "PCF", "PCN", "EXP",
                                         "SYS", "DAT", "SPA", "MAO",
                                         "GPO", "SAO"}
        self.all_possible_header_elements = self.obligatory_header_elements.union(self.optional_header_elements)
        if inputstr:
            self.parse(inputstr)
    
    def parse(self, inputstr):
        """
        Take the contents of a BPF file, parse it, and encapsulate it in the class's properties and methods.
        
        The header (everything before the 'LBD:' line) is stored in self.header,
        every following line is grouped by tier name and turned into ParTier
        objects in self.tiers. Problems are reported through echo().
        
        Arguments:
            inputstr (str): The contents of the BPF file.
        """
        if not re.match(r"[A-Z]{3}:\s", inputstr):
            # this is to check that the input is actually a BPF file.
            # it won't catch everything, but at least the files that
            # are very clearly something different.
            echo("The input does not seem to be a BPF file", type = "error")
        
        # a single shared iterator: the tier loop below continues exactly
        # where the header loop stopped
        lines = iter(inputstr.splitlines())
        bpf_line = re.compile("^([A-Z0-9]{3}):[ \t]*(.*)$")
            # this is a general regex for valid lines of a BPF file.
            # Its first group is the 3-letter tier or header element
            # name, its second group are the contents of the line
            # after the colon and whitespace separator
        
        # extract and parse header
        for line in lines:
            if line.startswith("LBD:"):
                # the LBD line signifies the end of the header
                break
            match = bpf_line.match(line)
            if match:
                label, value = match.groups()
                self.header[label] = value
                if not value and label != "LHD":
                    # LHD is the one header element that does not necessarily need any content
                    echo("The BPF header element '{}' has no content".format(label),
                         type = "warning")
            else:
                echo("Could not parse the following line:\n\t{}".format(line),
                     type = "error")
        
        # check if all obligatory header elements are present in header after parsing
        missing = self.obligatory_header_elements.difference(self.header)
        if missing:
            # sorted so that the message is deterministic
            echo("The BPF header is missing the following obligatory element(s): {}".format(
                 ", ".join(sorted(missing))),
                 type = "warning")
        # check that there are no unknown header elements
        unknown = set(self.header).difference(self.all_possible_header_elements)
        if unknown:
            echo("Detected one or more unknown BPF header elements, which will be discarded: {}".format(
                 ", ".join(sorted(unknown))),
                 type = "warning")
        
        # extract and parse tiers
        for line in lines:
            match = bpf_line.match(line)
            if match:
                line_tiername, line_values = match.groups()
                # create the entry for this tier on first sight, then collect
                self.raw_tiers.setdefault(line_tiername, []).append(line_values)
            else:
                echo("Could not parse the following line:\n\t{}".format(line),
                     type = "error")
        
        for tier_name in self.raw_tiers:
            try:
                # ParTier's constructor raises AssertionError for unknown tier names
                tier = ParTier(tier_name)
            except AssertionError:
                echo("The input contains the following unknown BPF tier, which will be discarded: {}".format(tier_name),
                     type = "warning")
            else:
                self.tiers.append(tier.parse(self.raw_tiers[tier_name]))
        
        del self.raw_tiers
    
    @classmethod
    def fromFilepath(cls, filepath):
        """
        Open a BPF file and pass its contents to the class constructor, returning an instance of the class.
        
        Arguments:
            filepath (str): The name of the BPF file.
        """
        with open(filepath) as file:
            return cls(file.read())
    
    def pop_header_element(self, label):
        """
        Pop an element from the BPF header and return its value.
        Raises KeyError if the element is not present.
        
        Arguments:
            label (str): The name of the header element.
        """
        return self.header.pop(label)
    
    def in_header(self, label):
        """
        Return a boolean indicating whether the specified element is present in the BPF header.
        
        Arguments:
            label (str): The name of the header element.
        """
        return label in self.header
    
    def samples_to_seconds(self, sample):
        """
        Convert samples to seconds in accordance with the BPF file's sampling rate.
        
        Arguments:
            sample (int): Number of samples.
        
        Returns (float): Number of seconds. If the header element 'SAM' is
            missing or empty, an error is reported through echo() instead.
        """
        sampling_rate = self.header.get("SAM", "")
        if sampling_rate != "":
            return sample / int(sampling_rate)
        echo("Could not convert samples to seconds because of missing or empty BPF header element 'SAM'",
             type = "error")
    
    def remove_tier(self, name):
        """
        Remove every BPF tier with the given name from the BPF file object.
        
        Arguments:
            name (str): The name of the tier.
        """
        # rebuild in place instead of calling remove() while iterating, which
        # would skip the element following each removed one
        self.tiers[:] = [tier for tier in self.tiers if tier.name != name]
    
    def get_tier_object(self, name):
        """
        Return a BPF tier object (which belongs to the BPF file object),
        or None if there is no tier with that name.
        
        Arguments:
            name (str): The name of the tier.
        """
        for tier in self.tiers:
            if tier.name == name:
                return tier
        return None
    
    def tier_exists(self, name):
        """
        Return a boolean indicating whether the specified tier is present in the input.
        
        Arguments:
            name (str): The name of the tier.
        """
        return any(tier.name == name for tier in self.tiers)
    
    @staticmethod
    def get_reference_tier_hierarchy():
        """
        Return the 'hierarchy' of reference tiers used for symbolic link time inference.
        For more information, run the program with the option --list-reference-tiers.
        """
        return ["MAU", "SAP", "WOR", "PHO", "MAS"]
    
    def get_reference_tier(self):
        """
        Return the name of the tier which will be used as the reference tier
        for the symbolic link time inference.
        
        Raises:
            AssertionError: if none of the tiers in the hierarchy is present
                and valid.
        """
        for reference_tier in ParFile.get_reference_tier_hierarchy():
            tier = self.get_tier_object(reference_tier)
            if tier is not None and tier.is_valid_reference_tier():
                return reference_tier
        # raised explicitly (rather than via `assert False`) so that it also
        # fires when Python runs with -O
        raise AssertionError("no valid reference tiers")
    
    def get_tiers_requiring_link_inference(self):
        """
        Return a list of all the ParTier objects belonging to this ParFile instance
        which require symbolic link time inference (tier classes 1 and 5).
        """
        return [tier for tier in self.tiers if tier.tier_class in (1, 5)]
    
    def infer_link_times(self):
        """
        Create a list `self.link_times` which contains the start and end time
        for each symbolic link present in the BPF file. Each element of the
        list is a dictionary with a 'start' and 'end' entry, measuring the time
        in samples of the time in question. The index of the item within the list
        is the number of the symbolic link it describes. Links which the
        reference tier never refers to keep None for both entries.
        """
        reference_tier = self.get_tier_object(self.get_reference_tier())
        self.link_times = []
        for item in reference_tier.sorted_tier_items():
            # iterate over a sorted list of all the tier items of the reference tier
            if item["linktype"] != "single" or item["link"] < 0:
                # BPF typically uses the link -1 for when no actual link is
                # applicable for the current item
                # also, only single links (no comma- or semicolon-separated lists
                # of links) can tell us something reliably about the time of a
                # symbolic link
                continue
            i = item["link"]  # index/number of the link
            start = item["start"]
            end = item["start"] + item["duration"]
            while len(self.link_times) <= i:
                # make sure there is a slot for this link, even if the link
                # numbering does not start at 0
                self.link_times.append({"start": None, "end": None})
            link = self.link_times[i]  # position in the list == number of the link
            if link["start"] is None or start < link["start"]:
                # only record if we don't have an entry for the start of this
                # link yet or if the current start time is lower than anything
                # we've had before
                link["start"] = start
            if link["end"] is None or end > link["end"]:
                # only record if we don't have an entry for the end of this
                # link yet or if the current end time is higher than anything
                # we've had before
                link["end"] = end
    
    def get_link_times(self, linkindex):
        """
        Return a tuple consisting of the start time and end time (in samples) of
        the given symbolic link, inferred using the reference tier.
        Requires infer_link_times() to have been called beforehand.
        
        Arguments:
            linkindex (int): the number of the symbolic link.
        """
        link = self.link_times[linkindex]
        return (link["start"], link["end"])


class ParTier:
    """
    A class which represents/encapsulates tiers of a BPF file.
    
    Constructor arguments:
        name (str): The name of the tier. An AssertionError is raised if the
            name is not a known BPF tier (this is caught in ParFile.parse).
    
    Important attributes and properties:
        name: A string with the name of the tier.
        tier_items: A list of dictionaries containing the items of the tier.
        tier_class: The BPF tier class.
        tier_classes: A dictionary mapping every known tier name to its class.
    """
    # every known tier name and the BPF tier class (1-5) it belongs to
    tier_classes = {"KAN": 1, "KSS": 1, "MRP": 1, "KAS": 1, "PTR": 1, "ORT": 1,
                    "TRL": 1, "TR2": 1, "TRO": 1, "SUP": 1, "DAS": 1, "PRS": 1,
                    "NOI": 1, "PRO": 1, "SYN": 1, "FUN": 1, "LEX": 1, "POS": 1,
                    "LMA": 1, "TRS": 1, "TLN": 1, "TRW": 1, "SPK": 1, "IPA": 2,
                    "GES": 2, "USH": 2, "USM": 2, "OCC": 2, "SPD": 2, "VAD": 2,
                    "LBP": 3, "LBG": 3, "PRM": 3, "PHO": 4, "SAP": 4, "MAU": 4,
                    "WOR": 4, "TRN": 4, "USP": 4, "MAS": 4, "PRB": 5}
    # number of components (time, symbolic link, content, ...) a line of
    # each tier class has to consist of
    _component_counts = {1: 2,
                         2: 3,
                         3: 2,
                         4: 4,
                         5: 3}
    
    def __init__(self, name):
        self.name = name
        self.tier_items = []
        if self.name not in self.tier_classes:
            # this exception will be caught in ParFile.parse. It is raised
            # explicitly because an `assert` statement would be stripped
            # when Python runs with -O
            raise AssertionError("unknown tier")
    
    def parse(self, content):
        """
        Parse the raw text contents of a BPF tier (as taken from the file) into a list of dictionaries,
        where each entry represents one item of the tier.
        
        Arguments:
            content: a list, representing for example a BPF tier like this:
                `ORT:    0    das
                 ORT:    1    Dunkel
                 ORT:    2    war`
            and passed to this function as:
                 ["0    das",
                  "1    Dunkel",
                  "2    war"]
            becomes:
                 [{"link": 0, "linktype": "single", "content": "das"},
                  {"link": 1, "linktype": "single", "content": "Dunkel"},
                  {"link": 2, "linktype": "single", "content": "war"}]
        
        Depending on the tier class, items additionally carry the integer
        entries "start" and "duration" (classes 2 and 4) or "time" (classes
        3 and 5); only classes 1, 4 and 5 carry "link" and "linktype".
        
        Returns (ParTier): itself.
        """
        sep = re.compile("[ \t]+")  # regex that matches the separator between intra-tier values, such as "108885    2056    1    d_s"
        tier_class = self.tier_class
        component_count = self.component_count
        for item in content:
            if len(sep.split(item)) < component_count:
                # the line does not contain all the elements we need
                echo("The following line in tier '{}' does not contain {} components as expected:\n\t{}".format(
                     self.name, component_count, item),
                     type = "error")
                # should echo() return, skip the line: the unpacking below
                # could not succeed for it anyway
                continue
            new_tier_item = {}
            # The reason we're operating with split here instead of simply indexing the items
            # is that we're "deconstructing" each tier item. first we remove those elements
            # which are specific to the items of certain tier classes, then we remove those
            # which are shared by all
            if tier_class in (2, 4):
                start, duration, item = sep.split(item, maxsplit = 2)
                new_tier_item["start"] = int(start)
                new_tier_item["duration"] = int(duration)
            if tier_class in (3, 5):
                time, item = sep.split(item, maxsplit = 1)
                new_tier_item["time"] = int(time)
            if tier_class in (1, 4, 5):
                link, item = sep.split(item, maxsplit = 1)
                if "," in link:
                    link = [int(num) for num in link.split(",")]
                    linktype = "multiple"
                elif ";" in link:
                    link = [int(num) for num in link.split(";")]
                    linktype = "between"
                else:
                    link = int(link)
                    linktype = "single"
                new_tier_item["link"] = link
                new_tier_item["linktype"] = linktype
            new_tier_item["content"] = item
            self.tier_items.append(new_tier_item)
        return self
    
    def sorted_tier_items(self):
        """
        Return a list of this tier's items, sorted by their symbolic link index.
        Items referring to several links (link types "multiple" and "between")
        are sorted by the first link of their list.
        The original list will be unmodified; the returned list is a new list,
        but it contains the very same item dictionaries (shallow copy).
        If used with a tier whose items have no symbolic link (i.e. a class 2
        or class 3 tier), this raises a KeyError.
        """
        def first_link(item):
            link = item["link"]
            # lists and ints cannot be compared with each other, so reduce
            # link lists to their first member
            return link[0] if isinstance(link, list) else link
        return sorted(self.tier_items, key = first_link)
    
    @property
    def tier_class(self):
        """
        Return the BPF tier class.
        """
        return self.tier_classes[self.name]
    
    @property
    def component_count(self):
        """
        Return the number of components (time, symbolic link, etc) per line
        for this type of tier.
        """
        return self._component_counts[self.tier_class]
    
    def is_valid_reference_tier(self):
        """
        Return a boolean indicating whether this tier is a valid reference tier.
        This will return `True` if the tier is one of the five eligible reference
        tiers, and if it includes a reference to every symbolic link in "pure"
        form, i.e. not as part of a comma- or semicolon-separated list of links.
        A tier without any "pure" link is never a valid reference tier.
        """
        links = {item["link"] for item in self.tier_items
                 if item.get("linktype") == "single"}
            # construct a set of all "pure" links present in this tier
        if not links:
            # nothing to infer link times from (also keeps min()/max() below
            # from failing on an empty set)
            return False
        if self.name not in ParFile.get_reference_tier_hierarchy():
            return False
        link_range = range(min(links), max(links) + 1)
            # these are all the links that *should* be present, since BPF
            # files do not allow links to be skipped (e.g. [1, 2, 3, 5])
        return links.issuperset(link_range)


class ExbFile:
    """
    A class which represents/encapsulates files in the EXMARaLDA Partitur-Editor format (.exb).
    
    Important attributes:
        timeline: A list which is initialised empty. None of the methods of
                  this class fill it; the <tli> elements are appended directly
                  to the <common-timeline> element of `tree`.
        tiers: A list of ExbTier objects belonging to this file.
        times: A sorted list of all distinct event start/end times, filled by
               construct_common_timeline().
        template: A string providing the bare-bones xml structure of an .exb file.
        tree: The ElementTree built from `template`, which all other methods modify.
    """
    def __init__(self):
        self.timeline = []
        self.tiers = []
        self.times = []
        self.template = ('<basic-transcription><head><meta-information><project-name>'
                         '</project-name><transcription-name></transcription-name>'
                         '<referenced-file url=""/><ud-meta-information>'
                         '</ud-meta-information><comment></comment><transcription-convention>'
                         '</transcription-convention></meta-information><speakertable>'
                         '</speakertable></head><basic-body><common-timeline>'
                         '</common-timeline></basic-body></basic-transcription>')
        self.tree = ElementTree(fromstring(self.template))
    
    def set_header_element(self, element, content):
        """
        Set the content of an element in <head> section of the .exb file object.
        
        Arguments:
            element (str): The name of the header element.
            content (str): The text to be inserted between the opening and closing tag of the header element.
        """
        self.tree.find("./head//" + element).text = content
    
    def set_header_attribute(self, element, attribute, value):
        """
        Set an attribute of an element in the <head> section of the .exb file object.
        
        Arguments:
            element (str): The name of the header element.
            attribute (str): The name of the element's attribute to be modified.
            value (str): The value of the attribute.
        """
        self.tree.find("./head//" + element).set(attribute, value)
    
    def add_meta_information(self, name, content):
        """
        Add a <ud-information> element as child of the <ud-meta-information> element in the <head> section of the .exb file object.
        
        Arguments:
            name (str): The attribute name of the piece of meta information.
            content (str): The content of the piece of meta information, which will be inserted between the opening and closing tag.
        """
        information_element = Element("ud-information", {"attribute-name": name})
        information_element.text = content
        self.tree.find("./head/meta-information/ud-meta-information").append(information_element)
    
    def add_speaker(self, speaker_id, abbreviation = ""):
        """
        Add a <speaker> element as a child of the <speakertable> element in the
        header.
        
        Arguments:
            speaker_id (str): The 'id' attribute of the <speaker> element.
            abbreviation (str): The <abbreviation> element, which is a subelement
                of the <speaker> element. Its content is normally the same as the
                'id' attribute mentioned above.
        """
        speaker_element = Element("speaker", {"id": speaker_id})
        abbreviation_element = Element("abbreviation")
        abbreviation_element.text = abbreviation
        speaker_element.extend([  # add subelements
            abbreviation_element,
            Element("sex", {"value": "u"}),
                # 'unknown' or 'undefined' since this information is not provided in a .par file
            Element("languages-used"),
            Element("l1"),
            Element("l2"),
            Element("ud-speaker-information"),
            Element("comment")
        ])
        self.tree.find("./head/speakertable").append(speaker_element)
    
    def add_tier(self, *args, **kwargs):
        """
        A sort of wrapper function which instantiates the ExbTier class and adds a reference to said instance to the list self.tiers.
        Globally, the ExbTier class should generally not need to be instantiated through any means other than this function.
        
        Arguments:
            Same as for ExbTier's constructor (see that class's docstring).
        
        Returns (ExbTier): the newly created tier.
        """
        new_tier = ExbTier(*args, **kwargs)
        self.tiers.append(new_tier)
        return new_tier
    
    def remove_tier(self, tier):
        """
        Remove an ExbTier object belonging to the ExbFile object.
        
        Arguments:
            tier (ExbTier): A reference to the tier instance.
        """
        self.tiers.remove(tier)
    
    def construct_common_timeline(self):
        """
        Fill the <common-timeline> element of the .exb output with <tli> elements
        and give every event of every tier the IDs of the <tli> elements which
        correspond to its start and end time.
        """
        common_timeline = self.tree.find("./basic-body/common-timeline")
        # drop the <tli> elements of a previous run, so that calling this
        # method (or generate()) more than once does not duplicate them
        for stale in list(common_timeline):
            common_timeline.remove(stale)
        # first step: collect ALL start and end times present in the ExbTier
        # objects; using a set removes duplicate values
        times = set(self.times)
        for tier in self.tiers:
            for event in tier.events:
                times.add(event["start_time"])
                times.add(event["end_time"])
        self.times = sorted(times)
        # second step: create the actual <tli> elements and remember which ID
        # belongs to which timestamp
        ids = {}
        for i, time in enumerate(self.times):
            ID = "T%d" % i
            ids[time] = ID
            common_timeline.append(Element("tli", {
                "id": ID,
                "time": "{:.8f}".format(time)
                    # we could just be using str(time), but this sometimes
                    # generates scientific notation. With the format option
                    # we can avoid this, and also ensure it uses precisely
                    # eight digits after the decimal point.
            }))
        # third step: provide each event with a start|end_ID in addition to
        # its start|end_time, which contains the ID of the <tli> element that
        # is equivalent to the timestamp
        for tier in self.tiers:
            for event in tier.events:
                event["start_ID"] = ids[event["start_time"]]
                event["end_ID"] = ids[event["end_time"]]
    
    def construct_tiers(self):
        """
        Fill the <basic-body> element of the .exb output with <tier> elements,
        and fill each of those <tier> elements with <event> elements.
        Expects construct_common_timeline() to have been called beforehand,
        since that is where the events receive their start and end IDs.
        """
        basic_body = self.tree.find("./basic-body")
        # drop the <tier> elements of a previous run (see construct_common_timeline)
        for stale in basic_body.findall("tier"):
            basic_body.remove(stale)
        for i, tier in enumerate(self.tiers):
            # first step: create all the <tier> elements
            attributes = {"id": "TIE%d" % i}
            if tier.speaker is not None:
                # a tier without speaker simply gets no 'speaker' attribute;
                # passing None on would make the XML serialization fail
                attributes["speaker"] = tier.speaker
            attributes["category"] = tier.category
            attributes["display-name"] = tier.display_name
            attributes["type"] = tier.tier_type
            tier_element = Element("tier", attributes)
            basic_body.append(tier_element)
            # second step: create all the <event> elements of the current <tier> element
            for event in tier.events:
                event_element = Element("event", {
                    "start": event["start_ID"],
                    "end": event["end_ID"],
                })
                event_element.text = event["content"]
                tier_element.append(event_element)
    
    def generate(self):
        """
        Generate the XML output of the current state of the ExbFile object.
        
        Returns (str): The output string, ready for writing to file.
        """
        self.construct_common_timeline()
        self.construct_tiers()
        outputstr = minidom.parseString(
            tostring(self.tree.getroot())
        ).toprettyxml()  # this just prettifies the output
        return outputstr


class ExbTier:
    """
    Container for a single tier of an .exb file.
    
    Constructor arguments:
        # Each argument is stored unchanged as an attribute of the same name
        # and ends up as an XML attribute of the <tier> element.
        category (str): The category of the tier.
        tier_type (str): The type of the tier.
        display_name (str): The name under which the tier is shown in the EXMARaLDA GUI.
        speaker (str): The ID of the speaker the tier belongs to.
    
    Attributes:
        events: A list of dictionaries, one per event of the tier, in the
                order in which they were added.
        Also, see the constructor arguments described above.
    """
    def __init__(self, category, tier_type, display_name = "", speaker = None):
        self.events = []
        self.speaker = speaker
        self.display_name = display_name
        self.tier_type = tier_type
        self.category = category
    
    def add_event(self, start_time, end_time, content):
        """
        Append a new event to the tier.
        
        The event's "start_ID" and "end_ID" entries are created as None;
        ExbFile.construct_common_timeline fills them in later.
        
        Arguments:
            start_time (int, float): The start time of the event in seconds.
            end_time (int, float): The end time of the event in seconds.
            content (str): The text to be inserted between the opening and closing tag of the event element.
        """
        new_event = dict(start_ID = None, end_ID = None)
        new_event["start_time"] = start_time
        new_event["end_time"] = end_time
        new_event["content"] = content
        self.events.append(new_event)


def par_to_exb(par, exb, referenced_file = None):
    """
    Convert a ParFile object's attributes to an ExbFile object's attributes.
    
    The header is converted first: a handful of BPF header elements are mapped
    to dedicated .exb header elements, all remaining non-empty ones are
    appended as user-defined meta information. Afterwards, every tier of the
    ParFile object is converted to a tier of the ExbFile object.
    
    Arguments:
        par (ParFile): the input ParFile object. Header elements which are
            converted to dedicated .exb elements are popped from its header.
        exb (ExbFile): the output ExbFile object.
        referenced_file (str, optional): the name of the audio file which the BPF file annotates.
            If provided, it takes precedence over the 'SRC' header element.
    
    Notes:
        This function reads the module-level variable `args` (for the
        verbosity level) and reports its progress via echo(). If symbolic link
        time inference is required but fails, echo() is called with
        type = "error", which terminates the program.
    """
    
    ###### CONVERSION OF HEADER
    echo("--- Starting header conversion ---\n")
    
    echo("Setting header attributes")
    if par.in_header("DBN"):
        project_name = par.pop_header_element("DBN")
        # the reason we're popping elements from the par.header dictionary instead of
        # simply reading them is that we're trying to "dismantle" it, so to speak.
        # once a handful of "special" header elements (the ones forming part of this
        # and the following if statements) have been handled, the other ones will be
        # inserted into the .exb file hierarchy using exb.add_meta_information
        exb.set_header_element("project-name", project_name)
        echo("Converted header elements: 'DBN' --> <project-name>")
    
    # a file name passed by the caller takes precedence over the 'SRC' header
    # element (which, in that case, stays in the header and ends up in the
    # user-defined meta information further below)
    if referenced_file:
        exb.set_header_attribute("referenced-file", "url", referenced_file)
        echo("Inserted header element passed via command line parameter: -r --> <referenced-file>")
    elif par.in_header("SRC"):
        referenced_file = par.pop_header_element("SRC")
        exb.set_header_attribute("referenced-file", "url", referenced_file)
        echo("Converted header elements: 'SRC' --> <referenced-file>")
    
    if par.in_header("CMT"):
        comment = par.pop_header_element("CMT")
        exb.set_header_element("comment", comment)
        echo("Converted header elements: 'CMT' --> <comment>")
    
    if par.in_header("SPN"):
        speaker_id = par.pop_header_element("SPN")
        exb.add_speaker(speaker_id, abbreviation = speaker_id)
    else:
        # the SPN entry is obligatory for BPF headers, but nonetheless there are many files without it.
        # the default value for such cases is 'DefaultSpeaker'
        speaker_id = "DefaultSpeaker"
        exb.add_speaker(speaker_id, abbreviation = speaker_id)
        echo("Obligatory BPF header element 'SPN' (speaker ID) missing from input - resorting to default value 'DefaultSpeaker'", type = "warning")
    echo("Added speaker with ID '{}' to <speakertable>".format(speaker_id))
    
    if par.in_header("SYS"):
        transcription_convention = par.pop_header_element("SYS")
        exb.set_header_element("transcription-convention", transcription_convention)
        echo("Converted header elements: 'SYS' --> <transcription-convention>")
    
    echo("Appending remaining header elements to <ud-meta-information>")
    for element, content in par.header.items():
        # iterate over all remaining BPF header elements and add them
        # to the .exb file as part of the user-defined meta information
        # (provided they actually have content)
        if content:
            exb.add_meta_information(element, content)
            echo("Appended header element '{}' as a <ud-information> element to <ud-meta-information>".format(element), required_verbosity = 2)
    
    if args.verbosity > 0:
        print()  # because we need an empty line here
    echo("--- Finished header conversion ---\n")
    ######
    
    ###### CONVERSION OF TIERS
    echo("--- Starting tier conversion ---\n")
    ###### perform symbolic link time inference if necessary
    if par.get_tiers_requiring_link_inference():
        try:
            par.infer_link_times()
            echo("Successfully performed symbolic link time inference using the reference tier '{}'".format(par.get_reference_tier()),
                 required_verbosity = 1)
        except AssertionError:
            # echo() with type = "error" terminates the program
            echo("No valid reference tiers could be found - exiting...\n\tIf the input contains any class 1 or class 5 tiers, the program will cycle through a hierarchy of selected class 4 tiers (in the order MAU->SAP->WOR->PHO->MAS) and use the first one it finds to infer the times of the symbolic links. If there is not at least one of these tiers present which contains a reference to every symbolic link (and provided the reference is a singular link, not a comma or semicolon-separated list of links), the file cannot be converted.\n\tIf you would instead like to discard all class 1 and class 5 tiers in your file, you can append the option '--ignore {}'".format(
                 ",".join([tier.name for tier in par.get_tiers_requiring_link_inference()])),
                 type = "error")
    ######
    
    for par_tier in par.tiers:
        ###### instantiate new ExbTier object, which we'll be working on for the current tier
        exb_tier = exb.add_tier(category = par_tier.name, tier_type = "t", display_name = par_tier.name, speaker = speaker_id)
        echo("Starting conversion of tier '{}'".format(par_tier.name),
             "Starting conversion of tier '{}' (class {})".format(par_tier.name, par_tier.tier_class),
             type = "debug_multiple")
        discarded_item_warning = False
            # this variable is there so that the warning for when the program
            # encounters an item of duration <= 0 is only given once per tier
        ######
        
        ###### process (i.e. discard) class 3 tiers
        if par_tier.tier_class == 3:
            # discard the current tier and continue with the next one
            echo("Class 3 tiers are currently not supported. Skipping tier '{}'".format(par_tier.name), type = "warning")
            exb.remove_tier(exb_tier)
            continue
        ######
        
        ###### prepare the processing of class 1 and class 5 tiers
        elif par_tier.tier_class in (1, 5):
            inference_necessary = True  # this variable tells us whether this
                                        # tier has its own time information
                                        # or whether we will have to rely on
                                        # the inferred symbolic link times
            exb_tier.tier_type = "a"
            echo("The tier '{}' is a class {} tier - will refer to symbolic link time inference".format(par_tier.name, par_tier.tier_class),
                 "The tier '{}' is a class {} tier - it is necessary to infer the time of the symbolic links, which will be accomplished by making use of a selection of class 4 tiers. See --list-reference-tiers for more information".format(par_tier.name, par_tier.tier_class),
                 type = "debug_multiple")
        ######
        
        ###### prepare the processing of class 2 and class 4 tiers
        elif par_tier.tier_class in (2, 4):
            inference_necessary = False
            if par.get_tiers_requiring_link_inference() and par_tier.name != par.get_reference_tier():
                additional_speaker_id = speaker_id + "-" + par_tier.name
                exb.add_speaker(additional_speaker_id, abbreviation = additional_speaker_id)
                exb_tier.speaker = additional_speaker_id
                echo("Correction for header conversion: added speaker with ID '{}' to <speakertable>".format(additional_speaker_id),
                     "Correction for header conversion: EXMARaLDA allows only one tier of type = 't' per speaker. Since the file's main speaker ID '{}' is already in use for the tier '{}', we will add an additional speaker with ID '{}' to <speakertable>".format(speaker_id, par.get_reference_tier(), additional_speaker_id),
                     type = "debug_multiple")
        ######
        
        ###### discard tiers of any other class
        else:
            # without this branch, inference_necessary would either be unbound
            # (NameError) or still hold the value set for the previous tier
            echo("The tier '{}' has the unknown class {}. Skipping tier '{}'".format(par_tier.name, repr(par_tier.tier_class), par_tier.name), type = "warning")
            exb.remove_tier(exb_tier)
            continue
        ######
        
        ###### iterate over tier items and add each to the current ExbTier object
        for i, tier_item in enumerate(par_tier.tier_items):
            if inference_necessary:  # if this is a class 1 or class 5 tier, infer link times
                if tier_item["linktype"] == "single":
                    # simplest case - a single link
                    start, end = par.get_link_times(tier_item["link"])
                    echo("Retrieving inferred time of link '{}'".format(tier_item["link"]), required_verbosity = 2)
                elif tier_item["linktype"] == "multiple":
                    # if it's a comma-separated list of links, use the lowest and highest
                    # to infer start time and end time, respectively
                    start, _ = par.get_link_times(min(tier_item["link"]))
                    _, end = par.get_link_times(max(tier_item["link"]))
                    echo("Retrieving inferred time of link '{}'".format(tier_item["link"]), required_verbosity = 2)
                elif tier_item["linktype"] == "between":
                    # discard the current tier and continue with the next one
                    echo("Cannot handle semicolon-separated symbolic links (such as '2;3'). Skipping tier '{}'".format(par_tier.name), type = "warning")
                    exb.remove_tier(exb_tier)
                    break
            else:
                # if it's a class 2 or class 4 tier, simply use the time
                # present in the original item
                start = tier_item["start"]
                end = start + tier_item["duration"]
            
            ###### resolve potential time issues
            if end <= start:
                # if the item has a duration of 0 samples
                # or lower (yes, that does happen),
                # discard it
                if not discarded_item_warning:
                    echo("The tier '{}' contains one or more items with a duration of zero or negative. These items will be discarded".format(par_tier.name), type = "warning")
                    discarded_item_warning = True
                continue
            # this next bit requires some explanation. BPF files measure time in
            # samples, and they normally expect the entire duration of the audio
            # file which it accompanies to be "covered" by the tiers. That is to
            # say: every sample should be annotated by at least one tier.
            # However, since samples, as opposed to most units for measuring time,
            # are discrete, the start time of a given item should not be equal to
            # the end time of the previous item, but rather the end time of the
            # previous item PLUS ONE. When then converting these time measurements
            # in samples to seconds, this can result in a problem: if, for example,
            # item A has the end time 3099 [samples], and item B has the start time
            # 3100 [samples], simply converting the times to seconds with respect
            # to the referenced audio file's sample rate would result in a gap with
            # the length of one sample in between. The simplest workaround for this
            # (and also the most plausible with respect to the physics behind it)
            # is to add half a sample to each end time and to subtract half a
            # sample from each start time (unless the start time is below one
            # sample, since we can't have a negative start time).
            # (This solution was pointed out by Florian - a lesson to talk to
            # others about a problem you're not getting on with, because the
            # solution might be obvious to them while it isn't to you.)
            echo("Padding start and end time of the current item with half a sample",
                 required_verbosity = 2)
            if start >= 1:
                start -= 0.5
            end += 0.5
            ######
            
            # finally, convert samples to seconds and append the event to the ExbTier object
            start, end = par.samples_to_seconds(start), par.samples_to_seconds(end)
            exb_tier.add_event(start, end, tier_item["content"])
            echo("Converted BPF tier item {} of index {} to .exb tier event".format(repr(tier_item["content"]), i), required_verbosity = 2)
        else:
            # this branch is only reached if the loop above was NOT left via
            # 'break', i.e. if the tier has not been removed again because of a
            # semicolon-separated link - only then has the tier really been added
            echo("Added tier '{}'\n".format(par_tier.name))
        ######
    
    echo("--- Finished tier conversion ---\n")
    ######


def echo(*messages, type = "debug", required_verbosity = 1):
    """
    Display debug information, a warning, or an error to the user.
    (to the console - this function does not raise python errors!)
    
    Debug information is printed to STDOUT, warnings are printed to STDERR,
    and errors terminate the program via sys.exit() with the message as the
    exit status (i.e. this function does not return for type = 'error').
    The verbosity level is read from the module-level variable `args`.
    
    Arguments:
        messages (arbitrary number of str): The strings to be displayed. For all values of type except 'debug_multiple',
            the strings will be joined with spaces. If the value of type is 'debug_multiple', then one message
            should be provided for each verbosity level greater than zero, in ascending order (nothing is displayed
            for verbosity = 0). If the verbosity level exceeds the number of messages, the last (most detailed)
            message is displayed. Example usage:
            `echo('message for verbosity = 1', 'message for verbosity = 2', type = 'debug_multiple')`
        type (str): either 'debug', 'debug_multiple', 'warning', or 'error'. Any other value is silently ignored.
        required_verbosity (int): if the value of type is 'debug', then this is the minimum required verbosity
            necessary for the message(s) to be displayed.
    """
    if type == "debug":
        if args.verbosity >= required_verbosity:
            print("DEBUG: mausbpf2exb:", *messages)
    elif type == "debug_multiple":
        if args.verbosity > 0 and messages:
            # the verbosity level is not restricted by the argument parser, so
            # it may be higher than the number of messages provided - in that
            # case, fall back to the last message instead of raising an IndexError
            message_index = min(args.verbosity, len(messages)) - 1
            print("DEBUG: mausbpf2exb:", messages[message_index])
    elif type == "warning":
        print("WARNING: mausbpf2exb:", *messages, file = sys.stderr)
    elif type == "error":
        sys.exit("ERROR: mausbpf2exb: " + " ".join(messages))


if __name__ == "__main__":
    ###### this block implements the parser as well as the help page etc
    parser = argparse.ArgumentParser(prog = "mausbpf2exb", description = "This program converts speech annotation files in the BAS Partitur Format (BPF) to files for the Partitur-Editor of the EXMARaLDA speech software suite.\nFor more detailed information, consult README.md.")
    parser.add_argument("input", nargs = "?", help = "input file (.par, .bpf). If not specified, STDIN will be used")
    parser.add_argument("-o", "--output", help = "output file (.exb). If not specified, STDOUT will be used")
    parser.add_argument("-v", "--verbosity", type = int, default = 0, help = "verbosity level of debug information on a scale of 0 to 2 (default = 0, meaning only errors and warnings, but no debug information)")
    parser.add_argument("-r", "--referenced-file", help = "name of referenced audio file")
    parser.add_argument("--ignore", dest = "ignored_tier", help = "name (or comma-separated list of names) of BPF tier(s) to be ignored during conversion")
    parser.add_argument("--version", action = "version", version = "1.0")
    parser.add_argument("--list-reference-tiers", action = "store_true", help = "print the hierarchy of class 4 tiers used to infer times of symbolic links and exit")
    # args is deliberately a module-level variable: echo() and par_to_exb()
    # read the verbosity level from it
    args = parser.parse_args()
    ######
    
    ###### this block is for when the user passes the --list-reference-tiers parameter
    if args.list_reference_tiers:
        for tier in ParFile.get_reference_tier_hierarchy():
            print(tier)
        # the explanation goes to STDERR so that the list on STDOUT can be piped
        print("If the input contains any class 1 or class 5 tiers (which it usually does), the program will cycle through a hierarchy of selected class 4 tiers and use the first one it finds to infer the times of the symbolic links. If there is not at least one of these tiers present which contains a reference to every symbolic link (and provided the reference is a singular link, not a comma or semicolon-separated list of links), the file cannot be converted. However, you can instead choose to discard all class 1 and class 5 tiers; for this, run the program once with your input, the resulting error message will tell you exactly which option you will have to use to do so. (This annotation has been printed to stderr, so you can still safely pipe the above list)",
              file = sys.stderr)
        sys.exit()
    ######
    
    ###### this block takes care of the actual conversion
    # if an input file name has been provided, use that, otherwise, use STDIN
    if args.input:
        if path.exists(args.input):
            parFile = ParFile.fromFilepath(args.input)
        else:
            # echo() with type = "error" terminates the program
            echo("The input file '{}' could not be found.".format(args.input), type = "error")
    else:
        parFile = ParFile(sys.stdin.read())
    exbFile = ExbFile()
    # remove tier(s) that the user decided to ignore
    if args.ignored_tier:
        ignored_tiers = args.ignored_tier.split(",")
        for ignored_tier in ignored_tiers:
            if parFile.tier_exists(ignored_tier):
                echo("Discarding tier '{}'".format(ignored_tier),
                     "Discarding tier '{}' as specified with the --ignore paramter".format(ignored_tier),
                     type = "debug_multiple")
                parFile.remove_tier(ignored_tier)
            else:
                echo("The tier you have chosen to ignore ('{}') is not present in the input anyway".format(
                     ignored_tier),
                     type = "warning")
        if args.verbosity > 0:
            # print an empty line for visual separation after the debug information about ignored tiers.
            # this must not happen when no debug information is printed: the empty line
            # goes to STDOUT, where it would end up in front of the XML declaration
            # whenever the output is written to STDOUT as well
            print()
    par_to_exb(parFile, exbFile, args.referenced_file)
    echo("Generating and formatting ouput")
    outputstr = exbFile.generate()
    ######
    
    ######
    # this block will attempt to use the program xmllint (pre-installed on many Unix systems)
    # to check whether the output complies with the DTD (document type definition) for
    # EXMARaLDA basic transcriptions, as specified in basic-transcription.dtd
    # (if you don't have it, you can download it here: https://www.exmaralda.org/files/basic-transcription.dtd)
    if shutil.which("xmllint") is not None:  # check if xmllint is installed
        scriptdir = path.dirname(path.realpath(__file__))  # location of mausbpf2exb.py
        dtdpath = path.join(scriptdir, "basic-transcription.dtd")
        if path.isfile(dtdpath):  # check if basic-transcription.dtd exists
            try:
                subprocess.run(
                    # this runs the shell command responsible for checking the output against the DTD
                    ["xmllint", "--dtdvalid", "basic-transcription.dtd", "--noout", "-"], # the last argument (dash) is xmllint's placeholder for stdin
                    input = outputstr,  # pass our output to stdin
                    stdout = subprocess.PIPE,  # capture stdout
                    stderr = subprocess.PIPE,  # capture stderr
                    cwd = scriptdir,  # execute subprocess in directory of script
                    universal_newlines = True,  # this allows us to use strings instead of bytes objects for the standard streams
                    check = True  # raise an error if xmllint fails
                )
                echo("Successfully verified output with the document type definition 'basic-transcription.dtd' using xmllint.")
            except subprocess.CalledProcessError as linterror:
                # a failed verification is fatal: echo() with type = "error"
                # terminates the program before any output is written
                errormessage = indent(linterror.stderr, "\t")
                echo("The verification of the converter's output against the document type definition using xmllint produced the following error:\n{}".format(errormessage),
                     type = "error")
        else:
            echo("The document type definition 'basic-transcription.dtd' could not be found, the output could therefore not be verified using xmllint. You can download the DTD here: https://www.exmaralda.org/files/basic-transcription.dtd", type = "warning")
    else:
        echo("The program 'xmllint' is not installed on your system. The output of the converter could not be verified with the document type definition (basic-transcription.dtd, which can be downloaded here: https://www.exmaralda.org/files/basic-transcription.dtd)", type = "warning")
    ######
    
    # if an output file name has been provided, use that, otherwise, use STDOUT
    if args.output:
        with open(args.output, mode = "w", encoding = "utf-8", newline = "\n") as outputfile:
            echo("Writing ouput to {}".format(args.output))
            outputfile.write(outputstr)
    else:
        print(outputstr)
