#!/usr/bin/env python3

# example command: py mausbpf2exb.py example.par
# possibly use properties instead of attributes
# TODO:
# 	watch out for line terminators and encoding on ExbFile.write
#	'construct common timeline'
#	use re.compile for sep1 and sep2
#	fix the scientific notation thingy with the first few tlis
#	check all instances of 'pass' and 'oh no'
#	ExbTier.events should probably be a list of dictionaries rather than of tuples
#	unify attribute and method names for ParFile <-> ExbFile as well as ParTier <-> ExbTier
#	add option to ignore specific tiers
#	add argument for "sample distance leeway" (concerning class 4 and poss. class 2)
#	format with black
#	compile sep1 and sep2
#	should everything but the XML output be printed to stderr? Otherwise messages like "Successfully converted." end up in the output whenever STDOUT is piped
#	maybe use filename conversion after all instead of printing to STDOUT when args.output is not provided
#	make a function for outputting warnings


import argparse
from os import path
import sys
import re
from xml.etree.ElementTree import *
from xml.dom import minidom


class ParFile:
	"""In-memory representation of a .par/.bpf input file.

	Attributes:
		header: dict mapping header labels (e.g. "SAM") to their string values.
		tiers: list of parsed tier objects (built with ParTier, defined in this file).
		raw_tiers: scratch dict that groups the split tier lines by tier name
			while parsing; it is deleted again at the end of parse().
	"""
	
	# separator between labels and their values, such as "REP: Muenchen"
	_LABEL_SEP = re.compile(r":[ \t]+")
	# separator between intra-tier values, such as "108885	2056	1	d_s"
	_VALUE_SEP = re.compile(r"[ \t]+")
	
	def __init__(self, inputstr = None):
		"""Create an empty file object; if inputstr is given (and non-empty), parse it."""
		self.header = {}
		self.raw_tiers = {}
		self.tiers = []
		if inputstr:
			self.parse(inputstr)
	
	@classmethod
	def _split_label(cls, line):
		"""Split "LABEL: value" into [label, value]; raise ValueError for malformed lines."""
		parts = cls._LABEL_SEP.split(line, maxsplit = 1)
		if len(parts) != 2:
			raise ValueError("malformed line (expected 'LABEL: value'): %r" % line)
		return parts
	
	def parse(self, inputstr):
		"""Fill self.header and self.tiers from the text of a .par/.bpf file.

		Everything before the "LBD:" line is treated as header, everything
		after it as tier lines. Blank lines are skipped.

		Raises:
			ValueError: if a non-blank line has no "LABEL: value" shape.
		"""
		lines = iter(inputstr.splitlines())
		
		# header: the LBD line signifies its end
		for line in lines:
			if line.rstrip() == "LBD:":
				break
			if not line.strip():
				continue
			label, value = self._split_label(line)
			self.header[label] = value
		
		# tiers: the shared iterator continues right after the LBD line
		for line in lines:
			if not line.strip():
				continue
			tier_name, values = self._split_label(line)
			self.raw_tiers.setdefault(tier_name, []).append(self._VALUE_SEP.split(values))
		
		for tier_name, raw_items in self.raw_tiers.items():
			self.tiers.append(ParTier(tier_name).parse(raw_items))
		
		del self.raw_tiers
	
	def get_header_element(self, label):
		"""Return the header value stored under label (KeyError if absent)."""
		return self.header[label]
	
	def set_header_element(self, label, value):
		"""Store value under label in the header."""
		self.header[label] = value
		# this should be modified so that it inserts the element in the correct position as well
	
	def in_header(self, label):
		"""Return True if label is present in the header."""
		return label in self.header
	
	def add_tier(self, *args, **kwargs): # unused
		"""Append a new, empty ParTier built from the given arguments."""
		self.tiers.append(ParTier(*args, **kwargs))
	
	def remove_tier(self, name):
		"""Remove every tier called name; do nothing if there is none."""
		# Rebuild in place instead of removing while iterating, which would
		# skip the element that follows each removed one.
		self.tiers[:] = [tier for tier in self.tiers if tier.name != name]
	
	def get_link_times(self, link):
		"""Return (start, end) in samples spanned by the class 4 items linked to link.

		Only items whose linktype is "single" are considered; start is the
		smallest item start, end the largest start + duration.

		Raises:
			Exception: if there is no class 4 tier or no item carries this link.
		"""
		tiers_class4 = [tier for tier in self.tiers if tier.tier_class == 4]
		if not tiers_class4:
			raise Exception("oh no: no class 4 tiers")
		linked_items = [
			item
			for tier in tiers_class4
			for item in tier.tier_items
			if item["linktype"] == "single" and item["link"] == link
		]
		if not linked_items:
			raise Exception("oh no: link time could not be inferred")
		start = min(item["start"] for item in linked_items)
		end = max(item["start"] + item["duration"] for item in linked_items)
		return (start, end)
	
	def samples_to_seconds(self, sample):
		"""Convert a sample count to seconds using the header's "SAM" value."""
		return sample / int(self.get_header_element("SAM"))


class ParTier:
	"""One tier of a .par/.bpf file.

	Attributes:
		name: the tier label, e.g. "MAU".
		tier_items: list of dicts, one per parsed tier line (see parse()).
		tier_classes: maps every known tier label to its class number (1-5),
			which determines the fields a tier line consists of.
	"""
	
	def __init__(self, name):
		self.name = name
		self.tier_items = []
		self.tier_classes = {
			"KAN": 1, "KSS": 1, "MRP": 1, "KAS": 1, "PTR": 1, "ORT": 1,
			"TRL": 1, "TR2": 1, "TRO": 1, "SUP": 1, "DAS": 1, "PRS": 1,
			"NOI": 1, "PRO": 1, "SYN": 1, "FUN": 1, "LEX": 1, "POS": 1,
			"LMA": 1, "TRS": 1, "TLN": 1, "TRW": 1, "SPK": 1,
			"IPA": 2, "GES": 2, "USH": 2, "USM": 2, "OCC": 2, "SPD": 2,
			"LBP": 3, "LBG": 3, "PRM": 3,
			"PHO": 4, "SAP": 4, "MAU": 4, "WOR": 4, "TRN": 4, "USP": 4, "MAS": 4,
			"PRB": 5,
		}
	
	def parse(self, content):
		"""Append one item dict per entry of content and return self.

		content is a list of lists of strings (the split values of each tier
		line). Depending on the tier class the leading fields are consumed as
		"start"/"duration" (classes 2, 4), "time" (classes 3, 5) and "link"
		(classes 1, 4, 5); the remaining fields are joined with single spaces
		into "content". The lists in content are left unmodified.

		Raises:
			KeyError: if self.name is not a known tier label.
			ValueError / IndexError: if a numeric field is malformed or missing.
		"""
		for raw_item in content:
			# work on a copy so that the caller's lists are not consumed
			fields = list(raw_item)
			tier_class = self.tier_class
			item = {}
			if tier_class in (2, 4):
				item["start"] = int(fields.pop(0))
				item["duration"] = int(fields.pop(0))
			if tier_class in (3, 5):
				item["time"] = int(fields.pop(0))
			if tier_class in (1, 4, 5):
				link = fields.pop(0)
				if "," in link:
					link = [int(num) for num in link.split(",")]
					linktype = "multiple"
				elif ";" in link:
					link = [int(num) for num in link.split(";")]
					linktype = "between"
				else:
					link = int(link)
					linktype = "single"
				item["link"] = link
				item["linktype"] = linktype
			item["content"] = " ".join(fields)
			self.tier_items.append(item)
		return self
	
	@property
	def tier_class(self):
		"""Class number (1-5) of this tier, looked up by its name."""
		return self.tier_classes[self.name]


class ExbFile:
	"""Builder for an .exb "basic-transcription" XML document.

	Attributes:
		template: XML skeleton the document tree is created from.
		tree: ElementTree parsed from the template; header setters and the
			construct_* methods modify it in place.
		tiers: tier objects (see add_tier) whose events are written out.
		times: sorted, de-duplicated event times; filled by construct_timeline().
		timeline, timeline_old: only written by add_tli() / add_tli_old();
			generate() does not read them.
	"""
	
	def __init__(self):
		self.timeline_old = {}
		self.timeline = []
		self.tiers = []
		self.times = []
		self.template = """
<basic-transcription>
	<head>
		<meta-information>
			<project-name></project-name>
			<transcription-name></transcription-name>
			<referenced-file url=""/>
			<ud-meta-information></ud-meta-information>
			<comment></comment>
			<transcription-convention></transcription-convention>
		</meta-information>
		<speakertable>
			<speaker id="">
				<abbreviation></abbreviation>
				<sex value=""/>
				<languages-used/>
				<l1/>
				<l2/>
				<ud-speaker-information></ud-speaker-information>
				<comment></comment>
			</speaker>
		</speakertable>
	</head>
	<basic-body>
		<common-timeline></common-timeline>
	</basic-body>
</basic-transcription>
		"""
		# <tier> elements are not part of the template; construct_tiers()
		# appends them to <basic-body>.
		self.tree = ElementTree(fromstring(self.template))
	
	def _element(self, tag):
		"""Return the first element called tag in document order."""
		return next(self.tree.iter(tag))
	
	@staticmethod
	def _format_time(time):
		"""Return time as text, expanding scientific notation to plain decimals.

		Small floats such as 6.25e-05 would otherwise be written with an
		exponent; every other value is rendered exactly like str(time).
		"""
		from decimal import Decimal
		text = str(time)
		if isinstance(time, float) and "e" in text:
			return format(Decimal(text), "f")
		return text
	
	def set_header_element(self, element, content):
		"""Set the text of the first element called element to content."""
		self._element(element).text = content
	
	def set_header_attribute(self, element, attribute, value):
		"""Set attribute to value on the first element called element."""
		self._element(element).set(attribute, value)
	
	def add_tli_old(self, time, id = None):
		"""Store time in timeline_old under id.

		Without an id the next free "T<n>" identifier is used, i.e. one more
		than the highest "T<n>" key already present (or "T0").
		"""
		if id:
			self.timeline_old[id] = time
			return
		used = [int(key[1:]) for key in self.timeline_old if re.fullmatch(r"T\d+", key)]
		next_index = max(used) + 1 if used else 0
		self.timeline_old["T%d" % next_index] = time
	
	def add_tli(self, time):
		"""Append time to self.timeline."""
		self.timeline.append(time)
	
	def add_tier(self, *args, **kwargs):
		"""Create an ExbTier from the given arguments, register and return it."""
		new_tier = ExbTier(*args, **kwargs)
		self.tiers.append(new_tier)
		return new_tier
	
	def construct_timeline(self):
		"""Build <common-timeline> from all event times and assign event IDs.

		Every distinct start/end time becomes one <tli id="T<n>"> element
		(numbered in ascending time order) and each event's "start_ID" and
		"end_ID" are set to the matching identifiers.
		"""
		common_timeline = self._element("common-timeline")
		# drop the result of an earlier run so that repeated calls do not
		# duplicate timeline items
		for stale in common_timeline.findall("tli"):
			common_timeline.remove(stale)
		
		events = [event for tier in self.tiers for event in tier.events]
		self.times = sorted({
			time
			for event in events
			for time in (event["start_time"], event["end_time"])
		})
		ids = {time: "T%d" % i for i, time in enumerate(self.times)}
		
		for event in events:
			event["start_ID"] = ids[event["start_time"]]
			event["end_ID"] = ids[event["end_time"]]
		for time in self.times:
			common_timeline.append(Element("tli", {
				"id": ids[time],
				"time": self._format_time(time)
			}))
	
	def construct_tiers(self):
		"""Append one <tier> element (with its <event> children) per tier.

		A tier whose speaker is None gets no "speaker" attribute, because
		None cannot be serialised as an attribute value.

		Raises:
			ValueError: if an event has no timeline IDs yet, i.e.
				construct_timeline() has not been run for it.
		"""
		basic_body = self._element("basic-body")
		# drop the result of an earlier run so that repeated calls do not
		# duplicate tiers
		for stale in basic_body.findall("tier"):
			basic_body.remove(stale)
		
		for i, tier in enumerate(self.tiers):
			attributes = {"id": "TIE%d" % i}
			if tier.speaker is not None:
				attributes["speaker"] = tier.speaker
			attributes.update({
				"category": tier.category,
				"display-name": tier.display_name,
				"type": tier.tier_type
			})
			tier_element = Element("tier", attributes)
			basic_body.append(tier_element)
			for event in tier.events:
				if event["start_ID"] is None or event["end_ID"] is None:
					raise ValueError("event has no timeline IDs; call construct_timeline() first")
				event_element = Element("event", {
					"start": event["start_ID"],
					"end": event["end_ID"],
				})
				event_element.text = event["content"]
				tier_element.append(event_element)
	
	def generate(self):
		"""Build timeline and tiers and return the document as an XML string."""
		self.construct_timeline()
		self.construct_tiers()
		outputstr = minidom.parseString(tostring(self.tree.getroot())).toprettyxml(indent = "")
		return outputstr


class ExbTier:
	"""A single output tier: its descriptive attributes plus a list of events."""
	
	def __init__(self, category, tier_type, display_name = "", speaker = None):
		"""Remember the tier attributes and start with an empty event list."""
		self.category, self.tier_type = category, tier_type
		self.display_name, self.speaker = display_name, speaker
		self.events = []
	
	def add_event(self, start_time, end_time, content):
		"""Append an event; its timeline IDs stay None until they are assigned later."""
		event = dict(
			start_ID = None,
			end_ID = None,
			start_time = start_time,
			end_time = end_time,
			content = content,
		)
		self.events.append(event)


def par_to_exb(par, exb):
	"""Copy header information and all tiers from par into exb.

	par is expected to offer in_header(), get_header_element(),
	get_link_times(), samples_to_seconds() and a tiers list; exb is expected
	to offer set_header_element(), set_header_attribute() and add_tier().
	Event times are converted from samples to seconds.

	Exits the program (SystemExit) if par has no class 4 tier whose items
	all carry a single link.

	Raises:
		ValueError: if a tier reports a class other than 1-5.
	"""
	# compatibility check
	has_usable_class4 = any(
		tier.tier_class == 4 and
		all(tier_item["linktype"] == "single" for tier_item in tier.tier_items)
		for tier in par.tiers
	)
	if not has_usable_class4:
		sys.exit("Error: input must contain at least one class 4 tier whose items all have a single link")
	
	# see header-equivalents.txt
	if par.in_header("DBN"):
		exb.set_header_element("project-name", par.get_header_element("DBN"))
	
	if par.in_header("SRC"):
		# the file name belongs into the url attribute, not into the element text
		exb.set_header_attribute("referenced-file", "url", par.get_header_element("SRC"))
	
	if par.in_header("CMT"):
		exb.set_header_element("comment", par.get_header_element("CMT"))
	
	# fall back to the empty speaker id when the header has no SPN entry
	speaker_id = ""
	if par.in_header("SPN"):
		speaker_id = par.get_header_element("SPN")
		exb.set_header_attribute("speaker", "id", speaker_id)
	
	exb.set_header_attribute("sex", "value", "u")
		# 'unknown' or 'undefined' since this information is not provided in a .par file
	
	for par_tier in par.tiers:
		exb_tier = exb.add_tier(category = "0", tier_type = "0", display_name = par_tier.name, speaker = speaker_id)
		tier_class = par_tier.tier_class
		# reset per tier so that the end of one tier never influences the next
		previous_end_time = None
		for tier_item in par_tier.tier_items:
			if tier_class == 1:
				start, end = par.get_link_times(tier_item["link"])
			elif tier_class in (2, 4):
				if previous_end_time is not None and tier_item["start"] == previous_end_time + 1:
					# segment follows its predecessor directly: close the
					# one-sample gap so both share a timeline point
					start = previous_end_time
					end = start + tier_item["duration"] + 1
				else:
					start = tier_item["start"]
					end = start + tier_item["duration"]
			elif tier_class in (3, 5):
				# NOTE(review): point-in-time tiers are still placed at 0 as a
				# placeholder; their "time" value is not used yet
				start = 0
				end = 0
			else:
				raise ValueError("unsupported tier class %r for tier %r" % (tier_class, par_tier.name))
			previous_end_time = end
			start, end = par.samples_to_seconds(start), par.samples_to_seconds(end)
			exb_tier.add_event(start, end, tier_item["content"])


def _split_names(value):
	"""Split a comma-separated option value into a list of non-empty names."""
	if not value:
		return []
	return [name.strip() for name in value.split(",") if name.strip()]


def _dump_tiers(par_file):
	"""Debugging aid: print every tier name followed by its parsed items."""
	for tier in par_file.tiers:
		print(tier.name)
		for tier_item in tier.tier_items:
			print(tier_item)
		print()


def main(argv = None):
	"""Command line entry point: convert a .par/.bpf file to .exb XML.

	argv is the argument list without the program name; None means
	sys.argv[1:]. The XML goes to the --output file or to STDOUT; the status
	message goes to STDERR so that it never ends up in piped XML.
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument("input", nargs = "?", help = "input file (.par, .bpf). If not specified, STDIN will be used")
	parser.add_argument("-o", "--output", help = "output file (.exb). If not specified, STDOUT will be used")
	parser.add_argument("-r", "--referenced-file", help = "name (or comma-separated list of names) of referenced audio file(s)")
	parser.add_argument("--ignore", help = "name (or comma-separated list of names) of BPF tier(s) to be ignored during conversion")
	args = parser.parse_args(argv)
	
	if args.input:
		with open(args.input) as inputfile:
			inputstr = inputfile.read()
	else:
		inputstr = sys.stdin.read()
	
	par_file = ParFile(inputstr)
	exb_file = ExbFile()
	par_file.remove_tier("PRM") # this command should be removed later
	for tier_name in _split_names(args.ignore):
		par_file.remove_tier(tier_name)
	par_to_exb(par_file, exb_file)
	
	# files named on the command line take precedence over the header's SRC entry
	referenced_files = _split_names(args.referenced_file)
	if referenced_files:
		exb_file.set_header_attribute("referenced-file", "url", referenced_files[0])
		meta_information = next(exb_file.tree.iter("meta-information"))
		first_reference = next(exb_file.tree.iter("referenced-file"))
		position = list(meta_information).index(first_reference) + 1
		# further files become additional <referenced-file> siblings
		for offset, url in enumerate(referenced_files[1:]):
			meta_information.insert(position + offset, Element("referenced-file", {"url": url}))
	
	outputstr = exb_file.generate()
	
	if args.output:
		# the generated XML carries no encoding declaration, so it has to be UTF-8
		with open(args.output, mode = "w", encoding = "utf-8") as outputfile:
			outputfile.write(outputstr)
	else:
		print(outputstr)
	
	print("Successfully converted.", file = sys.stderr)


if __name__ == "__main__":
	main()

