
############################################################
# read packages and define file locations
# If you move the package to new location change the defintion 
# of 'pfad2' in this file
source("/Users/mes/Documents/ABMs/PublicABM/Rcmd/pathsAndLibraries.R")


# Load database 1 
brwd=load_emuDB("/Users/mes/Documents/dbs/Braidwood_emuDB")

# Segment lists and labels
fric.s=query(brwd, query="[phonetic=i|ʔ -> #phonetic=ʃ|s]")
word.s=requery_hier(brwd,fric.s, level="word")
unique(word.s$labels) # (check only target words included)
temp=word.s$labels %in% c("seem", "sane", "sheep", "Shane", "stream")
word.s=word.s[temp,]
fric.s=fric.s[temp,]

fric.lab=label(fric.s)
word.lab=label(word.s)
u = utt(fric.s)
vpn = substring(u, 6, 7)

# Initial segments in target word 
initial = rep("", nrow(fric.s))
temp = word.lab %in% c("seem", "sane")
initial[temp] = "s"
temp = word.lab %in% c("sheep", "Shane")
initial[temp] = "ʃ"
temp = word.lab == "stream"
initial[temp] = "str"

# build dataframe
str.df = cbind(fric.s, Word = factor(word.lab), Speaker=factor(vpn), Initial=factor(initial))

# reduce to 20 speakers
length(unique(str.df$Speaker))
# 21 - remove AS, RS who did not do Brwd4 experiment
str.tmp=str.df[!str.df$Speaker %in% c("AS", "RS"),]
fric.s=fric.s[!str.df$Speaker %in% c("AS", "RS"),]
str.df = str.tmp
str.df$Speaker=factor(str.df$Speaker)

# anonymise speakers
str.df$Speaker=mapvalues(str.df$Speaker, from= c("AD", "AG", "AT", "BG", "CY", "EM", "FS", "JB", "JL", "JM", "KC", "KS", "KT", "LC", "LH", "MW", "NM", "NU", "PG"), to= c("F01", "F02", "F03", "M04", "F05", "F06", "F07", "M08", "M09", "F10", "F11", "F12", "F13", "F14", "F15", "M16", "F17", "M18", "M19"))

# increase M16 to 10 'sane' tokens by copying one 'sane' token in str.df
# WARNING: we can do this here as an emergency measure, since we have more than 4 tokens;
# however, you *cannot* do the same trick on three tokens only, since the cov matrix
# would become singular! 
#str.tmp=str.df[str.df$Speaker=="M16" & str.df$Word=="sane",]
#fric.tmp=fric.s[str.df$Speaker=="M16" & str.df$Word=="sane",]
#str.df=rbind(str.df, str.tmp[1,])
#fric.s=rbind(fric.s, fric.tmp[1,])
str.df$Speaker=factor(str.df$Speaker)

# overview of tokens for expt
table(str.df$Speaker, str.df$Word)

# Calculate DFT spectra with default setting for all /s/ segments:
tr = get_trackdata(brwd,ssffTrackName="dft",seglist=fric.s,onTheFlyFunctionName="dftSpectrum") 
#tr1 = tr         # only if we need to save this object
# Calculate the (correct) spectral moments 1-4 from the power spectrum in 500-15000Hz as tracks over time:
tr.pow = dbtopower(tr) 
mom.pow = fapply(tr.pow[,500:15000],moments,minval=F)
#colnames(mom.pow$data) = paste("M", 1:4, sep="") 

# Speaker normalise the moment data using Lobanov
# the speaker mean is substracted and divided by speaker SD
# done over all tracks of a speaker; therefore
# relative distances between sibilant classes within a speaker
# are *not* normalized away.
mom.pow = norm(mom.pow, as.character(str.df$Speaker), "lob")
mom.pow = as.trackdata(mom.pow$data, mom.pow$index, mom.pow$ftime)

# linearly time-normalise the data (although DCT coefficients should be indifferent 
# to whether or not linear time normalisation has occurs since they are based on shape...)
mom.pow = tracklinear(mom.pow[,1])
# obtain average M1 from mid 50% of track (to have a broader feature to look at if necessary)
mom.pow.M1 = trapply(dcut(mom.pow[,1], 0.25, 0.75, prop=T), mean, simplify=T)
# add gender as a possible grouping factor
gender= factor(substr(str.df$Speaker,1,1))
# obtain ABM parameters (must be 3-dim!):  DCT coeffs on linearly time-normalised M1 tracks
mom.pow.dct = trapply(mom.pow, dct, 2, simplify=T)
# bind in DCT coeffs and average M1 into data-frame  
str.df = data.frame(str.df, M1 = mom.pow.M1, k0 = mom.pow.dct[,1], k1 = mom.pow.dct[,2], k2 = mom.pow.dct[,3],gender)





# intermediate store of str.df 
str1.df = str.df


####################################################
# load second database: extra tokens from Braidwood 4 
# these are the word-medial tokens
####################################################

brwd=load_emuDB("/Users/mes/Documents/dbs/Braidwood4_emuDB")

# Segment lists and labels
fric.s=query(brwd, query="phonetic=ʃ|s")
word.s=requery_hier(brwd,fric.s, level="word")
unique(word.s$labels) # (check only target words included)
#[1] "Minnesota"      "Overshadowed"   "Astronomy"      "Catastrophic"   "Assembly"       "Disheveled"    
#[7] "Pastrami"       "Astringent"     "Assault"        "Machine"        "Destroy"        "Restrict"      etc
fric.lab=label(fric.s)
u = utt(fric.s)
vpn = substring(u, 1, 2)
word.lab=label(word.s)

# sibilant label (s, S, str)
initial=rep("str", length(word.lab))
initial[word.s$labels %in% c("Minnesota", "Assembly", "Assault", "Soak", "Fascinating", "Possible", "Messy", "Motorcycle", "Policy")]="s"
initial[word.s$labels %in% c("Overshadowed", "Disheveled", "Machine", "Show", "Information", "Passionate", "Tissue", "Perishable", "Polishing")]="ʃ"

# build dataframe 
str.df = cbind(fric.s, Word = factor(word.lab), Speaker=factor(vpn), Initial=factor(initial))

# reduce to 19 speakers?
length(unique(str.df$Speaker))

# anonymise speakers
library(plyr)
# the next command maps 19 speakers (removed RS who is M20)
str.df$Speaker=mapvalues(str.df$Speaker, from= c("AD", "AG", "AT", "BG", "CY", "EM", "FS", "JB", "JL", "JM", "KC", "KS", "KT", "LC", "LH", "MW", "NM", "NU", "PG"), to= c("F01", "F02", "F03", "M04", "F05", "F06", "F07", "M08", "M09", "F10", "F11", "F12", "F13", "F14", "F15", "M16", "F17", "M18", "M19"))

str.df$Speaker=factor(str.df$Speaker)

# overview of tokens
table(str.df$Speaker, str.df$Word)

#####MS
# Calculate DFT spectra with default setting for all /s/ segments:
tr = get_trackdata(brwd,ssffTrackName="dft",seglist=fric.s,onTheFlyFunctionName="dftSpectrum") 
#tr2 = tr         # only if we need to save this object
# Calculate the (correct) spectral moments 1-4 from the power spectrum in 500-15000Hz as tracks over time:
tr.pow = dbtopower(tr) 
mom.pow = fapply(tr.pow[,500:15000],moments,minval=F)
#colnames(mom.pow$data) = paste("M", 1:4, sep="") 

# Speaker normalise the moment data using Lobanov
# the speaker mean is substracted and divided by speaker SD
# done over all tracks of a speaker; therefore
# relative distances between sibilant classes within a speaker
# are *not* normalized away.
mom.pow = norm(mom.pow, as.character(str.df$Speaker), "lob")
mom.pow = as.trackdata(mom.pow$data, mom.pow$index, mom.pow$ftime)

# linearly time-normalise the data (although DCT coefficients should be indifferent 
# to whether or not linear time normalisation has occurs since they are based on shape...)
mom.pow = tracklinear(mom.pow[,1])
# obtain average M1 from mid 50% of track (to have a broader feature to look at if necessary)
mom.pow.M1 = trapply(dcut(mom.pow[,1], 0.25, 0.75, prop=T), mean, simplify=T)
# add gender as a possible grouping factor
gender= factor(substr(str.df$Speaker,1,1))
# obtain ABM parameters (must be 3-dim!):  DCT coeffs on linearly time-normalised M1 tracks
mom.pow.dct = trapply(mom.pow, dct, 2, simplify=T)
# bind in DCT coeffs and average M1 into data-frame  
str.df = data.frame(str.df, M1 = mom.pow.M1, k0 = mom.pow.dct[,1], k1 = mom.pow.dct[,2], k2 = mom.pow.dct[,3],gender)

# intermediate store of str.df 
str2.df = str.df

#END  Brwd4 data
######

#############
# Combine data from 2 databases and save
str.df = rbind(str1.df, str2.df)
write.table(str.df,file.path(pfad2,"data/str.df"))
