## plots & analysis for kirby2014transitions (ISSP)
## last tested 11 may 2014 with 3.1.0 (2014-04-10) -- "Spring Dance"
  
require(ggplot2)
require(plyr)
require(reshape2)
require(lme4)
require(grid)
require(prob)
require(lmerTest)   ##overloads lmer

## helper function
std.error <- function(x, na.rm=T) { sqrt(var(x, na.rm=na.rm)/length(x[complete.cases(x)])) }

## set Tufte theme
theme_set(theme_bw())  

tstyle =  theme(legend.position="top", axis.title.x = element_text(vjust = -0.75, hjust=0.5, size = 14),  axis.title.y = element_text(angle = 90, vjust = 0.125, size = 14), legend.text = element_text(size=14), legend.title = element_text(size=14), axis.text.x=element_text(size=14), axis.text.y=element_text(size=14), strip.text.x=element_text(size=14)) 

## load raw data
dur <- read.csv('issp_dur.csv', header=T)
  
## set of items that potentially *could* have a transitional element: i.e. things w/onset clusters 
ev <- droplevels(subset(dur, id=='ev'))
## remove a few outliers where t<0
ev <- subset(ev, t>0)
## length of initial cluster
ev$clen <- ev$t-ev$m
## proportion of total length 
ev$cprop <- ev$clen/ev$t
## reordering
ev$c1<-factor(ev$c1, levels=c('p','t','k','m','l'))
ev$c2<-factor(ev$c2, levels= c('p','t','k','b','d','m','n','ŋ','l','r','s','h'))
ev$c2.manner<-factor(ev$c2.manner, levels=c('vless stop','fricative','rhotic','liquid', 'nas','vcd stop','glottal'))
ev$condition<-factor(ev$condition, levels=c('slow','fast'))
ev$c1.place<-factor(ev$c1.place, levels=c('labial','alveolar','velar'))
ev$c2.place<-factor(ev$c2.place, levels=c('labial','alveolar','velar','glottal'))
ev$porder<-factor(ev$porder, levels=c('front-back','back-front'))


###########################################
## 3.1: Distribution of voiced transitions
###########################################

## Table 1
round(xtabs(~c1+c2+condition,data=subset(ev,v>0)) / xtabs(~c1+c2+condition,data=ev),2)

## by subject
round(xtabs(~condition+subject,data=subset(ev,v>0)) / xtabs(~condition+subject,data=ev),2)

## models
## Class 3
rate.class3<-glmer(hasv~c1+c2+condition+(1|subject)+(1|item),data=subset(ev,huff=='Class 3'),family=binomial(link="logit"))

## Class 2 /n ŋ l/
rate.class2<-glmer(hasv~c1+c2+condition+(1|subject)+(1|item),data=subset(ev,huff=='Class 2'&c2%in%c('n','l','ŋ')),family=binomial(link="logit"))


######################################
## 3.2: Duration of voiced transitions
######################################

## plots
clong <- melt(data=ev, id.var=c("subject", "condition", "hasv"), measure.vars=c("clen"))
clen.mean <- ddply(clong, .(subject, condition, hasv), summarise, "mean" = mean(value))
clen.error <- ddply(clong, .(subject, condition, hasv), summarise, "sd" = sd(value), "se" = std.error(value), "ci" = std.error(value) * qt(0.95/2 + .5, nrow(clong)-1))
clen.sum <- na.omit(merge(clen.mean, clen.error, by=c("subject", "condition", "hasv"), all=T))
clen.sum$subject <- factor(clen.sum$subject, levels=c('s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10'))
clen.sum$condition <- factor(clen.sum$condition, levels=c('slow', 'fast'))
names(clen.sum)[3]<-"hasV"

quartz(,7.6,5)

ggplot(clen.sum, aes(x = condition, y = mean, fill = hasV)) + 
    geom_bar(position = position_dodge(width=0.9), stat="identity") + 
    geom_errorbar(aes(ymin=mean-ci, ymax=mean+ci), position=position_dodge(width=0.9), width=0.75)  + 
    facet_wrap(~subject,nrow=2) + ylab("mean cluster length (ms)") + 
    tstyle + scale_fill_grey(start=0.4,end=0.8) 

## over subjects
clong <- melt(data=ev, id.var=c("condition", "hasv"), measure.vars=c("clen"))
clen.mean <- ddply(clong, .(condition, hasv), summarise, "mean" = mean(value))
clen.error <- ddply(clong, .(condition, hasv), summarise, "sd" = sd(value), "se" = std.error(value), "ci" = std.error(value) * qt(0.95/2 + .5, nrow(clong)-1))
clen.sum <- na.omit(merge(clen.mean, clen.error, by=c("condition", "hasv"), all=T))
clen.sum$condition <- factor(clen.sum$condition, levels=c('slow', 'fast'))
names(clen.sum)[2]<-"hasV"

ggplot(clen.sum, aes(x = condition, y = mean, fill = hasV)) + 
    geom_bar(position = position_dodge(width=0.9), stat="identity") + 
    geom_errorbar(aes(ymin=mean-ci, ymax=mean+ci), position=position_dodge(width=0.9),width=0.75) + 
    ylab("mean cluster length (ms)") + tstyle + scale_fill_grey(start=0.4, end=0.8)

## over subjects by class
clong <- melt(data=ev, id.var=c("condition", "hasv", "huff"), measure.vars=c("clen"))
clen.mean <- ddply(clong, .(condition, hasv, huff), summarise, "mean" = mean(value))
clen.error <- ddply(clong, .(condition, hasv, huff), summarise, "sd" = sd(value), "se" = std.error(value), "ci" = std.error(value) * qt(0.95/2 + .5, nrow(clong)-1))
clen.sum <- na.omit(merge(clen.mean, clen.error, by=c("condition", "hasv", "huff"), all=T))
clen.sum$condition <- factor(clen.sum$condition, levels=c('slow', 'fast'))
names(clen.sum)[2]<-"hasV"

ggplot(clen.sum, aes(x = condition, y = mean, fill = hasV)) + 
    geom_bar(position = position_dodge(width=0.9), stat="identity") + 
    geom_errorbar(aes(ymin=mean-ci, ymax=mean+ci), position=position_dodge(width=0.9),width=0.75) + 
    facet_wrap(~huff) + ylab("mean cluster length (ms)") + tstyle + scale_fill_grey(start=0.4, end=0.8)

## duration model
m0 <- lmer(clen~hasv*condition+c1+c2+(1+condition|subject)+(1+condition|item), data=ev)


#######################################################
## 3.3: comparison with lexical vowels
#######################################################
vcomp<-droplevels(subset(dur, t>0 & ((id=='ev' & hasv=='Y')|id=='mono')))
## compute length of CC-sequence
vcomp$clen<-0
vcomp[vcomp$id=='ev',]$clen <- vcomp[vcomp$id=='ev',]$t-vcomp[vcomp$id=='ev',]$m
vcomp[vcomp$id=='mono',]$clen <- vcomp[vcomp$id=='mono',]$t
levels(vcomp$id)<-c(levels(vcomp$id), 'CvN')
vcomp[vcomp$id=='mono'&vcomp$c2.manner=='nas',]$id<-'CvN'

vcomp$cc<-rep('OO')
vcomp[(vcomp$c1.manner=='vless stop' & vcomp$c2.manner=='nas'),]$cc<-'OS'
vcomp[(vcomp$c1.manner=='vless stop' & vcomp$c2.manner=='liquid'),]$cc<-'OS'
vcomp[vcomp$c1.manner=='nas',]$cc<-'SO'
vcomp[vcomp$c1.manner=='lateral',]$cc<-'SO'
vcomp[(vcomp$c1.manner=='nas' & vcomp$c2.manner=='nas'),]$cc<-'SS'
vcomp[(vcomp$c1.manner=='nas' & vcomp$c2.manner=='liquid'),]$cc<-'SS'
vcomp<-droplevels(subset(vcomp,c2.manner!='rhotic'))

oo<-droplevels(subset(vcomp,cc=='OO'))
os<-droplevels(subset(vcomp,cc=='OS'))

## models
## obstruent-obstruent
oo.model<-lmer(clen~id*condition+(1|subject)+(1|item),data=oo)
## obstruent-sonorant
os.model<-lmer(clen~id*condition+(1|subject)+(1|item),data=os)


######################################
# 3.4: formant transitions of \exists
######################################

fmt<-read.csv('issp_vout.csv')
fmt$F1<-as.numeric(as.character(fmt$F1))
fmt$F2<-as.numeric(as.character(fmt$F2))
fmt<-fmt[,c(1:8,12:14)]
dmt<-dur[,c(1:5,19:33)] ## include hasv column to check that it's done what we want it to...
fdur<-merge(fmt,dmt)
fdur<-droplevels(subset(fdur,id=='ev'))

## now what does format transition look like by c1 and c2?
## averaging over subjects
simps <- na.omit(droplevels(subset(fdur, huff!='Class 1' & c2.place!='glottal' & !(c2 %in% c('r','l','s')) & !(subject %in% c('s5','s6')))))
flong <- melt(data=simps, id.var=c("condition", "t", "c1.place", "c2.place"), measure.vars=c("F1", "F2"))
flong$t<-as.factor(flong$t)
f2.mean <- ddply(subset(flong,variable=='F2'), .(condition, t, c1.place, c2.place), summarise, "mean" = mean(value))
f2.error <- ddply(subset(flong,variable=='F2'), .(condition, t, c1.place, c2.place), summarise, "sd" = sd(value), "se" = std.error(value), "ci" = std.error(value) * qt(0.95/2 + .5, nrow(subset(flong,variable=='F2'))-1))
f2.sum <- na.omit(merge(f2.mean, f2.error, by=c("condition", "t", "c1.place", "c2.place"), all=T))
f2.sum$condition <- factor(f2.sum$condition, levels=c('slow', 'fast'))

f2.plot.avg<-ggplot(f2.sum, aes(x=t, y=mean, colour=c1.place, linetype=condition, group=c1.place:condition)) + facet_wrap(~c2.place)
f2.plot.avg + geom_point(aes(shape=c1.place)) + 
    geom_line() + geom_errorbar(width=0.01, aes(condition=mean, ymin=mean-se, ymax=mean+se)) + 
    xlab("position in v") + ylab("F2 (Hz)") + tstyle

## models
## does F2 at t=2 lie on the straight line between F2 at t=1 and F2 at t=3?
t1 = droplevels(subset(fdur, t==1 & c2.place!='glottal'))
t2 = droplevels(subset(fdur, t==2 & c2.place!='glottal'))
t3 = droplevels(subset(fdur, t==3 & c2.place!='glottal'))

t2$seg.1 <- t1$F2-t2$F2
t2$seg.2 <- t2$F2-t3$F2
## remove crazy things
qplot(x=F2,y=t2$seg.1,data=t2,geom="point",colour=c2.place) + facet_wrap(~c1.place)
qplot(x=F2,y=t2$seg.2,data=t2,geom="point",colour=c2.place) + facet_wrap(~c1.place)

summary(lmer(seg.1~1+(1|subject)+(1|item), data=subset(t2, c1.place==c2.place)))
summary(lmer(seg.2~1+(1|subject)+(1|item), data=subset(t2, c1.place==c2.place)))


#######################################################
## 3.5: Voiceless transitions
#######################################################

ptk<-droplevels(subset(ev, o/t < 0.4&o<125&c1%in%c('p','t','k')&!(condition=='fast'&c2.manner=='vless stop'&hasv=='Y')))
levels(ptk$c2.manner) <- c('/pt/', '/ps/', '/pr/', '/pl/', '/pn/', '/pd/', '/ph/')
ptk$c2.manner <- factor(ptk$c2.manner, levels=c('/ph/', '/pl/', '/pn/', '/ps/', '/pt/', '/pd/', '/pr/'))
ptk$ctype <- ptk$c2.manner
levels(ptk$ctype)<-c(levels(ptk$ctype), '/pn/ (3)')
ptk$ctype[ptk$ctype=='/pn/'&ptk$huff=='Class 3']<-'/pn/ (3)'
levels(ptk$ctype)<-c(levels(ptk$ctype), '/pn/ (2)')
ptk$ctype[ptk$ctype=='/pn/']<-'/pn/ (2)'
ptk<-droplevels(ptk)
ptk$ctype <- factor(ptk$ctype, levels=c('/ph/', '/pl/', '/pn/ (2)', '/ps/', '/pt/',  '/pn/ (3)', '/pd/', '/pr/'))

ptk$dgroup <- rep('short')
ptk[ptk$ctype %in% c('/ph/', '/pl/', '/pn/ (2)'),]$dgroup <- rep('long')
ptk$dgroup <- as.factor(ptk$dgroup)

ptk$cgroup <- rep('other')
ptk[ptk$ctype=='/ph/',]$cgroup <- rep('ph')
ptk[ptk$ctype=='/pr/',]$cgroup <- rep('pr')
ptk$cgroup<-factor(ptk$cgroup,levels=c('ph','other','pr'))

## Figure 4
qplot(c2.manner,o,colour=huff,fill=huff,data=ptk,geom='boxplot') + facet_wrap(~condition, scales="free_x") + xlab("cluster type") + ylab("C1 release duration (ms)") + theme_bw() + theme(legend.position="top",panel.margin=unit(1,"lines")) + scale_colour_grey(start=0.5,end=0.2) + scale_fill_grey(start=0.1,end=1) + labs(colour="Huffman class", fill="Huffman class")

## models
## backwards difference coding
my.backward.diff<-matrix(c(-7/8,1/8,1/8,1/8,1/8,1/8,1/8,1/8, -6/8,-6/8,2/8,2/8,2/8,2/8,2/8,2/8, -5/8,-5/8,-5/8,3/8,3/8,3/8,3/8,3/8, -4/8,-4/8,-4/8,-4/8,4/8,4/8,4/8,4/8, -3/8,-3/8,-3/8,-3/8,-3/8,5/8,5/8,5/8, -2/8,-2/8,-2/8,-2/8,-2/8,-2/8,6/8,6/8, -1/8,-1/8,-1/8,-1/8,-1/8,-1/8,-1/8,7/8), ncol=7)

contrasts(ptk$ctype)<-my.backward.diff

## ctype ordered so coefficient comparison matches order in Figure 4
dslow<-lmer(o~hasv+dgroup+(1|subject)+(1|item),data=subset(ptk,condition=='slow'))
dfast<-lmer(o~hasv+dgroup+(1|subject)+(1|item),data=subset(ptk,condition=='fast'))


#######################################################
## 3.6: Cluster length as proportion of syllable
#######################################################

## Figure 5
qplot(c2.manner,clen,colour=huff,fill=huff,data=ptk,geom='boxplot') + facet_wrap(~condition, scales="free_x") + xlab("cluster type") + ylab("CC cluster duration (ms)") + theme_bw() + theme(legend.position="top",panel.margin=unit(1,"lines")) + scale_colour_grey(start=0.5,end=0.2) + scale_fill_grey(start=0.1,end=1) + labs(colour="Huffman class", fill="Huffman class")

## cluster length
ev$cgroup<-rep('/pt/')
ev[ev$c1%in%c('p','t','k')&ev$c2%in%c('b','d'),]$cgroup<-'/pd/'
ev[ev$c1%in%c('p','t','k')&ev$c2%in%c('n','ŋ','l'),]$cgroup<-'/pn/'
ev[ev$c1%in%c('p','t','k')&ev$c2%in%c('n','ŋ','l'),]$cgroup<-'/pn/'
ev[ev$c1%in%c('p','t','k')&ev$c2%in%c('h'),]$cgroup<-'/ph/'
ev[ev$c1%in%c('p','t','k')&ev$c2%in%c('r'),]$cgroup<-'/pr/'
ev[ev$c1%in%c('p','t','k')&ev$c2%in%c('l'),]$cgroup<-'/pl/'
ev[ev$c1%in%c('l')&ev$c2%in%c('p'),]$cgroup<-'/lp/'
ev[ev$c1%in%c('m')&ev$c2%in%c('p','t'),]$cgroup<-'/mt/'
ev[ev$c1%in%c('m')&ev$c2%in%c('b','d'),]$cgroup<-'/md/'
ev[ev$c1%in%c('l')&ev$c2%in%c('b'),]$cgroup<-'/lb/'
ev[ev$c1%in%c('m')&ev$c2%in%c('n','l'),]$cgroup<-'/mn/'
ev[ev$c1%in%c('m')&ev$c2%in%c('r'),]$cgroup<-'/mr/'
ev[ev$c1%in%c('m')&ev$c2%in%c('s'),]$cgroup<-'/ms/'
ev[ev$c1%in%c('m','l')&ev$c2%in%c('h'),]$cgroup<-'/mh/'
ev$cgroup<-as.factor(ev$cgroup)

ev$cgroup<- factor(ev$cgroup, levels=c())

## difference within class 1
class1<-droplevels(subset(ptk, huff=='Class 1'))
contrasts(class1$c2.manner)<-contr.helmert(3)
c1<-lmer(clen~c2.manner+hasv*condition+(1|subject)+(1|item),data=class1)

## classes 2 and 3
class23<-droplevels(subset(ptk, huff!='Class 1'))

clus0<-lmer(clen~c2.manner+hasv+condition+(1|subject)+(1|item),data=class23)
clus1<-lmer(clen~c2.manner+hasv*condition+(1|subject)+(1|item),data=class23)

## upshot: there are phonetic differences that cross-cut the 'class' systems
## yet, class system reflects something about patterns of gestural organisation in this language.


#####################################
## Not in paper: place order effects
#####################################

## appearance of v
round(xtabs(~porder+subject+condition,data=subset(ev,v>0)) / xtabs(~porder+subject+condition,data=ev),2)

porder.df<-as.data.frame(round(xtabs(~porder+subject+condition,data=subset(ev,v>0)) / xtabs(~porder+subject+condition,data=ev),2))
porder.df$subject<-factor(porder.df$subject,levels=c('s1','s2','s3','s4','s5','s6','s7','s8','s9','s10'))
## by subject
ggplot(data=porder.df, aes(x = condition, y = Freq, fill = porder)) + 
    geom_bar(position = position_dodge(), stat="identity") + facet_wrap(~subject, nrow=2) + 
    ylab(expression(paste("proportion of clusters with ", symbol("\044")))) + 
    theme(legend.position="top",panel.grid.major = element_blank()) 
## over subjects
ggplot(data=porder.df, aes(x = condition, y = Freq, fill = porder)) + 
    geom_bar(position = position_dodge(), stat="identity") + ylab
    (expression(paste("proportion of clusters with ", symbol("\044")))) + 
    theme(legend.position="top",panel.grid.major = element_blank()) 

## model
has1<-glmer(hasv~porder*condition+(1+condition|subject)+(1+condition|item),family=binomial(link="logit"),data=ev)

## duration of v
## by subject
pdur <- melt(data=subset(ev,v>0), id.var=c("subject", "condition", "porder"), measure.vars=c("v"))
pdur.mean <- ddply(pdur, .(subject, condition, porder), summarise, "mean" = mean(value))
pdur.error <- ddply(pdur, .(subject, condition, porder), summarise, "sd" = sd(value), "se" = std.error(value), "ci" = std.error(value) * qt(0.95/2 + .5, nrow(pdur)-1))
pdur.sum <- na.omit(merge(pdur.mean, pdur.error, by=c("subject", "condition", "porder"), all=T))
pdur.sum$subject <- factor(pdur.sum$subject, levels=c('s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10'))
pdur.sum$condition <- factor(pdur.sum$condition, levels=c('slow', 'fast'))

ggplot(data=pdur.sum, aes(x = condition, y = mean, fill = porder)) + 
  geom_bar(position = position_dodge(), stat="identity") + 
  geom_errorbar(aes(ymin=mean-ci, ymax=mean+ci), position=position_dodge()) + 
  facet_wrap(~subject, nrow=2) + ylab("mean excr v length (ms)") + tstyle + scale_fill_grey(start=0.4, end=0.8)

## over subjects
pdur <- melt(data=subset(ev,v>0), id.var=c("condition", "porder"), measure.vars=c("v"))
pdur.mean <- ddply(pdur, .(condition, porder), summarise, "mean" = mean(value))
pdur.error <- ddply(pdur, .(condition, porder), summarise, "sd" = sd(value), "se" = std.error(value), "ci" = std.error(value) * qt(0.95/2 + .5, nrow(pdur)-1))
pdur.sum <- na.omit(merge(pdur.mean, pdur.error, by=c("condition", "porder"), all=T))
pdur.sum$condition <- factor(pdur.sum$condition, levels=c('slow', 'fast'))

ggplot(data=pdur.sum, aes(x = condition, y = mean, fill = porder)) + 
    geom_bar(position = position_dodge(), stat="identity") + 
    geom_errorbar(aes(ymin=mean-ci, ymax=mean+ci), position=position_dodge()) + 
    ylab("mean excr v length (ms)") + tstyle + scale_fill_grey(start=0.4, end=0.8)

## model
m1<-lmer(v~porder*condition+(1+condition|subject)+(1+condition|item),data=subset(ev, v>0))
## porder is not significant, nor is the interaction term