# R code for Kirby and Yu (2007) 'Lexical and phonotactic effects on wordlikeness judgments in Cantonese'
# Version: 19 April 2017
# Tested with: R version 3.2.4 (2016-03-10) "Very Secure Dishes"
# Written by: James Kirby <j.kirby@ed.ac.uk>

# This R code, together with the accompanying .txt file, largely reproduces the 
# analyses and figures of the 2007 ICPhS paper. Discrepancies, where present,
# have been noted.

# The raw (non-averaged) response data are also available as part of this dataset at

#   http://lel.ed.ac.uk/~jkirby/cantonese/results1_10.txt

# This code is (largely) designed to replicate what is in the paper. This is not 
# an endorsement of the analysis; we would do things rather differently now.
# By making the data and code available, we hope to encourage alternative analyses of
# these data.


library(plyr)
library(gplots)

# SEM: standard error of the mean, i.e. sd(x) / sqrt(n).
#   x     - numeric vector of observations
#   na.rm - if TRUE, missing values are dropped before computing BOTH the
#           standard deviation and n; the default (FALSE) keeps the original
#           behaviour, where any NA in x yields NA
# Returns a single numeric value.
std.error <- function(x, na.rm = FALSE) {
  if (na.rm) {
    # n must count only the values that sd() actually sees
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}

# load data
# Reads the tab-delimited table straight from the project URL (needs network
# access). NOTE(review): the data frame shares its name with base::all(); R
# still finds the function when it is called, but the name is easy to misread.
all <- read.delim(
  file = "http://lel.ed.ac.uk/~jkirby/cantonese/average2.txt"
)

# slightly updated for 2017
# Treat the three code columns as categorical.
all$Code1 <- factor(all$Code1)
all$Code2 <- factor(all$Code2)
all$Code3 <- factor(all$Code3)

# Derive a syllable-type label from Code2. Every row starts as 'Lex'; rows
# whose Code2 falls in one of the sets below are relabelled.
# The mask is applied to the column (all$syl_type[mask] <- value) rather than
# to a row subset (all[mask, ]$syl_type <- value): the row-subset form stops
# with "replacement has 1 row, data has 0" whenever a code set matches no
# rows, and %in% never yields NA, so missing Code2 values simply stay 'Lex'.
all$syl_type <- 'Lex'
all$syl_type[all$Code2 %in% '1'] <- 'Acc'
all$syl_type[all$Code2 %in% c('2', '3')] <- 'Tone'
all$syl_type[all$Code2 %in% c('4', '5', '6')] <- 'Lab'
all$syl_type[all$Code2 %in% c('7', '8')] <- 'Cor'
all$syl_type[all$Code2 %in% c('9', '10')] <- 'Mul'

# Figure 1
# One row per syllable type with the mean and standard error of the
# Zscorearcsin column, sorted from the highest mean to the lowest.
means <- ddply(
  all,
  "syl_type",
  summarise,
  mean = mean(Zscorearcsin),
  sem = std.error(Zscorearcsin)
)
means <- means[order(means$mean, decreasing = TRUE), ]

# graphics device
# quartz() is only operational on macOS builds with Quartz support, so use it
# when available and otherwise open the platform's default device at the same
# 6 x 6 inch size.
if (capabilities("aqua")[["aqua"]]) {
  quartz(width = 6, height = 6)
} else {
  dev.new(width = 6, height = 6)
}
mybarcol <- "grey20"

# Bar chart of the per-type means with +/- 1 SEM intervals.
# this seems to reverse the error bars for Tone and Acc in Fig 1 of the paper
mp <- barplot2(
  means$mean,
  names.arg = means$syl_type,
  ylab = 'Zscorearcsin',
  xlab = 'syllable type',
  plot.ci = TRUE,
  ci.l = (means$mean - means$sem),
  ci.u = (means$mean + means$sem),
  plot.grid = TRUE,
  grid.inc = 9,
  col = "grey",
  col.sub = mybarcol,
  # the legend only spells out the abbreviations used as bar labels
  legend.text = c(
    'Lex=Lexical', 'Tone=Onset-Tone', 'Acc=Accidental',
    'Lab=Labial', 'Mul=Multiple', 'Cor=Coronal'
  )
)


# Figure 2
# Split the items on Code1: rows coded 0 become `words`, all other rows `non`.
words <- subset(all, Code1 == 0)
non <- subset(all, Code1 != 0)

# Scatterplot of Zscorearcsin against nnd: words as solid circles (pch 19),
# nonwords as open circles (pch 21), with a least-squares line fitted to all
# items. NOTE(review): the axis limits are taken from the words alone, so any
# nonword falling outside that range is not drawn.
plot(
  x = words$nnd,
  y = words$Zscorearcsin,
  pch = 19,
  xlab = "weighted neighborhood density",
  ylab = "Zscorearcsin"
)
points(x = non$nnd, y = non$Zscorearcsin, pch = 21)
legend(x = 300, y = -0.5, legend = c("words", "nonwords"), pch = c(19, 21))
abline(reg = lm(Zscorearcsin ~ nnd, data = all))


# Statistics in running text

# Two-sample Wilcoxon tests on the goodness column: nonwords vs. words first,
# then the 'Acc' vs. 'Tone' syllable types.
wilcox.test(x = non$goodness, y = words$goodness)
# can only assume 39217.5 was a typo??
wilcox.test(
  x = subset(all, syl_type == "Acc")$goodness,
  y = subset(all, syl_type == "Tone")$goodness
)
# otoh none of the U statistics match exactly

# full regression
# Zscorearcsin predicted from both nnd and ptpnt_token, over all items.
full <- lm(
  formula = Zscorearcsin ~ nnd + ptpnt_token,
  data = all
) # matches
# partial regression
# Same response with nnd as the only predictor.
nd_only <- lm(
  formula = Zscorearcsin ~ nnd,
  data = all
) # dfs match but adj R2 does not

# descriptive stats match
# Mean and standard deviation of nnd, for words and then for nonwords.
# Left as bare top-level expressions so the values print when run interactively.
mean(words[["nnd"]])
sd(words[["nnd"]])
mean(non[["nnd"]])
sd(non[["nnd"]])

# groupwise regressions
# The same two-predictor model as the full regression, fitted separately to
# each subset.
# words match, but reported R^2 is not actually adjusted
word.model <- lm(
  formula = Zscorearcsin ~ nnd + ptpnt_token,
  data = words
)
# nonwords match
non.model <- lm(
  formula = Zscorearcsin ~ nnd + ptpnt_token,
  data = non
)
# nonwords again, with nnd as the only predictor
non.model2 <- lm(
  formula = Zscorearcsin ~ nnd,
  data = non
)