# Run this to make sure you have the latest version of R and all the packages we'll use:
if( !( "installr" %in% installed.packages()[,"Package"] ) ){ install.packages("installr") } 
library(installr); updateR()
pkgs <- c( "lme4", "languageR", "devtools", "vioplot", "lmerTest", "car", "multcomp", "e1071" )
pkgs <- pkgs[ !(pkgs %in% installed.packages()[,"Package"]) ]
if( length(pkgs)>0 ){ install.packages( pkgs[ !(pkgs %in% installed.packages()[,"Package"]) ] ) }
if( !( "yarrr" %in% installed.packages()[,"Package"] ) ){ library(devtools); install_github("ndphillips/yarrr") }


















#######################
### LOGISTIC MODELS ###
#######################

### A plot illustrating the problem with converting categorical data into percentage

# make some fake subject accuracy data
subj_percentages <- rnorm( n=24, mean=70, sd=20 )
subj_percentages <- ifelse( subj_percentages>100, 100, ifelse( subj_percentages<0, 0, subj_percentages ) )

# make some fake proficiency data correlated with it
correlatedValue = function(x, r){
  r2 = r**2
  ve = 1-r2
  SD = sqrt(ve)
  e  = rnorm(length(x), mean=0, sd=SD)
  y  = r*x + e
  return(y)
}
proficiency <- correlatedValue( subj_percentages, .05 )

# make a plot to show that linear regression predicts Y-values below 0 and above 100%
plot( proficiency, subj_percentages, xlim=c( min(proficiency)-.5*(range(proficiency)[2]-range(proficiency)[1]), max(proficiency)+.5*(range(proficiency)[2]-range(proficiency)[1]) ), ylim=c(-50, 150), xlab="proficiency", ylab="accuracy" )
abline( lm(subj_percentages~proficiency), col="red", lwd=2 )
abline( 100, 0 )
abline( 0, 0 )


### Make a plot to show the relationship between proportions and log odds, i.e., that proportions
###	are bounded between 1 and 0 but log odds are continuous

par( mfrow=c(1,2) )

# A range of log odds values
lo <- -8:8

# Plots the corresponding proportions
plot( lo, exp(lo) / (1+exp(lo)), xlab="log odds", ylab="proportion" ) 

# Same idea, the other way around
props <- seq(.01,.99,by=.01)
plot( props, log(1/(1/props-1)), xlab="proportion", ylab="log odds" )




### Do a model

# Load some data

# load sample dataset
load( url( "http://www.polyu.edu.hk/cbs/sjpolit/UCL_Rworkshop/File_S2.RData" ) )

# clean up data
resp <- Respdata[ Respdata$Quantifier != "filler", ]
resp$Item <- factor( resp$Item )
resp$Quantifier <- factor( resp$Quantifier )
resp$Boundedness <- factor( resp$Boundedness )

# Deviation-coded model, so the intercept represents the grand mean and we can see if people were better than chance overall
summary( model <- glmer( Accuracy ~ Quantifier*Boundedness + (1|Subject) + (1|Item), resp, family="binomial", contrasts=list( Quantifier=contr.sum(2)/2, Boundedness=contr.sum(2)/2 ) ) )

# We know that the z-statistic for the intercept is comparing the intercept vs. 0. Let's run this (plugging a hypothetical
#	zero intercept into the formula we know for converting odds to proportions) to demonstrate that odds of 0 means 50%:
exp(0) / (1 + exp(0) )

# now let's see what our intercept corresponds to in percent accuracy terms
exp( 3.1936 ) / ( 1 + exp(3.1936) )

# Let's try comparing this coefficient against a hypothetical chance level of 25% instead of 50%
proportion_chance <- .25
logodds_chance <- log( 1 / ( (1/proportion_chance) - 1 ) )
coefficients <- summary(model)$coefficients
b <- coefficients["(Intercept)","Estimate"]
SE <- coefficients["(Intercept)","Std. Error"]
(z <- (b - logodds_chance)/SE)
( p <- 2 * (1 - pnorm(z)) )

# Dummy-coded model so we can compare specific pairs of conditions
summary( model <- glmer( Accuracy ~ Quantifier*Boundedness + (1|Subject) + (1|Item), resp, family="binomial" ) )

# getting the proportion for the baseline condition
logodds_baseline <- fixef(model)["(Intercept)"]
( proportion_baseline <- exp(logodds_baseline) / ( 1 + exp(logodds_baseline) ) )

# getting the proportion for the simple effect of Quantifier at this level of Boundendess
logodds_comparison <- fixef(model)["(Intercept)"] + fixef(model)["Quantifiersome of them"]
( proportion_comparison <- exp(logodds_comparison) / ( 1 + exp(logodds_comparison) ) )

# There we saw that the comparison condition is about .5% less accurate than the baseline condition. Note that the conversion
#	from log odds to proportion requires the actual predicted value, not just the coefficient. As we see below, just
#	converting the coefficient would give us the wrong value
( incorrectly_calculated_proportion_difference <- exp(fixef(model)["Quantifiersome of them"]) / ( 1 + exp(fixef(model)["Quantifiersome of them"]) ) )
















###################
### ORDINAL IVs ###
###################

# load implicit priming data
load( url( "http://www.polyu.edu.hk/cbs/sjpolit/UCL_Rworkshop/implicitpriming.RData" ) )

# show the effect of itemrepetitionnumber
tapply( trim$RT, trim$ItemRepetitionNumber, mean )
tapply( trim$RT, trim$ItemRepetitionNumber, sd )
xtabs( ~ ItemRepetitionNumber, trim )

# show that the ordinal model doesn't capture the effect very well when our ordinal data aren't nicely distributed
summary( lmer( RT ~ ItemRepetitionNumber + (ItemRepetitionNumber|Subject) + (ItemRepetitionNumber|Item), trim ) )

# It gets a little better if we exclude the cell with only one datapoint, but still not great (note that it's saying the
#	linear component is 100ms, which is bigger than what we saw in the cell means)
summary( lmer( RT ~ ItemRepetitionNumber + (ItemRepetitionNumber|Subject) + (ItemRepetitionNumber|Item), trim[ trim$ItemRepetitionNumber<3, ] ) ) 

# Does the best if we just recode this variable as binary
summary( lmer( RT ~ Repeated + (Repeated|Subject) + (Repeated|Item), trim ) )


















###################
### ORDINAL DVs ###
###################

# example with built-in data
library(ordinal)
summary( clmm( rating ~ temp + (temp|judge) + (temp|bottle), wine ) )




















######################
### Reshaping data ###
######################


# load wide-format EEG data
load( url( "http://www.polyu.edu.hk/cbs/sjpolit/UCL_Rworkshop/eegdata.RData" ) )

# show it
( eegdata_wide <- eegdata )

# convert to long format
library(reshape2)
eegdata <- melt(eegdata_wide)

# But we only have a column indicating condition, not columns indicating the various factors...
head( eegdata )

# Here I use the condition names (e.g. "T2T3_A") and split them along the underscores to create two columns, one for each within-subjects ANOVA factor
spl <- strsplit( as.character(eegdata$variable), "_" )
eegdata$Contrast <- factor( unlist( lapply( spl, function(x){x[[1]]} ) ) )
eegdata$Direction <- factor( unlist( lapply( spl, function(x){x[[2]]} ) ) )




dcast( eegdata, Subject+Group ~ Contrast+Direction, value.var="value", mean )