# Generate some 'data' with normally distributed residuals, where the two
# variables are known (by construction) to have a certain relationship.
# set.seed() makes the simulated data reproducible across runs.
set.seed(42)
dtr <- rnorm(n = 100, mean = 30, sd = 4)
density <- rnorm(n = 100, mean = 0.5, sd = 0.1) * dtr

# Examine the data in three ways: print both vectors, combine them into a
# data frame, and plot a histogram
dtr
density
data1 <- data.frame(dtr, density)
hist(density, col = "light grey")

# Set the plot frame to have one row & one column (only needed if you are
# changing this occasionally) and plot the data
par(mfrow = c(1, 1))
plot(dtr, density, pch = 19, col = 'blue')

# Pretty the plot up a little by modifying the x-axis label (xlab) and
# y-axis label (ylab)
plot(dtr, density, pch = 19, col = 'blue',
     xlab = 'Distance to Road (m)',
     ylab = 'Population Density (ind/m2)')

# Examine summary statistics for these two variables.
# There are functions for descriptive statistics in the R base stats
# package, though some descriptive statistics require combining two or
# more of these base functions.
mean(density)
sd(density)
se <- sd(density) / sqrt(length(density))  # standard error of the mean
se
cv <- sd(density) / mean(density)          # coefficient of variation
cv

# As with most things in R, packages can be installed to make this faster
# and easier. The package 'pastecs' must be installed to use stat.desc().
library(pastecs)
stat.desc(density)

# Now, fit an ORDINARY LEAST SQUARES (OLS) linear regression model to the
# data with lm().
mod1 <- lm(density ~ dtr)

# Add the regression line to the plot
abline(mod1, col = 'blue')

# Look at a summary of the regression model's results
summary(mod1)

# Understanding the t-test and coefficient of determination:
# add a horizontal reference line at the mean of density (the null model)
abline(mean(density), 0, col = 'red', lty = 3)

# The same scatterplot and fitted line via ggplot2, with a confidence band.
# (Axis labels corrected to match how dtr and density were simulated above:
# distance to road in metres, density in individuals per m2.)
library(ggplot2)
ggplot(data1, aes(x = dtr, y = density)) +
  geom_point(colour = 'red', size = 5) +
  geom_smooth(method = lm, colour = 'black', fill = 'red', alpha = 0.25) +
  ylab("Population Density (ind/m2)") +
  xlab("Distance to Road (m)")

# Shift from simple OLS regression to multiple OLS regression
veg <- rnorm(n = 100, mean = 0.7, sd = 0.1) * dtr
mod1A <- lm(density ~ dtr + veg)
summary(mod1A)

# Consider collinearity when interpreting the multiple regression results
cor(dtr, veg)  # often there are correlations between predictor variables
mod1C <- lm(density ~ veg)
summary(mod1C)
# ... leading to inconsistent estimates of effects, depending on other
# effects already in the regression model

# Quick, simple plot of the relationships with the scatterplot3d()
# function of the package with the same name
library(scatterplot3d)
scatterplot3d(density ~ dtr + veg)

# Now leave the OLS framework and instead fit a linear regression model
# using MAXIMUM LIKELIHOOD with glm(family = gaussian).
mod2 <- glm(density ~ dtr, family = gaussian)
summary(mod2)
mod1 <- lm(density ~ dtr)
summary(mod1)
# Compare the output for mod2 and mod1
# GLM can model data for which OLS is not appropriate.
# family = gaussian(link = "identity")
#   Appropriate for a continuous, normally distributed dependent variable;
#   identical to OLS regression.
# family = binomial(link = "logit")
#   Appropriate for dependent variables that are binomially distributed,
#   such as survival (lived vs died), occupancy (present vs absent), or
#   detection (observed or not, when present).
# family = poisson(link = "log")
#   Appropriate for dependent variables that are counts (integers only),
#   such as group size.
survive <- c(0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1)
home.range.quality <- seq(0, 1, 0.1)

# Fit an **inappropriate** OLS linear model using lm()
mod4b <- lm(survive ~ home.range.quality)
summary(mod4b)
plot(home.range.quality, survive, pch = 19, ylim = c(-0.2, 1.2),
     main = 'Inappropriate lm() fit to binomial Y')
points(home.range.quality, fitted(mod4b), col = 2, pch = 19)
lines(home.range.quality, fitted(mod4b), col = 2)

# Fit an appropriate logistic regression using glm()
mod4a <- glm(survive ~ home.range.quality, family = binomial)
plot(home.range.quality, survive, pch = 19, ylim = c(-0.2, 1.2),
     main = 'Appropriate glm() fit to binomial Y')
points(home.range.quality, fitted(mod4a), col = 2, pch = 19)
summary(mod4a)

# Compare the slope from this glm to the slope from the OLS regression.
# Because there is a logit 'link function' (a transformation of the
# Y-variable), we must first back-transform the estimate from the logit
# (log-odds) scale to the original scale of measurement.
# plogis() is base R's inverse logit (no extra package needed, unlike
# inv.logit() which requires 'boot' or 'gtools'), and extracting the
# coefficient from the fitted model avoids a hard-coded magic number.
slope <- plogis(coef(mod4a)[["home.range.quality"]])
slope

# Model selection using AIC scores.
# First, clean up by removing all existing objects from the workspace.
# (NOTE: rm(list = ls()) wipes the entire workspace — acceptable in a
# teaching script, but avoid it in reusable code.)
rm(list = ls(all = TRUE))

# To read a file into a data frame, make sure the file is in the
# directory you specify in the read command
kenya.herdsize <- read.table("kenyaherdsize3.txt", header = TRUE,
                             sep = ",", fill = TRUE, dec = ".")
reduced.data <- subset(kenya.herdsize, DistPred < 2.0,
                       select = c(DistPred, GroupSize, Species,
                                  BushWoodGrass, HabOpen.Close))

# Model selection using AIC scores requires the following steps:
# 1. Develop a set of hypotheses about the factors that affect herd size.
# 2. Collect the data required.
# 3. Fit a model for each hypothesis in the set.
# 4. Compare the models using AIC scores.
# 5. Use the information from step 4 to estimate regression coefficients
#    by 'model-averaging'.

# Here is a simple set of four hypotheses to evaluate:
# Model A: Wildebeest group size is affected by differences in vegetation
#          structure in woodland, bushland and grassland.
# Model B: Wildebeest group size is affected by vegetation structure in a
#          simpler manner, depending only on whether the habitat is open
#          or closed.
# Model C: Wildebeest group size is affected by the proximity of predators.
# Model D: Wildebeest group size is affected by proximity of predators and
#          by the differences among woodland, bushland and grassland.

# Fit the four models using glm()
wildebeest.only <- subset(reduced.data, Species == 'Wildbst',
                          select = c(GroupSize, BushWoodGrass,
                                     HabOpen.Close, DistPred))
modA <- glm(formula = GroupSize ~ BushWoodGrass, data = wildebeest.only)
modB <- glm(formula = GroupSize ~ HabOpen.Close, data = wildebeest.only)
modC <- glm(formula = GroupSize ~ DistPred, data = wildebeest.only)
modD <- glm(formula = GroupSize ~ DistPred + BushWoodGrass,
            data = wildebeest.only)

# Compare AIC scores for the 4 models using functions in the
# MuMIn (Multi-Model Inference) package:
library(MuMIn)
Cand.mods <- list(modA, modB, modC, modD)
aictab <- model.sel(Cand.mods)
aictab
print.data.frame(aictab, digits = 2)

# Obtain estimates of the regression coefficients averaged across models,
# with each value weighted by the AIC weight for the model that
# contained it:
x <- model.avg(Cand.mods, beta = TRUE, revised.var = TRUE)
summary(x, digits = 3)

# GLM can model data for which OLS is not appropriate.
# family = gaussian(link = "identity") - same as OLS regression.
#   Appropriate for normally distributed variables.
# family = binomial(link = "logit")
#   Appropriate for dependent variables that are binomially distributed,
#   such as survival (lived vs died) or occupancy (present vs absent).
# family = poisson(link = "log")
#   Appropriate for dependent variables that are counts (integers only),
#   such as group size.