# Generate some 'data' with normally distributed residuals, where the two
# variables are known (by construction) to have a certain relationship.
# set.seed() makes the simulated data reproducible across runs.
set.seed(42)
dtr <- rnorm(n = 100, mean = 30, sd = 4)
density <- rnorm(n = 100, mean = 0.5, sd = 0.1) * dtr

# Examine the data in three ways: print both vectors, combine them into a
# data frame, and plot a histogram
dtr
density
data1 <- data.frame(dtr, density)
hist(density, col = "light grey")

# Set the plot frame to have one row & one column (only needed if you are
# changing this occasionally) and plot the data
par(mfrow = c(1, 1))
plot(dtr, density, pch = 19, col = 'blue')

# Pretty the plot up a little by modifying the x-axis label (xlab) and
# y-axis label (ylab)
plot(dtr, density, pch = 19, col = 'blue',
     xlab = 'Distance to Road (m)',
     ylab = 'Population Density (ind/m2)')

# Examine summary statistics for these two variables.
# There are functions for descriptive statistics in the R base stats
# package, though some descriptive statistics require combining two or
# more of these base functions.
mean(density)
sd(density)
se <- sd(density) / sqrt(length(density))  # standard error of the mean
se
cv <- sd(density) / mean(density)          # coefficient of variation
cv

# As with most things in R, packages can be installed to make this faster
# and easier. The package 'pastecs' must be installed to use stat.desc().
library(pastecs)
stat.desc(density)

# Now, fit an ORDINARY LEAST SQUARES (OLS) linear regression model to the
# data with lm().
mod1 <- lm(density ~ dtr)

# Add the regression line to the plot
abline(mod1, col = 'blue')

# Look at a summary of the regression model's results
summary(mod1)

# Understanding the t-test and coefficient of determination:
# add a horizontal reference line at the mean of density (the null model)
abline(mean(density), 0, col = 'red', lty = 3)

# The same scatterplot and fitted line via ggplot2, with a confidence band.
# (Axis labels corrected to match how dtr and density were simulated above:
# distance to road in metres, density in individuals per m2.)
library(ggplot2)
ggplot(data1, aes(x = dtr, y = density)) +
  geom_point(colour = 'red', size = 5) +
  geom_smooth(method = lm, colour = 'black', fill = 'red', alpha = 0.25) +
  ylab("Population Density (ind/m2)") +
  xlab("Distance to Road (m)")

# Shift from simple OLS regression to multiple OLS regression
veg <- rnorm(n = 100, mean = 0.7, sd = 0.1) * dtr
mod1A <- lm(density ~ dtr + veg)
summary(mod1A)

# Consider collinearity when interpreting the multiple regression results
cor(dtr, veg)  # often there are correlations between predictor variables
mod1C <- lm(density ~ veg)
summary(mod1C)
# ... leading to inconsistent estimates of effects, depending on other
# effects already in the regression model

# Quick, simple plot of the relationships with the scatterplot3d()
# function of the package with the same name
library(scatterplot3d)
scatterplot3d(density ~ dtr + veg)

# Now leave the OLS framework and instead fit a linear regression model
# using MAXIMUM LIKELIHOOD with glm(family = gaussian).
mod2 <- glm(density ~ dtr, family = gaussian)
summary(mod2)
mod1 <- lm(density ~ dtr)
summary(mod1)
# Compare the output for mod2 and mod1
# GLM can model data for which OLS is not appropriate.
# family = gaussian(link = "identity")
#   Appropriate for a continuous, normally distributed dependent variable;
#   identical to OLS regression.
# family = binomial(link = "logit")
#   Appropriate for dependent variables that are binomially distributed,
#   such as survival (lived vs died), occupancy (present vs absent), or
#   detection (observed or not, when present).
# family = poisson(link = "log")
#   Appropriate for dependent variables that are counts (integers only),
#   such as group size.
survive <- c(0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1)
home.range.quality <- seq(0, 1, 0.1)

# Fit an **inappropriate** OLS linear model using lm()
mod4b <- lm(survive ~ home.range.quality)
summary(mod4b)
plot(home.range.quality, survive, pch = 19, ylim = c(-0.2, 1.2),
     main = 'Inappropriate lm() fit to binomial Y')
points(home.range.quality, fitted(mod4b), col = 2, pch = 19)
lines(home.range.quality, fitted(mod4b), col = 2)

# Fit an appropriate logistic regression using glm()
mod4a <- glm(survive ~ home.range.quality, family = binomial)
plot(home.range.quality, survive, pch = 19, ylim = c(-0.2, 1.2),
     main = 'Appropriate glm() fit to binomial Y')
points(home.range.quality, fitted(mod4a), col = 2, pch = 19)
summary(mod4a)

# Compare the slope from this glm to the slope from the OLS regression.
# Because there is a logit 'link function' (a transformation of the
# Y-variable), we must first back-transform the estimate from the logit
# (log-odds) scale to the original scale of measurement.
# plogis() is base R's inverse logit (no extra package needed, unlike
# inv.logit() which requires 'boot' or 'gtools'), and extracting the
# coefficient from the fitted model avoids a hard-coded magic number.
slope <- plogis(coef(mod4a)[["home.range.quality"]])
slope

# Model selection using AIC scores.
# First, clean up by removing all existing objects from the workspace.
# (NOTE: rm(list = ls()) wipes the entire workspace — acceptable in a
# teaching script, but avoid it in reusable code.)
rm(list = ls(all = TRUE))

# To read a file into a data frame, make sure the file is in the
# directory you specify in the read command
kenya.herdsize <- read.table("kenyaherdsize3.txt", header = TRUE,
                             sep = ",", fill = TRUE, dec = ".")
reduced.data <- subset(kenya.herdsize, DistPred < 2.0,
                       select = c(DistPred, GroupSize, Species,
                                  BushWoodGrass, HabOpen.Close))

# Model selection using AIC scores requires the following steps:
# 1. Develop a set of hypotheses about the factors that affect herd size.
# 2. Collect the data required.
# 3. Fit a model for each hypothesis in the set.
# 4. Compare the models using AIC scores.
# 5. Use the information from step 4 to estimate regression coefficients
#    by 'model-averaging'.

# Here is a simple set of four hypotheses to evaluate:
# Model A: Wildebeest group size is affected by differences in vegetation
#          structure in woodland, bushland and grassland.
# Model B: Wildebeest group size is affected by vegetation structure in a
#          simpler manner, depending only on whether the habitat is open
#          or closed.
# Model C: Wildebeest group size is affected by the proximity of predators.
# Model D: Wildebeest group size is affected by proximity of predators and
#          by the differences among woodland, bushland and grassland.

# Fit the four models using glm()
wildebeest.only <- subset(reduced.data, Species == 'Wildbst',
                          select = c(GroupSize, BushWoodGrass,
                                     HabOpen.Close, DistPred))
modA <- glm(formula = GroupSize ~ BushWoodGrass, data = wildebeest.only)
modB <- glm(formula = GroupSize ~ HabOpen.Close, data = wildebeest.only)
modC <- glm(formula = GroupSize ~ DistPred, data = wildebeest.only)
modD <- glm(formula = GroupSize ~ DistPred + BushWoodGrass,
            data = wildebeest.only)

# Compare AIC scores for the 4 models using functions in the
# MuMIn (Multi-Model Inference) package:
library(MuMIn)
Cand.mods <- list(modA, modB, modC, modD)
aictab <- model.sel(Cand.mods)
aictab
print.data.frame(aictab, digits = 2)

# Obtain estimates of the regression coefficients averaged across models,
# with each value weighted by the AIC weight for the model that
# contained it:
x <- model.avg(Cand.mods, beta = TRUE, revised.var = TRUE)
summary(x, digits = 3)

# GLM can model data for which OLS is not appropriate.
# family = gaussian(link = "identity") - same as OLS regression.
#   Appropriate for normally distributed variables.
# family = binomial(link = "logit")
#   Appropriate for dependent variables that are binomially distributed,
#   such as survival (lived vs died) or occupancy (present vs absent).
# family = poisson(link = "log")
#   Appropriate for dependent variables that are counts (integers only),
#   such as group size.