###----------------------------------------###
### BRM Session 5 - Descriptive statistics ###
###----------------------------------------###
# Dennis Abel and Lukas Birkenmaier
# Intro and setup ---------------------------------------------------------
# In this script, we want to address major univariate and bivariate approaches for
# visual and numerical summary statistics.
# Based on chapter 5 of Nav arro, Danielle "Learning statistics with R: A tutorial for psychology
# students and other beginners (Version 0.6)"
# available at https://learningstatisticswithr.com/lsr-0.6.pdf
# Load required packages
library(tidyverse) # Many relevant functions including ggplot
library(epiDisplay) # for frequency tables for categorical variables
library(vtable) # For descriptive statistics tables
library(gtsummary) # For a nice cross-table
library(vcd) # For mosaic-plots
# Working directory
setwd("[insert directory here]")
# Use read_rds-function to load rds data
oddjob <- read_rds("oddjob.rds")
# Univariate graphs and tables --------------------------------------------
# Barchart for country
ggplot(data = oddjob) +
geom_bar(mapping = aes(x = country))
# Create barchart of proportions instead
ggplot(data = oddjob) +
geom_bar(mapping = aes(x = country, y = ..prop.., group = 1))
# Histogram for number of flights
ggplot(data = oddjob, mapping = aes(x=nflights)) +
geom_histogram(binwidth=10, colour="black", fill="white") +
geom_vline(aes(xintercept=mean(nflights)), color="blue", linetype="dashed", size=1)+
geom_vline(aes(xintercept=median(nflights)), color="red", linetype="solid", size=1)
# Add density plot to histogram (age)
ggplot(data = oddjob, mapping = aes(x=age)) +
geom_histogram(aes(y=..density..), binwidth=10, colour="black", fill="white") +
geom_density(alpha=.2,fill="#FF6666")
# Add normal curve to histogram (nps)
ggplot(data = oddjob, mapping = aes(x=nps)) +
geom_histogram(aes(y=..density..), binwidth=1, colour="black", fill="white") +
stat_function(fun=dnorm,
args=list(mean=mean(oddjob_sub$nps),
sd=sd(oddjob_sub$nps)),
col="red",
size=1)
# Boxplot for age
ggplot(data = oddjob, mapping = aes(y=age)) +
geom_boxplot()
# Frequency tables for categorical variables
tab1(oddjob$country, sort.group="decreasing", cum.percent=TRUE)
# In the previous session we introduced the various measures of central tendency and dispersion.
# Instead of calculating these individually, we can also call functions in R which report
# these descriptive statistics jointly.
# The summary-function prints summary statistics for all variables depending on their class.
# For numerical variables, we get the min and max, the 1st and 3rd quartiles, median, and mean.
# For categorical (factor) variables, the function reports the number of observations for
# each value.
summary(oddjob)
# Alternative is to use the st-function from the vtable package
vtable::st(oddjob)
# Sort your dataset: first numerical variables and then factor variables for readability
# Check out documentation of the function in order to adjust the table for your own purposes
# Summ-option to specify which summary statistics to include
# Bivariate visualization and statistics ----------------------------------------------------
# Barcharts for two categorical variables
# Positions=fill is again stacking but with each bar the same height. This makes it easy
# to compare proportions across groups.
ggplot(data = oddjob) +
geom_bar(mapping = aes(x = country, fill = language), position = "fill")
# Position=dodge places objects next to each other
ggplot(data = oddjob) +
geom_bar(mapping = aes(x = country, fill = language), position = "dodge")
# Histogram for several variables: a grouping variable and a numerical variable
ggplot(data = oddjob, mapping = aes(x=nps, color=language)) +
geom_histogram(binwidth=1, fill="white", position="dodge", alpha=0.5)
# Facets often easier to read
ggplot(data = oddjob, mapping = aes(x=nps, color=language)) +
geom_histogram(binwidth=1, fill="white", position="dodge", alpha=0.5)+
facet_grid(~language)
# Boxplot for two variables: one categorical and one numerical and adjusted theme and color scheme
ggplot(data = oddjob, mapping = aes(x = flight_purpose, y = nps, color=flight_purpose)) +
geom_boxplot() +
labs(x="Flight purpose", y="NPS")+
scale_color_manual(values=c("#999999", "#E69F00"))+
theme_minimal()
# Fancy: mosaic plot for two or three categorical variables
mosaic(~gender+flight_class+status, data=oddjob,
shade=TRUE)
# Cross-tabulation of two categorical variables
tbl_cross(oddjob, row=gender, col=language, percent="cell")
# Scatterplot for two continuous variables (and possibly further grouping variables)
# Scatterplot for age, nflights and status
ggplot(data=oddjob)+
geom_point(mapping=aes(x=age,y=nflights, color=status))+
ylim(0,100)+
xlim(20,80) +
labs(x="Age in years", y="Number of flights")
# By adding "geom_smooth(method="lm")", we are drawing the linear regression
# line into the plot.
ggplot(data=oddjob, mapping=aes(x=age,y=nflights))+
geom_point()+
ylim(0,100)+
xlim(20,80) +
geom_smooth(method="lm")+
labs(x="Age in years", y="Number of flights")
# We can also do this for the separate status groups
ggplot(data=oddjob, mapping=aes(x=age,y=nflights, color=status))+
geom_point()+
ylim(0,100)+
xlim(20,80) +
geom_smooth(method="lm")+
labs(x="Age in years", y="Number of flights")
# Scatterplot for age, nflights and status and using facets
ggplot(data=oddjob, mapping=aes(x=age,y=nflights, color=status))+
geom_point()+
ylim(0,100)+
xlim(20,80) +
geom_smooth(method="lm")+
labs(x="Age in years", y="Number of flights") +
facet_wrap(~status)
# Correlation -------------------------------------------------------------
# The simplest way to calculate correlations in R for two variables is with the cor-function
# Let's investigate the relationship between reputation and overall satisfaction
cor(x=oddjob$reputation, y=oddjob$overall_sat)
# Calculate the coefficient of determination (R-squared)
cor(x=oddjob$reputation, y=oddjob$overall_sat)^2
# By default, Pearson's r is calculated. If you are working with ordinal data, you can specify
# Spearman's in the options.
cor(x=oddjob$reputation, y=oddjob$overall_sat, method="spearman")
# cor.test directly reports the significance test results
cor.test(x=oddjob$reputation, y=oddjob$overall_sat)
# You can also create a correlation matrix of all numeric variables. Let's subset the data
# to the numeric variables only:
oddjob_numeric <- subset(oddjob, select=c(age, nflights, nps, reputation, overall_sat,
commitment))
# The cor-function produces a quick overview about all correlations
cor(oddjob_numeric)
# Optional: Calculate covariance and correlation manually -----------------
# Manually calculate the covariance using the deviation (x minus mean(x)).
# Let's assume we want to investigate the relationship between age and commitment
# Let's investigate the means of the variables (sum of all x divided by number of x)
mean(oddjob$age)
mean(oddjob$commitment)
# Get devation for each value of our variables, which is x-mean(x)
deviation.age <- oddjob$age - mean(oddjob$age)
deviation.commitment <- oddjob$commitment - mean(oddjob$commitment)
# Get n (number of persons)
n <- length(oddjob$commitment)
# Create the upper part of the formula (remember, this was called cross-product deviations)
# and the lower part
crossprod <- deviation.age * deviation.commitment
crossprod
upperpart <- sum(crossprod)
upperpart
lowerpart <- n-1
lowerpart
my.covariance <- upperpart/lowerpart
my.covariance
# should be equal to the value generated by the "cov"-function
cov(oddjob$age,oddjob$commitment)
# Based on the covariance, we can manually calculate the correlation as well.
# Upperpart stays the same
# For the lower part we need (n-1) and the sd of x and y. We create a new object "lowerpart1"
# So we don't confuse it with the one used before
# Check the standard deviation of both variables
sd(oddjob$age)
sd(oddjob$commitment)
# Define lowerpart of correlation
lowerpart1 <- (n-1)*sd(oddjob$age)*sd(oddjob$commitment)
# Finalize the formula
my.correlation <- upperpart/lowerpart1
my.correlation
# should be equal to the value generated by the "cor"-function
cor(oddjob$age, oddjob$commitment)