Statistical Analysis
Statistical analysis uses formal methods to summarize data, draw inferences, and test hypotheses. This tutorial covers common statistical techniques in R.
Descriptive Statistics
Measures of Central Tendency
# Mean
mean(data$variable, na.rm = TRUE)
# Median
median(data$variable, na.rm = TRUE)
# Mode (custom function)
get_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
get_mode(data$variable)
# Weighted mean
weighted.mean(data$value, data$weight)
Measures of Dispersion
# Standard deviation
sd(data$variable, na.rm = TRUE)
# Variance
var(data$variable, na.rm = TRUE)
# Range
range(data$variable, na.rm = TRUE)
diff(range(data$variable, na.rm = TRUE))
# Interquartile range
IQR(data$variable, na.rm = TRUE)
# Coefficient of variation
cv <- function(x) sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
cv(data$variable)
Summary Statistics
# Complete summary
summary(data$variable)
# Custom summary function
library(dplyr)
data %>%
  summarize(
    mean = mean(variable, na.rm = TRUE),
    median = median(variable, na.rm = TRUE),
    sd = sd(variable, na.rm = TRUE),
    min = min(variable, na.rm = TRUE),
    max = max(variable, na.rm = TRUE),
    q25 = quantile(variable, 0.25, na.rm = TRUE),
    q75 = quantile(variable, 0.75, na.rm = TRUE)
  )
Hypothesis Testing
One-Sample T-Test
# Test if mean equals a specific value
t.test(data$variable, mu = 100)
# One-sided test
t.test(data$variable, mu = 100, alternative = "greater")
t.test(data$variable, mu = 100, alternative = "less")
Two-Sample T-Test
# Independent samples
t.test(variable ~ group, data = data)
t.test(data$group1, data$group2)
# Paired samples
t.test(data$before, data$after, paired = TRUE)
# Equal variances assumption
t.test(variable ~ group, data = data, var.equal = TRUE)
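# Optional check of the equal-variance assumption before using var.equal = TRUE;
# var.test() is base R's F test for equality of two variances (same hypothetical columns as above)
var.test(variable ~ group, data = data)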
Chi-Square Test
# Test of independence
chisq.test(table(data$var1, data$var2))
# Goodness of fit
observed <- c(10, 20, 30)
expected <- c(15, 20, 25)
chisq.test(observed, p = expected / sum(expected))
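# Optional assumption check: the chi-square approximation is unreliable when expected
# cell counts are small (a common rule of thumb is at least 5 per cell)
chisq.test(table(data$var1, data$var2))$expected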
ANOVA
# One-way ANOVA
aov_result <- aov(variable ~ factor, data = data)
summary(aov_result)
# Post-hoc tests
TukeyHSD(aov_result)
# Two-way ANOVA
aov_result2 <- aov(variable ~ factor1 * factor2, data = data)
summary(aov_result2)
Non-Parametric Tests
# Wilcoxon rank-sum test (Mann-Whitney)
wilcox.test(variable ~ group, data = data)
# Wilcoxon signed-rank test
wilcox.test(data$before, data$after, paired = TRUE)
# Kruskal-Wallis test
kruskal.test(variable ~ factor, data = data)
# Friedman test
friedman.test(variable ~ factor | block, data = data)
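# One possible post-hoc follow-up to a significant Kruskal-Wallis result:
# pairwise Wilcoxon tests with adjusted p-values (base R)
pairwise.wilcox.test(data$variable, data$factor, p.adjust.method = "bonferroni")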
Correlation Analysis
Pearson Correlation
# Correlation coefficient
cor(data$var1, data$var2, use = "complete.obs")
# Correlation test
cor.test(data$var1, data$var2)
# Correlation matrix
cor(data[, c("var1", "var2", "var3")], use = "complete.obs")
Spearman Correlation
# Spearman rank correlation
cor(data$var1, data$var2, method = "spearman")
cor.test(data$var1, data$var2, method = "spearman")
Partial Correlation
library(ppcor)
# Partial correlation (controlling for other variables)
pcor.test(data$var1, data$var2, data$control_var)
Regression Analysis
Simple Linear Regression
# Fit model
model <- lm(y ~ x, data = data)
summary(model)
# Model diagnostics
plot(model)
# Predictions
predict(model, newdata = data.frame(x = c(10, 20, 30)))
# Confidence intervals for coefficients
confint(model)
# Interval for the mean response vs. interval for a new observation
predict(model, newdata = data.frame(x = 10), interval = "confidence")
predict(model, newdata = data.frame(x = 10), interval = "prediction")
Multiple Linear Regression
# Multiple predictors
model <- lm(y ~ x1 + x2 + x3, data = data)
summary(model)
# Interaction terms
model <- lm(y ~ x1 * x2, data = data)
# Polynomial terms
model <- lm(y ~ x + I(x^2), data = data)
Logistic Regression
# Binary outcome
model <- glm(outcome ~ predictor, data = data, family = binomial)
summary(model)
# Odds ratios
exp(coef(model))
exp(confint(model))
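# Predicted probabilities (rather than log-odds) for the observations in the data
predict(model, type = "response")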
Model Selection
# Stepwise selection
null_model <- lm(y ~ 1, data = data)
full_model <- lm(y ~ ., data = data)
step_model <- step(null_model, scope = list(upper = full_model), direction = "both")
# AIC comparison
AIC(model1, model2)
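# For nested models, an F test via anova() is another comparison option
# (only valid when one model contains a subset of the other's terms)
anova(model1, model2)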
Time Series Analysis
library(forecast)
# Create time series object
ts_data <- ts(data$value, start = c(2020, 1), frequency = 12)
# Decompose
decompose(ts_data)
# Autocorrelation
acf(ts_data)
pacf(ts_data)
# ARIMA model
arima_model <- auto.arima(ts_data)
forecast(arima_model, h = 12)
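# Residual diagnostics for the fitted ARIMA model (Ljung-Box test and ACF plot)
checkresiduals(arima_model)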
Survival Analysis
library(survival)
# Kaplan-Meier estimator
surv_obj <- Surv(time = data$time, event = data$event)
km_fit <- survfit(surv_obj ~ group, data = data)
plot(km_fit)
# Log-rank test
survdiff(surv_obj ~ group, data = data)
# Cox proportional hazards
cox_model <- coxph(surv_obj ~ predictor, data = data)
summary(cox_model)
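# Check the proportional hazards assumption of the Cox model
cox.zph(cox_model)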
Power Analysis
library(pwr)
# T-test power
pwr.t.test(d = 0.5, power = 0.8, sig.level = 0.05)
# ANOVA power
pwr.anova.test(k = 3, f = 0.25, power = 0.8, sig.level = 0.05)
# Correlation power
pwr.r.test(r = 0.3, power = 0.8, sig.level = 0.05)
Effect Sizes
library(effectsize)
# Cohen's d
cohens_d(data$group1, data$group2)
# Eta squared
eta_squared(aov_result)
# Cramér's V
cramers_v(table(data$var1, data$var2))
Example: Complete Statistical Analysis
library(dplyr)
library(broom)
# Load data
data <- read.csv("data.csv")
# 1. Descriptive statistics
summary_stats <- data %>%
  group_by(group) %>%
  summarize(
    n = n(),
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    median = median(value, na.rm = TRUE),
    q25 = quantile(value, 0.25, na.rm = TRUE),
    q75 = quantile(value, 0.75, na.rm = TRUE)
  )
# 2. Normality check (ideally applied within each group or to model residuals)
shapiro.test(data$value)
# 3. Compare groups
# If normal: t-test
t_test <- t.test(value ~ group, data = data)
tidy(t_test)
# If not normal: Wilcoxon test
wilcox_test <- wilcox.test(value ~ group, data = data)
tidy(wilcox_test)
# 4. Correlation
cor_test <- cor.test(data$var1, data$var2)
tidy(cor_test)
# 5. Regression
model <- lm(value ~ predictor1 + predictor2, data = data)
summary(model)
tidy(model)
glance(model)
# 6. Model diagnostics
plot(model)
Best Practices
- Check assumptions: Verify normality, variance homogeneity, and independence before running parametric tests (see the sketch after this list)
- Choose appropriate tests: Match the test to the data type and its distribution
- Report effect sizes: Report effect sizes and confidence intervals, not just p-values
- Multiple comparisons: Adjust p-values when performing many tests (see the sketch after this list)
- Interpret results: Explain what estimates and intervals mean in context, not only whether p < 0.05
- Document methods: Record every analysis performed, including decisions made along the way
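A minimal sketch of the first and fourth points, assuming the same hypothetical value, group, and model objects used in the example above:
# Assumption checks before a parametric test
shapiro.test(residuals(model))             # normality of residuals
bartlett.test(value ~ group, data = data)  # homogeneity of variance across groups
# Adjusting p-values for multiple testing
p_values <- c(0.01, 0.04, 0.03, 0.20)      # illustrative p-values from several tests
p.adjust(p_values, method = "holm")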
Next Steps
Learn about Data Visualization to effectively communicate your statistical findings.