Exploratory Data Analysis

Exploratory Data Analysis (EDA) is the process of investigating data to understand its main characteristics, discover patterns, spot anomalies, and check assumptions. It's typically done before formal modeling.

Goals of EDA

Understand the data: Structure, types, distributions
Detect patterns: Trends, relationships, clusters
Identify anomalies: Outliers, missing data, inconsistencies
Generate hypotheses: Form ideas for further analysis
Check assumptions: Validate assumptions for statistical tests

Univariate Analysis

Categorical Variables

library(dplyr)
library(ggplot2)

# Frequency table: absolute counts first, then each count as a share of the total
category_counts <- table(data[["category"]])
category_counts
prop.table(category_counts)

# Using dplyr: one row per category, with its count `n` and its proportion
category_summary <- count(data, category)
mutate(category_summary, proportion = n / sum(n))

# Bar plot: geom_bar() counts the rows in each category itself
ggplot(data = data, mapping = aes(x = category)) +
  geom_bar() +
  ggtitle("Distribution of Categories")

# Pie chart (less recommended): a single stacked bar of full width,
# drawn in polar coordinates so that bar height becomes the slice angle
ggplot(data = data, mapping = aes(x = "", fill = category)) +
  geom_bar(width = 1) +
  coord_polar(theta = "y")

Numerical Variables

# Summary statistics
# summary() reports the number of NAs on its own; the functions below need
# na.rm = TRUE, otherwise missing values give NA (or, for quantile(), an error)
summary(data$numeric_column)
mean(data$numeric_column, na.rm = TRUE)
median(data$numeric_column, na.rm = TRUE)
sd(data$numeric_column, na.rm = TRUE)
# Quartiles: 25th, 50th (median) and 75th percentiles
quantile(data$numeric_column, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)

# Histogram: counts of the variable in 30 bins
ggplot(data = data, mapping = aes(x = numeric_column)) +
  geom_histogram(fill = "steelblue", bins = 30) +
  ggtitle("Distribution of Numeric Variable")

# Density plot: semi-transparent filled density curve of the same variable
ggplot(data = data, mapping = aes(x = numeric_column)) +
  geom_density(alpha = 0.7, fill = "steelblue") +
  ggtitle("Density Plot")

# Box plot: a single box for the whole variable (mapped to the y axis)
ggplot(data = data, mapping = aes(y = numeric_column)) +
  geom_boxplot() +
  ggtitle("Box Plot")

# Violin plot: one violin per level of `category`
ggplot(data = data, mapping = aes(x = category, y = numeric_column)) +
  geom_violin() +
  ggtitle("Distribution by Category")

Bivariate Analysis

Categorical vs Categorical

# Contingency table: joint counts, then row-wise proportions
# (margin = 1 makes every row sum to 1)
cross_tab <- table(data$category1, data$category2)
cross_tab
prop.table(cross_tab, margin = 1)

# Both bar charts share the same data and mappings; only the position differs
cat_by_cat <- ggplot(data, aes(x = category1, fill = category2))

# Stacked bar chart: category2 counts piled on top of each other
cat_by_cat +
  geom_bar(position = position_stack()) +
  ggtitle("Stacked Bar Chart")

# Grouped bar chart: category2 counts placed side by side
cat_by_cat +
  geom_bar(position = position_dodge()) +
  ggtitle("Grouped Bar Chart")

Numerical vs Numerical

# Scatter plot: semi-transparent points plus a fitted straight line
# (method = "lm") with its confidence band (se = TRUE)
ggplot(data = data, mapping = aes(x = var1, y = var2)) +
  geom_point(alpha = 0.6) +
  geom_smooth(se = TRUE, method = "lm") +
  ggtitle("Scatter Plot with Trend Line")

# Correlation: the coefficient on complete pairs, then the significance test
cor(x = data$var1, y = data$var2, use = "complete.obs")
cor.test(x = data$var1, y = data$var2)

# Correlation matrix of the selected columns, drawn as circles
library(corrplot)
cor_columns <- c("var1", "var2", "var3")
cor_matrix <- cor(data[, cor_columns], use = "complete.obs")
corrplot(cor_matrix, method = "circle")

Categorical vs Numerical

# The box and violin plots share the same data and mappings
numeric_by_category <- ggplot(data, aes(x = category, y = numeric_var))

# Box plots by category
numeric_by_category +
  geom_boxplot() +
  ggtitle("Numeric Variable by Category")

# Violin plots
numeric_by_category +
  geom_violin() +
  ggtitle("Distribution by Category")

# Grouped statistics: one row per category, missing values ignored
summarise(
  group_by(data, category),
  mean = mean(numeric_var, na.rm = TRUE),
  median = median(numeric_var, na.rm = TRUE),
  sd = sd(numeric_var, na.rm = TRUE)
)

Multivariate Analysis

Pairwise Relationships

# Pair plot: matrix of pairwise plots for the selected columns
library(GGally)
pair_columns <- c("var1", "var2", "var3")
ggpairs(data[, pair_columns])

# Correlation heatmap: all numeric columns, upper triangle only,
# with the correlation encoded as cell color
library(corrplot)
is_numeric_column <- sapply(data, is.numeric)
cor_matrix <- cor(data[, is_numeric_column], use = "complete.obs")
corrplot(cor_matrix, type = "upper", method = "color")

Faceting

# Both faceted plots start from the same scatter plot
scatter <- ggplot(data, aes(x = var1, y = var2)) +
  geom_point()

# Multiple plots by category: one panel per level of `category`
scatter +
  facet_wrap(vars(category)) +
  ggtitle("Scatter Plots by Category")

# Grid of plots: `category1` defines the rows, `category2` the columns
scatter +
  facet_grid(rows = vars(category1), cols = vars(category2))

Advanced Visualizations

Time Series

library(lubridate)

# Time series plot
# Parse the year-month-day strings into dates so the x axis is a time scale
data[["date"]] <- ymd(data[["date_string"]])
ggplot(data = data, mapping = aes(x = date, y = value)) +
  geom_line() +
  geom_point() +
  ggtitle("Time Series Plot") +
  theme_minimal()

Distribution Comparisons

# Overlapping distributions: one semi-transparent density curve per category
ggplot(data = data, mapping = aes(x = value, fill = category)) +
  geom_density(alpha = 0.5) +
  ggtitle("Overlapping Distributions")

# Q-Q plots: sample quantiles against normal quantiles, plus a reference line
qqnorm(y = data$numeric_var)
qqline(y = data$numeric_var)

Statistical Tests

Normality Tests

# Shapiro-Wilk test
shapiro.test(data$numeric_var)

# Kolmogorov-Smirnov test against a normal distribution
# na.rm = TRUE keeps the reference mean and sd from becoming NA when the
# column contains missing values.
# NOTE: the mean and sd are estimated from the same data being tested, so
# the reported p-value is only a rough guide (see the Details of ?ks.test).
ks.test(
  data$numeric_var, "pnorm",
  mean = mean(data$numeric_var, na.rm = TRUE),
  sd = sd(data$numeric_var, na.rm = TRUE)
)

Association Tests

# Chi-square test on the cat1 x cat2 contingency table
chisq.test(x = table(data$cat1, data$cat2))

# T-test: compares numeric_var between two groups, so `category` must have
# exactly two levels here
t.test(formula = numeric_var ~ category, data = data)

# ANOVA: fit the one-way model, then print the ANOVA table
aov_result <- aov(formula = numeric_var ~ category, data = data)
summary(aov_result)

EDA Workflow Example

library(dplyr)
library(ggplot2)
library(corrplot)

# Load data
data <- read.csv(file = "data.csv")

# 1. Initial overview
dim(data)           # number of rows and columns
str(data)           # type of each column with a preview of its values
summary(data)       # per-column summaries
head(data, n = 6L)  # first six rows

# 2. Check missing values: number of NAs in each column
colSums(is.na(data))

# 3. Univariate analysis
# Numeric variables
# vapply() always returns a logical vector (sapply() may not), and
# drop = FALSE keeps a data frame even when only one column is numeric
numeric_vars <- vapply(data, is.numeric, logical(1))
summary(data[, numeric_vars, drop = FALSE])

# Histograms for all numeric variables
# stack() reshapes the numeric columns to long form (values + source column)
# using base R only; the columns are renamed so the plot keeps the
# `value` / `variable` names
setNames(
  stack(data[, numeric_vars, drop = FALSE]),
  c("value", "variable")
) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ variable, scales = "free")

# Categorical variables
categorical_vars <- vapply(
  data,
  function(column) is.factor(column) || is.character(column),
  logical(1)
)
for (var in names(data)[categorical_vars]) {
  print(table(data[[var]]))
  # .data[[var]] looks the column up by its string name (aes_string() is
  # deprecated); labs() restores the column name as the axis label
  print(ggplot(data, aes(x = .data[[var]])) + geom_bar() + labs(x = var))
}

# 4. Bivariate analysis
# Correlation matrix
# drop = FALSE keeps a data frame even when only one column is numeric
cor_matrix <- cor(data[, numeric_vars, drop = FALSE], use = "complete.obs")
corrplot(cor_matrix, method = "circle")

# Scatter plots for highly correlated variables
# upper.tri() keeps each pair once and leaves out the diagonal, so perfectly
# correlated pairs of different variables are still reported
high_cor <- which(
  abs(cor_matrix) > 0.7 & upper.tri(cor_matrix),
  arr.ind = TRUE
)
# Plot relationships: one scatter plot per highly correlated pair
for (pair in seq_len(nrow(high_cor))) {
  x_var <- rownames(cor_matrix)[high_cor[pair, "row"]]
  y_var <- colnames(cor_matrix)[high_cor[pair, "col"]]
  print(
    ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
      geom_point(alpha = 0.6) +
      labs(x = x_var, y = y_var)
  )
}

# 5. Group comparisons: size, mean and spread of numeric_var per category
summarise(
  group_by(data, category),
  count = n(),
  mean_value = mean(numeric_var, na.rm = TRUE),
  sd_value = sd(numeric_var, na.rm = TRUE)
)

# 6. Identify outliers: one box per numeric column
boxplot(data[, numeric_vars])

Best Practices

Start simple: Begin with basic summaries and plots
Look for patterns: Trends, clusters, relationships
Check assumptions: Normality, linearity, independence
Document findings: Keep notes on what you discover
Iterate: EDA is an iterative process
Use multiple views: Different visualizations reveal different insights
Question everything: Don't take data at face value

Common EDA Questions

What is the distribution of each variable?
Are there any outliers or anomalies?
What are the relationships between variables?
Are there any patterns or trends?
What assumptions can we make?
What further analysis is needed?

Next Steps

After exploring your data, proceed to Statistical Analysis for more formal analysis techniques.