# === Step 1: Load and Prepare the Data ===
# Load necessary libraries
# Install only the packages that are actually missing. An unconditional
# install.packages() re-downloads the package on every run and can fail in
# non-interactive sessions where no CRAN mirror has been configured.
required_pkgs <- c("titanic", "caret", "dplyr", "ggplot2")
is_installed <- vapply(required_pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
library(titanic)
library(dplyr)
library(ggplot2)
library(caret)

# Pull the bundled training set into a working data frame and look at its
# raw structure before touching anything
data("titanic_train")
df <- titanic_train
str(df)

# Remove the Name, Ticket and Cabin columns (none of them is used as a
# predictor by the models fitted below), then re-inspect
df <- select(df, -c(Name, Ticket, Cabin))
str(df)

# Re-encode the categorical columns as factors in a single step
df <- mutate(
  df,
  Survived = as.factor(Survived),
  Pclass = as.factor(Pclass),
  Sex = as.factor(Sex),
  Embarked = as.factor(Embarked)
)


# Handle missing values
# Embarked: treat both NA and the empty string as missing and replace them
# with "S" (taken here as the most common embarkation value), then rebuild the
# factor so the now-unused "" level disappears.
df$Embarked[is.na(df$Embarked) | df$Embarked == ""] <- "S"
df$Embarked <- factor(df$Embarked)

# Split into training (80%) and testing (20%) sets
# The partition depends only on Survived and the seed, so it is drawn before
# Age is imputed without changing which rows land in each set.
set.seed(123)  # For reproducibility
train_index <- createDataPartition(df$Survived, p = 0.8, list = FALSE)

# Age: impute with the median of the *training* rows only. Computing the
# median on the full data set would let information from the test rows leak
# into the preprocessing step.
age_median <- median(df$Age[as.vector(train_index)], na.rm = TRUE)
df$Age[is.na(df$Age)] <- age_median
summary(df)

train_data <- df[train_index, ]
test_data <- df[-train_index, ]

# Match factor levels for Embarked
# No droplevels() here: dropping levels that happen to be absent from the test
# rows would undo the alignment with the training factor.
test_data$Embarked <- factor(test_data$Embarked,
                             levels = levels(train_data$Embarked))

# Checking if split worked
nrow(train_data)  # Should be about 712 rows (80%)
nrow(test_data)   # Should be about 179 rows (20%)
table(train_data$Survived)  # Check class balance
table(test_data$Survived)

# Check dataset structure and summary
str(train_data)
summary(train_data)

# === Step 2: Logistic Regression ===
# Train the logistic regression model
# Binomial GLM of Survived on class, sex, age, family counts, fare and port.
log_model <- glm(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                 data = train_data,
                 family = binomial)

# View model summary
summary(log_model)

# Predict probabilities on test set
log_preds_prob <- predict(log_model, newdata = test_data, type = "response")
head(log_preds_prob)

# Convert probabilities to class predictions (0.5 cut-off)
# The factor is built with the reference levels so that it always carries both
# classes; as.factor() would produce a one-level factor if every prediction
# fell on the same side of the threshold, which confusionMatrix() rejects.
log_preds_class <- factor(
  ifelse(log_preds_prob > 0.5, "1", "0"),
  levels = levels(test_data$Survived)
)

# Evaluate model performance
confusionMatrix(log_preds_class, test_data$Survived, positive = "1")

# === Step 3: Decision Tree Model ===
# Install (only if missing) and load packages
# Guarding the install avoids re-downloading on every run and avoids failures
# in non-interactive sessions without a configured CRAN mirror.
tree_pkgs <- c("rpart", "rpart.plot")
tree_pkgs_installed <- vapply(tree_pkgs, requireNamespace, logical(1),
                              quietly = TRUE)
if (!all(tree_pkgs_installed)) {
  install.packages(tree_pkgs[!tree_pkgs_installed])
}
library(rpart)
library(rpart.plot)

# Train decision tree
# method = "class" requests a classification tree for the factor response.
tree_model <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                    data = train_data,
                    method = "class")

# View Tree
rpart.plot(tree_model)

# Predict classes
tree_preds_class <- predict(tree_model, newdata = test_data, type = "class")

# Evaluating model performance
confusionMatrix(tree_preds_class, test_data$Survived, positive = "1")

# === Step 4: ROC + AUC Comparison ===
# Load package (installing it first if it is missing)
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}
library(pROC)

# Predict probabilities from decision tree model
# Select the column by class label rather than by position so the code does
# not depend on the order of the factor levels.
tree_preds_prob <- predict(tree_model, newdata = test_data,
                           type = "prob")[, "1"]  # probability of class '1'

# ROC for logistic regression
# levels and direction are fixed explicitly (control = "0", case = "1",
# higher score = more likely a case). Left on "auto", roc() picks the
# direction from the data, which can flip the comparison for a weak model and
# over-state its AUC.
roc_log <- roc(test_data$Survived, log_preds_prob,
               levels = c("0", "1"), direction = "<")

# ROC for decision tree (same conventions as above)
roc_tree <- roc(test_data$Survived, tree_preds_prob,
                levels = c("0", "1"), direction = "<")

# Plot logistic ROC first
plot(roc_log, col = "blue", main = "ROC Curves: Logistic vs. Decision Tree")

# Add tree ROC on top
lines(roc_tree, col = "darkgreen")
legend("bottomright", legend = c(
  paste("Logistic Regression (AUC =", round(auc(roc_log), 3), ")"),
  paste("Decision Tree (AUC =", round(auc(roc_tree), 3), ")")
), col = c("blue", "darkgreen"), lwd = 2)