#1- Linear Regression Model 
#Dataset: <https://www.kaggle.com/datasets/shashanknecrothapa/ames-housing-dataset>
#
df <- read.csv("D:\\Houses.csv")
head(df,3)
View(df)
nrow(df)

#Split the data into training and testing
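#Optional sketch: setting a seed before sampling makes the random split reproducible (the value 123 is arbitrary)
set.seed(123)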
train.index <- sample(1:nrow(df), 0.8*nrow(df))  # row indices for training data
train <- df[train.index, ]  # model training data
test  <- df[-train.index, ]

#Create the model (x = Living area, y= Price)
#We put the dependent variable to the left of the '~' and the independent variable(s) to the right
model <- lm(SalePrice ~ Gr.Liv.Area, data = train)

#Test the model
pred<- predict(model, test)
#Compare the results
actual_pred <- data.frame(actual=test$SalePrice, predicted=pred)
View(actual_pred)

#Find the intercept and slope (coefficient) values
model$coefficients

#See all details of the model
#A p-value of 0.05 or lower is generally considered statistically significant.
#The more stars beside a variable's p-value, the more significant the variable.
#Std. Error: the standard error of each coefficient estimate
#t value: the coefficient estimate divided by its standard error (how many standard errors it lies from 0)
#Residuals: yTrue - yPredicted
#R-Squared is the proportion of variation in the dependent (response) variable that is explained by the model.
#Here, above-grade living area explains about 49.9% of the variance in sale price.
#Degrees of Freedom: number of observations minus the number of coefficients (including the intercept).
summary(model)

#Residual standard error - closely related to the in-sample RMSE (root mean squared error)
sigma(model)
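#A sketch of the out-of-sample RMSE, computed from the test-set predictions in pred above;
#this measures error on unseen data rather than on the training data.
sqrt(mean((test$SalePrice - pred)^2))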
#Correlation - Positive Correlation
cor(df$SalePrice, df$Gr.Liv.Area)
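#Quick check: for a one-predictor linear model, R-squared equals the squared correlation.
#The model was fit on train, so the squared correlation is computed on train here to match summary(model)$r.squared.
cor(train$SalePrice, train$Gr.Liv.Area)^2
summary(model)$r.squared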

#Plot the model
install.packages("ggplot2",include_dependencies = TRUE)
library(ggplot2)

ggplot(data=df) +
  aes(x=Gr.Liv.Area, y= SalePrice) +
  geom_point() +
  geom_smooth(method ="lm")

#Function to predict for new values
PricePredict <- function(area){
  area * model$coefficients[2] + model$coefficients[1]
}
res <- PricePredict(c(2000,5000))
#Alternative method
res <-predict(model,data.frame(Gr.Liv.Area = c(2000,5000)))

#Multi-variable regression: include multiple features
model2 <- lm(SalePrice ~ Gr.Liv.Area + Year.Built +Lot.Area + Garage.Area, data = train)
model2$coefficients
summary(model2)
pred<- predict(model2, test)
#Compare the results
actual_pred <- data.frame(actual=test$SalePrice, predicted=pred)
View(actual_pred)
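#A sketch comparing hold-out RMSE for the single-variable and multi-variable models
#(pred currently holds model2's test predictions; na.rm guards against NA predictions from missing Garage.Area values)
rmse1 <- sqrt(mean((test$SalePrice - predict(model, test))^2, na.rm = TRUE))
rmse2 <- sqrt(mean((test$SalePrice - pred)^2, na.rm = TRUE))
c(model1 = rmse1, model2 = rmse2)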

#Alternatively: we can use update() to extend the formula of the first model
model2 <- update(model, . ~ . + Year.Built)
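#update() can also drop terms; e.g. this sketch removes Year.Built again, recovering the single-variable model
update(model2, . ~ . - Year.Built)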

#Cross validation: creates and trains multiple models
install.packages("caret")
library(caret)

cv_model1 <- train(
  form = SalePrice ~ Gr.Liv.Area, 
  data = df, 
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)
#The resulting cross-validated RMSE is $56,410.89 (the average RMSE across the 10 CV folds).
#When applied to unseen data, this model's predictions are, on average, about $56,410.89 off from the actual sale price.
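#The fold-averaged metrics can also be read directly from the caret train object ($results holds RMSE, R-squared and MAE)
cv_model1$results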

#We can perform cross-validation on the other models as well.
#Impute missing Garage.Area values first so the fit does not fail on NAs.
df$Garage.Area[is.na(df$Garage.Area)] <- mean(df$Garage.Area, na.rm=TRUE)
cv_model2 <- train(
  SalePrice ~ Gr.Liv.Area + Year.Built + Garage.Area , 
  data = df, 
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)
summary(cv_model2)

# Extract out of sample performance measures
#RMSE drops from $55,425.40 down to $46,292.38
summary(resamples(list(
  model1 = cv_model1, 
  model2 = cv_model2)))

#2- Logistic Regression - predicts a class/category
#Dataset Link:<https://www.kaggle.com/competitions/titanic/data?select=train.csv>
df <- read.csv("D:\\Titanic.csv")
View(df)

#Select specific columns
#(in the Kaggle train.csv these columns are Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked)
df <- subset(df, select=c(2,3,5,6,7,8,10,12))

#Factor categorical columns
df$Sex <- factor(df$Sex)
df$Embarked <- factor(df$Embarked)
df$Survived <- factor(df$Survived)
is.factor(df$Survived)

#Split the data into training and testing
train <- df[1:800,]
test <- df[801:nrow(df),]

#Create the model 
#"Binomial" logistic regression, since the variable to predict is binary (0/1)
library(tidyverse)
mod <- glm( Survived ~ ., data = train, family = binomial)
summary(mod)$coef
#We can see that SibSp, Fare and Embarked are not statistically significant.
#Among the significant variables, Sex has the lowest p-value, suggesting a strong association between the passenger's sex and the probability of having survived.
summary(mod)
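#The coefficients are on the log-odds scale; exponentiating them gives odds ratios
#(the multiplicative change in the odds of survival per one-unit change in each predictor)
exp(coef(mod))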

#Make predictions
fitted.results <- predict(mod, newdata=test, type="response")  # type="response" returns probabilities rather than log-odds
fitted.results <- ifelse(fitted.results > 0.5,1,0)
actual_pred <- data.frame(actual=test$Survived, predicted=fitted.results)
View(actual_pred)

#Accuracy
Acc <- mean(fitted.results == test$Survived, na.rm=TRUE)
print(paste('Accuracy = ',Acc))
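#A confusion matrix gives a fuller picture than accuracy alone (rows = predicted class, columns = actual class)
table(Predicted = fitted.results, Actual = test$Survived)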