#1- Linear Regression Model
#Dataset: <https://www.kaggle.com/datasets/shashanknecrothapa/ames-housing-dataset>
#
df <- read.csv("D:\\\\Houses.csv")
head(df,3)
View(df)
nrow(df)
#Split the data into training and testing
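#Fixing the random seed first makes the split reproducible (the seed value itself is arbitrary)
set.seed(42)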
train.index <- sample(1:nrow(df), floor(0.8 * nrow(df))) # row indices for training data
train <- df[train.index, ] # model training data
test <- df[-train.index, ] # model testing data
#Create the model (x = Living area, y= Price)
#We put the dependent variable to the left of the '~' and the independent variable(s) to the right
model <- lm(SalePrice ~ Gr.Liv.Area, data = train)
#Test the model
pred<- predict(model, test)
#Compare the results
actual_pred <- data.frame(actual=test$SalePrice, predicted=pred)
View(actual_pred)
#Find the intercept and slope (coefficient) values
model$coefficients
#See all details of the model
#A p-value of 0.05 or lower is generally considered statistically significant.
#The more stars beside a variable's p-value, the more significant the variable.
#SE: the standard error of the coefficient estimate
#t-value: how many standard errors the coefficient estimate is away from 0
#Residuals: yTrue - yPredicted
#R-Squared is the proportion of variation in the dependent (response) variable that is explained by the model.
#Here, above-ground living area explains about 49.9% of the variance in sale price.
#Degrees of Freedom: Number of observations minus the number of coefficients (including intercepts).
summary(model)
#Residual standard error (commonly used as an estimate of the RMSE)
sigma(model)
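#As a sanity check, the RMSE can also be computed directly on the held-out test
#predictions; the exact value will vary with the random split
sqrt(mean((test$SalePrice - pred)^2))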
#Correlation - positive here, meaning price tends to rise with living area
cor(df$SalePrice, df$Gr.Liv.Area)
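#For simple linear regression, R-squared is just the squared correlation, so this
#(computed on the training data) matches the Multiple R-squared from summary(model)
cor(train$SalePrice, train$Gr.Liv.Area)^2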
#Plot the model
install.packages("ggplot2",include_dependencies = TRUE)
library(ggplot2)
ggplot(data = df) +
  aes(x = Gr.Liv.Area, y = SalePrice) +
  geom_point() +
  geom_smooth(method = "lm")
#Function to predict for new values
PricePredict <- function(area){
  area * model$coefficients[2] + model$coefficients[1]
}
res <- PricePredict(c(2000,5000))
#Alternative method
res <- predict(model, data.frame(Gr.Liv.Area = c(2000, 5000)))
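#predict() can also quantify uncertainty; for example, 95% prediction intervals
#for the same two hypothetical areas
predict(model, data.frame(Gr.Liv.Area = c(2000, 5000)), interval = "prediction")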
#Multi-variable regression: include multiple features
model2 <- lm(SalePrice ~ Gr.Liv.Area + Year.Built + Lot.Area + Garage.Area, data = train)
model2$coefficients
summary(model2)
pred<- predict(model2, test)
#Compare the results
actual_pred <- data.frame(actual=test$SalePrice, predicted=pred)
View(actual_pred)
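#Test-set RMSE for the multi-variable model, for comparison with the single-variable
#one above (na.rm drops rows where a predictor such as Garage.Area is missing)
sqrt(mean((test$SalePrice - pred)^2, na.rm = TRUE))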
#Alternatively: we can use update() to add a term to the formula of the original model
model2 <- update(model, . ~ . + Year.Built)
#Cross-validation: fits and evaluates the model across multiple folds of the data
install.packages("caret")
library(caret)
cv_model1 <- train(
  form = SalePrice ~ Gr.Liv.Area,
  data = df,
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)
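#caret stores the fold-averaged metrics (RMSE, R-squared, MAE) on the trained object
cv_model1$results
#and the per-fold values in the resample element
cv_model1$resample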
#The resulting cross-validated RMSE is $56,410.89 (the average RMSE across the 10 CV folds).
#When applied to unseen data, this model's predictions are, on average, about $56,410.89
#off from the actual sale price.
summary(cv_model1)
#We can perform cross-validation on the multi-variable model as well.
#First impute missing Garage.Area values with the column mean so lm() does not drop rows.
df$Garage.Area[is.na(df$Garage.Area)] <- mean(df$Garage.Area, na.rm = TRUE)
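#Quick check that the imputation worked: the NA count should now be zero
sum(is.na(df$Garage.Area))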
cv_model2 <- train(
  SalePrice ~ Gr.Liv.Area + Year.Built + Garage.Area,
  data = df,
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)
#Extract out-of-sample performance measures
#RMSE drops from $55,425.40 to $46,292.38 with the extra predictors
summary(resamples(list(
  model1 = cv_model1,
  model2 = cv_model2)))
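#caret also provides lattice plots of the per-fold metrics; a box-and-whisker plot
#makes the RMSE improvement easy to see
cv_results <- resamples(list(model1 = cv_model1, model2 = cv_model2))
bwplot(cv_results, metric = "RMSE")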
#2- Logistic Regression Model - predicts a class/category
#Dataset Link:<https://www.kaggle.com/competitions/titanic/data?select=train.csv>
df <- read.csv("D:\\\\Titanic.csv")
View(df)
#Select specific columns (Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked)
df <- subset(df,select=c(2,3,5,6,7,8,10,12))
#Factor categorical columns
df$Sex <- factor(df$Sex)
df$Embarked <- factor(df$Embarked)
df$Survived <- factor(df$Survived)
is.factor(df$Survived)
#Split the data into training and testing
train <- df[1:800,]
test <- df[801:nrow(df),]
#Create the model
#This is "binomial" logistic regression, since the variable to predict is binary
library(tidyverse)
mod <- glm( Survived ~ ., data = train, family = binomial)
summary(mod)$coef
#SibSp, Fare and Embarked are not statistically significant. Among the significant
#variables, Sex has the lowest p-value, suggesting a strong association between the
#passenger's sex and the probability of having survived.
summary(mod)
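#glm() works on the log-odds scale, so exponentiating the coefficients gives odds
#ratios; e.g. the value for Sexmale is the multiplicative change in the odds of
#survival for males relative to females
exp(coef(mod))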
#Make predictions
fitted.results <- predict(mod, newdata = test, type = "response") # predicted survival probabilities
fitted.results <- ifelse(fitted.results > 0.5,1,0)
actual_pred <- data.frame(actual=test$Survived, predicted=fitted.results)
View(actual_pred)
#Accuracy
Acc <- mean(fitted.results == test$Survived, na.rm=TRUE)
print(paste('Accuracy = ',Acc))
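#A confusion matrix breaks accuracy down into correct and incorrect predictions per
#class (rows = predicted, columns = actual; rows with NA predictions are dropped)
table(predicted = fitted.results, actual = test$Survived)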