% distinct() %>% count() df<- df %>% distinct() #Turn last reviews to date format and remove missing values df$last.review[df$last.review == ''] <- NA df$last.review <- as.Date(df$last.review,"%m/%d/%Y") df$last.review[is.na(df$last.review)]<- min(df$last.review) #Cleaning the price column #Remove $ sign from the price column df$price <- as.numeric(gsub("\\$","", df$price)) df$service.fee <- as.numeric(gsub("\\$","", df$service.fee)) Finding all missing values colSums(is.na(df)) #Price missing values sum(is.na(df$price)) df$price[is.na(df$price)] <- round(mean(df$price, na.r"> % distinct() %>% count() df<- df %>% distinct() #Turn last reviews to date format and remove missing values df$last.review[df$last.review == ''] <- NA df$last.review <- as.Date(df$last.review,"%m/%d/%Y") df$last.review[is.na(df$last.review)]<- min(df$last.review) #Cleaning the price column #Remove $ sign from the price column df$price <- as.numeric(gsub("\\$","", df$price)) df$service.fee <- as.numeric(gsub("\\$","", df$service.fee)) Finding all missing values colSums(is.na(df)) #Price missing values sum(is.na(df$price)) df$price[is.na(df$price)] <- round(mean(df$price, na.r"> % distinct() %>% count() df<- df %>% distinct() #Turn last reviews to date format and remove missing values df$last.review[df$last.review == ''] <- NA df$last.review <- as.Date(df$last.review,"%m/%d/%Y") df$last.review[is.na(df$last.review)]<- min(df$last.review) #Cleaning the price column #Remove $ sign from the price column df$price <- as.numeric(gsub("\\$","", df$price)) df$service.fee <- as.numeric(gsub("\\$","", df$service.fee)) Finding all missing values colSums(is.na(df)) #Price missing values sum(is.na(df$price)) df$price[is.na(df$price)] <- round(mean(df$price, na.r">
#Dataset Link: <https://github.com/adishourya/Airbnb/blob/master/new-york-city-airbnb-open-data/AB_NYC_2019.csv>
df <- read.csv("Airbnb_Open_Data.csv")
head(df,3)
View(df)
#Drop unwanted columns
df <- subset(df, select = -c(id,host.id,host.name,lat,long, calculated.host.listings.count,house_rules,license,review.rate.number))
#Find column datatypes
lapply(df, typeof)
#Remove duplicate rows
library(dplyr)
df %>% distinct() %>% count()
df<- df %>% distinct()
#Turn last reviews to date format and remove missing values
df$last.review[df$last.review == ''] <- NA
df$last.review <- as.Date(df$last.review,"%m/%d/%Y")
df$last.review[is.na(df$last.review)]<- min(df$last.review)
#Cleaning the price column
#Remove $ sign from the price column
df$price <- as.numeric(gsub("\\\\$","", df$price))
df$service.fee <- as.numeric(gsub("\\\\$","", df$service.fee))
Finding all missing values
colSums(is.na(df))
#Price missing values
sum(is.na(df$price))
df$price[is.na(df$price)] <- round(mean(df$price, na.rm = TRUE))
#Alternative - replace missing values with 0
df$price <- ifelse(is.na(df$price), 0, df$price)
#Service Fee missing values
perc<- df$service.fee / df$price
value<-mean(perc, na.rm=TRUE)
df$service.fee <- ifelse(is.na(df$service.fee),round(df$price*value), df$service.fee)
#Handling all missing values using pipeline
library(tidyverse)
df <- df %>%
replace_na(list(Construction.year =round(mean(df$Construction.year,na.rm=TRUE)),
minimum.nights = 1,
number.of.reviews = 0,
reviews.per.month = 0,
availability.365=365))
#Instant Bookable missing values - Boolean Variable
table(df$instant_bookable) #Majority of rows are false
df$instant_bookable[is.na(df$instant_bookable)] <- FALSE
#Convert names to lower case
df$NAME <-lapply(df$NAME,tolower)
#Select all numerical columns
numeric_columns <- sapply(df, is.numeric)
df_numeric <- df[,numeric_columns]
df_numeric['last.review'] <-df$last.review
View(df_numeric)
#Calculate mean price per neighbourhood
mean_price <- tapply(df$price,INDEX=df$neighbourhood.group,FUN=mean, na.rm=TRUE)
#Analyze the price based on the neighbourhood group
res <- df %>%
group_by(neighbourhood.group) %>%
summarise(min_price = min(price), max_price = max(price), avg_price = mean(price)) %>%
arrange(desc(avg_price))
#Show result as a dataframe
res<- data.frame(Area=res['neighbourhood.group'],
minimum=res['min_price'],maximum=res['max_price'],
average=res['avg_price'])
View(res)
#Find the neighbourhoods that have the most 'Entire home/apt' type of rooms
df %>%
filter(room.type == 'Entire home/apt')%>%
group_by(neighbourhood.group) %>%
count(room.type)%>%
arrange(desc(n))